Re: bcm2711_thermal: Kernel panic - not syncing: Asynchronous SError Interrupt

2021-02-10 Thread Juerg Haefliger
On Wed, 10 Feb 2021 14:15:46 +0100
Nicolas Saenz Julienne  wrote:

> [ Add Robin, Catalin and Florian in case they want to chime in ]
> 
> Hi Juerg, thanks for the report!
> 
> On Wed, 2021-02-10 at 11:48 +0100, Juerg Haefliger wrote:
> > Trying to dump the BCM2711 registers kills the kernel:
> > 
> > # cat /sys/kernel/debug/regmap/dummy-avs-monitor\@fd5d2000/range
> > 0-efc
> > # cat /sys/kernel/debug/regmap/dummy-avs-monitor\@fd5d2000/registers 
> > 
> > [   62.857661] SError Interrupt on CPU1, code 0xbf02 -- SError  
> 
> So ESR's IDS (bit 24) is set, which means it's an 'Implementation Defined
> SError,' hence IIUC the rest of the error code is meaningless to anyone
> outside of Broadcom/RPi.
> 
> The regmap is created through the following syscon device:
> 
>   avs_monitor: avs-monitor@7d5d2000 {
>   compatible = "brcm,bcm2711-avs-monitor",
>"syscon", "simple-mfd";
>   reg = <0x7d5d2000 0xf00>;
> 
>   thermal: thermal {
>   compatible = "brcm,bcm2711-thermal";
>   #thermal-sensor-cells = <0>;
>   };
>   };
> 
> I've done some tests with devmem, and the whole <0x7d5d2000 0xf00> range is
> full of addresses that trigger this same error. Also note that as per Florian's
> comments[1]: "AVS_RO_REGISTERS_0: 0x7d5d2200 - 0x7d5d22e3." But from what I
> can tell, at least 0x7d5d22b0 seems to be faulty too.
> 
> Any ideas/comments? My guess is that those addresses are marked somehow as
> secure, and only for VC4 to access (VC4 is RPi4's co-processor). Ultimately,
> the solution is to narrow the register range exposed by avs-monitor to
> whatever bcm2711-thermal needs (which is ATM a single 32bit register).

Yeah, that's what I tried but wasn't sure if that's the correct approach or
if there was something wrong with the DTB (which I know virtually nothing
about).

With [1] I get seemingly the correct behavior:

# cat /sys/kernel/debug/regmap/dummy-avs-monitor\@fd5d2200/range 
0-0
# cat /sys/kernel/debug/regmap/dummy-avs-monitor\@fd5d2200/registers 
0: 000106fa
# cat /sys/class/thermal/thermal_zone0/temp 
39433
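
Decoding that raw status value by hand lands in the same range. A sketch,
assuming the coefficients = <(-487) 410040> from the bcm2711.dtsi thermal
zone and the driver's temp = slope * code + offset conversion:

  code = 0x000106fa & GENMASK(9, 0) = 0x2fa = 762
  temp = -487 * 762 + 410040 = 38946 millidegrees

i.e. roughly 39°C, consistent with the 39433 that sysfs returned on a
later read.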

On a different note, how did you come up with that address range in the DTB?
Is that public information?

...Juerg


[1]
diff --git a/arch/arm/boot/dts/bcm2711.dtsi b/arch/arm/boot/dts/bcm2711.dtsi
index 4847dd305317..a7059967aab1 100644
--- a/arch/arm/boot/dts/bcm2711.dtsi
+++ b/arch/arm/boot/dts/bcm2711.dtsi
@@ -65,10 +65,10 @@ gicv2: interrupt-controller@40041000 {
 IRQ_TYPE_LEVEL_HIGH)>;
};
 
-   avs_monitor: avs-monitor@7d5d2000 {
+   avs_monitor: avs-monitor@7d5d2200 {
compatible = "brcm,bcm2711-avs-monitor",
 "syscon", "simple-mfd";
-   reg = <0x7d5d2000 0xf00>;
+   reg = <0x7d5d2200 0x4>;
 
thermal: thermal {
compatible = "brcm,bcm2711-thermal";
diff --git a/drivers/thermal/broadcom/bcm2711_thermal.c b/drivers/thermal/broadcom/bcm2711_thermal.c
index 67c2a737bc9d..3b5a84402b89 100644
--- a/drivers/thermal/broadcom/bcm2711_thermal.c
+++ b/drivers/thermal/broadcom/bcm2711_thermal.c
@@ -22,7 +22,7 @@
 
 #include "../thermal_hwmon.h"
 
-#define AVS_RO_TEMP_STATUS 0x200
+#define AVS_RO_TEMP_STATUS 0x0 /* address 0x7d5d2200 */
 #define AVS_RO_TEMP_STATUS_VALID_MSK   (BIT(16) | BIT(10))
 #define AVS_RO_TEMP_STATUS_DATA_MSK	GENMASK(9, 0)




> Regards,
> Nicolas
> 
> [1] https://lore.kernel.org/linux-pm/82125042-684a-b4e2-fbaa-45a393b2c...@gmx.net/
> 
> > [   62.857671] CPU: 1 PID: 478 Comm: cat Not tainted 5.11.0-rc7 #4
> > [   62.857674] Hardware name: Raspberry Pi 4 Model B Rev 1.2 (DT)
> > [   62.857676] pstate: 2085 (nzCv daIf -PAN -UAO -TCO BTYPE=--)
> > [   62.857679] pc : regmap_mmio_read32le+0x1c/0x34
> > [   62.857681] lr : regmap_mmio_read+0x50/0x80
> > [   62.857682] sp : 8000105c3c00
> > [   62.857685] x29: 8000105c3c00 x28: 0014 
> > [   62.857694] x27: 0014 x26: d2ea1c2060b0 
> > [   62.857699] x25: 4e34408ecc00 x24: 0efc 
> > [   62.857704] x23: 8000105c3e20 x22: 8000105c3d3c 
> > [   62.857710] x21: 8000105c3d3c x20: 0014 
> > [   62.857715] x19: 4e344037a900 x18: 0020 
> > [   62.857720] x17:  x16:  
> > [   62.857725] x15: 4e3447ac40f0 x14: 000

bcm2711_thermal: Kernel panic - not syncing: Asynchronous SError Interrupt

2021-02-10 Thread Juerg Haefliger
Trying to dump the BCM2711 registers kills the kernel:

# cat /sys/kernel/debug/regmap/dummy-avs-monitor\@fd5d2000/range
0-efc
# cat /sys/kernel/debug/regmap/dummy-avs-monitor\@fd5d2000/registers 

[   62.857661] SError Interrupt on CPU1, code 0xbf02 -- SError
[   62.857671] CPU: 1 PID: 478 Comm: cat Not tainted 5.11.0-rc7 #4
[   62.857674] Hardware name: Raspberry Pi 4 Model B Rev 1.2 (DT)
[   62.857676] pstate: 2085 (nzCv daIf -PAN -UAO -TCO BTYPE=--)
[   62.857679] pc : regmap_mmio_read32le+0x1c/0x34
[   62.857681] lr : regmap_mmio_read+0x50/0x80
[   62.857682] sp : 8000105c3c00
[   62.857685] x29: 8000105c3c00 x28: 0014 
[   62.857694] x27: 0014 x26: d2ea1c2060b0 
[   62.857699] x25: 4e34408ecc00 x24: 0efc 
[   62.857704] x23: 8000105c3e20 x22: 8000105c3d3c 
[   62.857710] x21: 8000105c3d3c x20: 0014 
[   62.857715] x19: 4e344037a900 x18: 0020 
[   62.857720] x17:  x16:  
[   62.857725] x15: 4e3447ac40f0 x14: 0003 
[   62.857730] x13: 4e34422c x12: 4e34422a0046 
[   62.857735] x11: d2ea1c8765e0 x10:  
[   62.857741] x9 : d2ea1b9495a0 x8 : 4e34429ef980 
[   62.857746] x7 : 000f x6 : 4e34422a004b 
[   62.857751] x5 : fff9 x4 :  
[   62.857757] x3 : d2ea1b949550 x2 : d2ea1b949330 
[   62.857761] x1 : 0014 x0 :  
[   62.857767] Kernel panic - not syncing: Asynchronous SError Interrupt
[   62.857770] CPU: 1 PID: 478 Comm: cat Not tainted 5.11.0-rc7 #4
[   62.857773] Hardware name: Raspberry Pi 4 Model B Rev 1.2 (DT)
[   62.857775] Call trace:
[   62.85]  dump_backtrace+0x0/0x1e0
[   62.857778]  show_stack+0x24/0x70
[   62.857780]  dump_stack+0xd0/0x12c
[   62.857782]  panic+0x168/0x370
[   62.857783]  nmi_panic+0x98/0xa0
[   62.857786]  arm64_serror_panic+0x8c/0x98
[   62.857787]  do_serror+0x3c/0x6c
[   62.857789]  el1_error+0x78/0xf0
[   62.857791]  regmap_mmio_read32le+0x1c/0x34
[   62.857793]  _regmap_bus_reg_read+0x24/0x30
[   62.857795]  _regmap_read+0x6c/0x17c
[   62.857797]  regmap_read+0x58/0x84
[   62.857799]  regmap_read_debugfs+0x138/0x3f4
[   62.857801]  regmap_map_read_file+0x34/0x40
[   62.857803]  full_proxy_read+0x6c/0xc0
[   62.857805]  vfs_read+0xb8/0x1e4
[   62.857807]  ksys_read+0x78/0x10c
[   62.857809]  __arm64_sys_read+0x28/0x34
[   62.857811]  el0_svc_common.constprop.0+0x7c/0x194
[   62.857813]  do_el0_svc+0x30/0x9c
[   62.857814]  el0_svc+0x20/0x30
[   62.857816]  el0_sync_handler+0x1a4/0x1b0
[   62.857818]  el0_sync+0x174/0x180
[   62.857842] SMP: stopping secondary CPUs
[   62.857845] Kernel Offset: 0x52ea0b08 from 0x80001000
[   62.857847] PHYS_OFFSET: 0xb1cc
[   62.857849] CPU features: 0x00240022,61806000
[   62.857851] Memory Limit: none

Sprinkling printks around regmap_read [1] shows that reading from 0x14 (20)
seems to cause the issue:


[   40.456230] map=020a069c9c00, from=0, to=3836, count=131072
[   40.462520] map=020a069c9c00, i=0
[   40.466319] ret=0, val=0
[   40.468922] map=020a069c9c00, i=4
[   40.472684] ret=0, val=0
[   40.475292] map=020a069c9c00, i=8
[   40.479048] ret=0, val=0
[   40.481649] map=020a069c9c00, i=12
[   40.485492] ret=0, val=0
[   40.488080] map=020a069c9c00, i=16
[   40.491922] ret=0, val=0
[   40.494523] map=020a069c9c00, i=20
[   40.498497] SError Interrupt on CPU0, code 0xbf02 -- SError
[   40.498499] CPU: 0 PID: 486 Comm: cat Not tainted 5.11.0-rc7+ #8
[   40.498501] Hardware name: Raspberry Pi 4 Model B Rev 1.2 (DT)
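
So the faulting read would be at offset 0x14 from the avs-monitor regmap
base of 0x7d5d2000, i.e. bus address 0x7d5d2014.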


...Juerg

[1]
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index ff2ee87987c7..9465f5a2f3b8 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -229,6 +229,7 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from,
if (count > (PAGE_SIZE << (MAX_ORDER - 1)))
count = PAGE_SIZE << (MAX_ORDER - 1);
 
+   printk("map=%px, from=%d, to=%d, count=%ld\n", map, from, to, count);
buf = kmalloc(count, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -253,7 +254,9 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from,
buf_pos += map->debugfs_reg_len + 2;
 
/* Format the value, write all X if we can't read */
+   printk("map=%px, i=%d\n", map, i);
ret = regmap_read(map, i, &val);
+   printk("ret=%ld, val=%x\n", ret, val);
if (ret == 0)
snprintf(buf + buf_pos, count - buf_pos,
 "%.*x", map->debugfs_val_len, val);





[PATCH] staging: bcm2835-audio: Replace unsafe strcpy() with strscpy()

2021-02-04 Thread Juerg Haefliger
Replace strcpy() with strscpy() in bcm2835-audio/bcm2835.c to prevent the
following when loading snd-bcm2835:

[   58.480634] [ cut here ]
[   58.485321] kernel BUG at lib/string.c:1149!
[   58.489650] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
[   58.495214] Modules linked in: snd_bcm2835(COE+) snd_pcm snd_timer snd 
dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua btsdio bluetooth 
ecdh_generic ecc bcm2835_v4l2(CE) bcm2835_codec(CE) brcmfmac bcm2835_isp(CE) 
bcm2835_mmal_vchiq(CE) brcmutil cfg80211 v4l2_mem2mem videobuf2_vmalloc 
videobuf2_dma_contig videobuf2_memops raspberrypi_hwmon videobuf2_v4l2 
videobuf2_common videodev bcm2835_gpiomem mc vc_sm_cma(CE) rpivid_mem 
uio_pdrv_genirq uio sch_fq_codel drm ip_tables x_tables autofs4 btrfs 
blake2b_generic raid10 raid456 async_raid6_recov async_memcpy async_pq 
async_xor async_tx xor xor_neon raid6_pq libcrc32c raid1 raid0 multipath linear 
dwc2 roles spidev udc_core crct10dif_ce xhci_pci xhci_pci_renesas phy_generic 
aes_neon_bs aes_neon_blk crypto_simd cryptd
[   58.563787] CPU: 3 PID: 1959 Comm: insmod Tainted: G C OE 
5.11.0-1001-raspi #1
[   58.572172] Hardware name: Raspberry Pi 4 Model B Rev 1.2 (DT)
[   58.578086] pstate: 6045 (nZCv daif +PAN -UAO -TCO BTYPE=--)
[   58.584178] pc : fortify_panic+0x20/0x24
[   58.588161] lr : fortify_panic+0x20/0x24
[   58.592136] sp : 800010a83990
[   58.595491] x29: 800010a83990 x28: 0002
[   58.600879] x27: b0b07cb72928 x26: 
[   58.606268] x25: 39e884973838 x24: b0b07cb74190
[   58.611655] x23: b0b07cb72030 x22: 
[   58.617042] x21: 39e884973014 x20: 39e88b793010
[   58.622428] x19: b0b07cb72670 x18: 0030
[   58.627814] x17:  x16: b0b092ce2c1c
[   58.633200] x15: 39e88b901500 x14: 0720072007200720
[   58.638588] x13: 0720072007200720 x12: 0720072007200720
[   58.643979] x11: b0b0936cbdf0 x10: f000
[   58.649366] x9 : b0b09220cfa8 x8 : 
[   58.654752] x7 : b0b093673df0 x6 : b0b09364e000
[   58.660140] x5 :  x4 : 39e93b7db948
[   58.665526] x3 : 39e93b7ebcf0 x2 : 
[   58.670913] x1 :  x0 : 0022
[   58.676299] Call trace:
[   58.678775]  fortify_panic+0x20/0x24
[   58.682402]  snd_bcm2835_alsa_probe+0x5b8/0x7d8 [snd_bcm2835]
[   58.688247]  platform_probe+0x74/0xe4
[   58.691963]  really_probe+0xf0/0x510
[   58.695585]  driver_probe_device+0xe0/0x100
[   58.699826]  device_driver_attach+0xcc/0xd4
[   58.704068]  __driver_attach+0xb0/0x17c
[   58.707956]  bus_for_each_dev+0x7c/0xd4
[   58.711843]  driver_attach+0x30/0x40
[   58.715467]  bus_add_driver+0x154/0x250
[   58.719354]  driver_register+0x84/0x140
[   58.723242]  __platform_driver_register+0x34/0x40
[   58.728013]  bcm2835_alsa_driver_init+0x30/0x1000 [snd_bcm2835]
[   58.734024]  do_one_initcall+0x54/0x300
[   58.737914]  do_init_module+0x60/0x280
[   58.741719]  load_module+0x680/0x770
[   58.745344]  __do_sys_finit_module+0xbc/0x130
[   58.749761]  __arm64_sys_finit_module+0x2c/0x40
[   58.754356]  el0_svc_common.constprop.0+0x88/0x220
[   58.759216]  do_el0_svc+0x30/0xa0
[   58.762575]  el0_svc+0x28/0x70
[   58.765669]  el0_sync_handler+0x1a4/0x1b0
[   58.769732]  el0_sync+0x178/0x180
[   58.773095] Code: aa0003e1 91366040 910003fd 97ffee21 (d421)
[   58.779275] ---[ end trace 29be5b17497bd898 ]---
[   58.783955] note: insmod[1959] exited with preempt_count 1
[   58.791921] [ cut here ]

For the sake of it, replace all the other occurrences of strcpy() under
bcm2835-audio/ as well.
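
A minimal sketch (not part of the patch) of what strscpy() buys over
strcpy() on a fixed-size destination; buf here is a hypothetical 8-byte
buffer, not a field from the driver:

	char buf[8];

	/* Writes the 15-byte string into an 8-byte buffer; this kind of
	 * overflow is what CONFIG_FORTIFY_SOURCE traps via fortify_panic()
	 * (the BUG at lib/string.c:1149 in the log above).
	 */
	strcpy(buf, "Broadcom Mixer");

	/* Bounded: copies at most 7 characters, always NUL-terminates, and
	 * returns -E2BIG because the source had to be truncated.
	 */
	strscpy(buf, "Broadcom Mixer", sizeof(buf));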

Signed-off-by: Juerg Haefliger 
---
 drivers/staging/vc04_services/bcm2835-audio/bcm2835-ctl.c | 6 +++---
 drivers/staging/vc04_services/bcm2835-audio/bcm2835-pcm.c | 2 +-
 drivers/staging/vc04_services/bcm2835-audio/bcm2835.c | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-ctl.c b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-ctl.c
index 4c2cae99776b..3703409715da 100644
--- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-ctl.c
+++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-ctl.c
@@ -224,7 +224,7 @@ int snd_bcm2835_new_ctl(struct bcm2835_chip *chip)
 {
int err;
 
-   strcpy(chip->card->mixername, "Broadcom Mixer");
+   strscpy(chip->card->mixername, "Broadcom Mixer", sizeof(chip->card->mixername));
err = create_ctls(chip, ARRAY_SIZE(snd_bcm2835_ctl), snd_bcm2835_ctl);
if (err < 0)
return err;
@@ -261,7 +261,7 @@ static const struct snd_kcontrol_new snd_bcm2835_headphones_ctl[] = {
 
 int snd_bcm2835_new_headphones_ctl(struct bcm2835_chip *chip)
 {
-   strcpy(chip->card->mixername, "Broadcom Mixer");
+   strscpy(chip->card->mixername, "Broadcom Mixe

Re: [ovs-dev] openvswitch crash on i386

2019-03-06 Thread Juerg Haefliger
On Tue, 5 Mar 2019 11:58:42 -0800
Joe Stringer  wrote:

> On Tue, Mar 5, 2019 at 2:12 AM Christian Ehrhardt
>  wrote:
> >
> > On Tue, Mar 5, 2019 at 10:58 AM Juerg Haefliger
> >  wrote:  
> > >
> > > Hi,
> > >
> > > Running the following commands in a loop will crash an i386 5.0 kernel
> > > typically within a few iterations:
> > >
> > > ovs-vsctl add-br test
> > > ovs-vsctl del-br test
> > >
> > > [  106.215748] BUG: unable to handle kernel paging request at e8a35f3b
> > > [  106.216733] #PF error: [normal kernel read fault]
> > > [  106.217464] *pdpt = 19a76001 *pde = 
> > > [  106.218346] Oops:  [#1] SMP PTI
> > > [  106.218911] CPU: 0 PID: 2050 Comm: systemd-udevd Tainted: G
> > > E 5.0.0 #25
> > > [  106.220103] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
> > > BIOS 1.11.1-1ubuntu1 04/01/2014
> > > [  106.221447] EIP: kmem_cache_alloc_trace+0x7a/0x1b0
> > > [  106.222178] Code: 01 00 00 8b 07 64 8b 50 04 64 03 05 28 61 e8 d2 8b 
> > > 08 89 4d ec 85 c9 0f 84 03 01 00 00 8b 45 ec 8b 5f 14 8d 4a 01 8b 37 01 
> > > c3 <33> 1b 33 9f b4 00 00 00 64 0f c7 0e 75 cb 8b 75 ec 8b 47 14 0f 18
> > > [  106.224752] EAX: e8a35f3b EBX: e8a35f3b ECX: 869f EDX: 869e
> > > [  106.225683] ESI: d2e96ef0 EDI: da401a00 EBP: d9b85dd0 ESP: d9b85db0
> > > [  106.226662] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 
> > > 00010282
> > > [  106.227710] CR0: 80050033 CR2: e8a35f3b CR3: 185b8000 CR4: 06f0
> > > [  106.228703] DR0:  DR1:  DR2:  DR3: 
> > > [  106.229604] DR6: fffe0ff0 DR7: 0400
> > > [  106.230114] Call Trace:
> > > [  106.230525]  ? kernfs_fop_open+0xb4/0x390
> > > [  106.231176]  kernfs_fop_open+0xb4/0x390
> > > [  106.231856]  ? security_file_open+0x7c/0xc0
> > > [  106.232562]  do_dentry_open+0x131/0x370
> > > [  106.233229]  ? kernfs_fop_write+0x180/0x180
> > > [  106.233905]  vfs_open+0x25/0x30
> > > [  106.234432]  path_openat+0x2fd/0x1450
> > > [  106.235084]  ? cp_new_stat64+0x115/0x140
> > > [  106.235754]  ? cp_new_stat64+0x115/0x140
> > > [  106.236427]  do_filp_open+0x6a/0xd0
> > > [  106.237026]  ? cp_new_stat64+0x115/0x140
> > > [  106.237748]  ? strncpy_from_user+0x3d/0x180
> > > [  106.238539]  ? __alloc_fd+0x36/0x120
> > > [  106.239256]  do_sys_open+0x175/0x210
> > > [  106.239955]  sys_openat+0x1b/0x20
> > > [  106.240596]  do_fast_syscall_32+0x7f/0x1e0
> > > [  106.241313]  entry_SYSENTER_32+0x6b/0xbe
> > > [  106.242017] EIP: 0xb7fae871
> > > [  106.242559] Code: 8b 98 58 cd ff ff 89 c8 85 d2 74 02 89 0a 5b 5d c3 
> > > 8b 04 24 c3 8b 14 24 c3 8b 34 24 c3 8b 3c 24 c3 51 52 55 89 e5 0f 34 cd 
> > > 80 <5d> 5a 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76
> > > [  106.245551] EAX: ffda EBX: ff9c ECX: bffdcb60 EDX: 00088000
> > > [  106.246651] ESI:  EDI: b7f9e000 EBP: 00088000 ESP: bffdc970
> > > [  106.247706] DS: 007b ES: 007b FS:  GS: 0033 SS: 007b EFLAGS: 
> > > 0246
> > > [  106.248851] Modules linked in: openvswitch(E)
> > > [  106.249621] CR2: e8a35f3b
> > > [  106.250218] ---[ end trace 6a8d05679a59cda7 ]---
> > >
> > > I've bisected this down to the following commit that seems to have
> > > introduced the issue:
> > >
> > > commit 120645513f55a4ac5543120d9e79925d30a0156f (refs/bisect/bad)
> > > Author: Jarno Rajahalme 
> > > Date:   Fri Apr 21 16:48:06 2017 -0700
> > >
> > > openvswitch: Add eventmask support to CT action.
> > >
> > > Add a new optional conntrack action attribute OVS_CT_ATTR_EVENTMASK,
> > > which can be used in conjunction with the commit flag
> > > (OVS_CT_ATTR_COMMIT) to set the mask of bits specifying which
> > > conntrack events (IPCT_*) should be delivered via the Netfilter
> > > netlink multicast groups.  Default behavior depends on the system
> > > configuration, but typically a lot of events are delivered.  This can be
> > > very chatty for the NFNLGRP_CONNTRACK_UPDATE group, even if only some
> > > types of events are of interest.
> > >
> > > Netfilter core init_conntrack() adds the event cache extension, so we
> > > only need to set the ctmask value.  However, if the system is
> > > config

openvswitch crash on i386

2019-03-05 Thread Juerg Haefliger
Hi,

Running the following commands in a loop will crash an i386 5.0 kernel
typically within a few iterations:

ovs-vsctl add-br test
ovs-vsctl del-br test
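
E.g., in loop form:

  # while true; do ovs-vsctl add-br test; ovs-vsctl del-br test; done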

[  106.215748] BUG: unable to handle kernel paging request at e8a35f3b
[  106.216733] #PF error: [normal kernel read fault]
[  106.217464] *pdpt = 19a76001 *pde =  
[  106.218346] Oops:  [#1] SMP PTI
[  106.218911] CPU: 0 PID: 2050 Comm: systemd-udevd Tainted: GE 
5.0.0 #25
[  106.220103] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.11.1-1ubuntu1 04/01/2014
[  106.221447] EIP: kmem_cache_alloc_trace+0x7a/0x1b0
[  106.222178] Code: 01 00 00 8b 07 64 8b 50 04 64 03 05 28 61 e8 d2 8b 08 89 
4d ec 85 c9 0f 84 03 01 00 00 8b 45 ec 8b 5f 14 8d 4a 01 8b 37 01 c3 <33> 1b 33 
9f b4 00 00 00 64 0f c7 0e 75 cb 8b 75 ec 8b 47 14 0f 18
[  106.224752] EAX: e8a35f3b EBX: e8a35f3b ECX: 869f EDX: 869e
[  106.225683] ESI: d2e96ef0 EDI: da401a00 EBP: d9b85dd0 ESP: d9b85db0
[  106.226662] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010282
[  106.227710] CR0: 80050033 CR2: e8a35f3b CR3: 185b8000 CR4: 06f0
[  106.228703] DR0:  DR1:  DR2:  DR3: 
[  106.229604] DR6: fffe0ff0 DR7: 0400
[  106.230114] Call Trace:
[  106.230525]  ? kernfs_fop_open+0xb4/0x390
[  106.231176]  kernfs_fop_open+0xb4/0x390
[  106.231856]  ? security_file_open+0x7c/0xc0
[  106.232562]  do_dentry_open+0x131/0x370
[  106.233229]  ? kernfs_fop_write+0x180/0x180
[  106.233905]  vfs_open+0x25/0x30
[  106.234432]  path_openat+0x2fd/0x1450
[  106.235084]  ? cp_new_stat64+0x115/0x140
[  106.235754]  ? cp_new_stat64+0x115/0x140
[  106.236427]  do_filp_open+0x6a/0xd0
[  106.237026]  ? cp_new_stat64+0x115/0x140
[  106.237748]  ? strncpy_from_user+0x3d/0x180
[  106.238539]  ? __alloc_fd+0x36/0x120
[  106.239256]  do_sys_open+0x175/0x210
[  106.239955]  sys_openat+0x1b/0x20
[  106.240596]  do_fast_syscall_32+0x7f/0x1e0
[  106.241313]  entry_SYSENTER_32+0x6b/0xbe
[  106.242017] EIP: 0xb7fae871
[  106.242559] Code: 8b 98 58 cd ff ff 89 c8 85 d2 74 02 89 0a 5b 5d c3 8b 04 
24 c3 8b 14 24 c3 8b 34 24 c3 8b 3c 24 c3 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 
c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76
[  106.245551] EAX: ffda EBX: ff9c ECX: bffdcb60 EDX: 00088000
[  106.246651] ESI:  EDI: b7f9e000 EBP: 00088000 ESP: bffdc970
[  106.247706] DS: 007b ES: 007b FS:  GS: 0033 SS: 007b EFLAGS: 0246
[  106.248851] Modules linked in: openvswitch(E)
[  106.249621] CR2: e8a35f3b
[  106.250218] ---[ end trace 6a8d05679a59cda7 ]---

I've bisected this down to the following commit that seems to have introduced
the issue:

commit 120645513f55a4ac5543120d9e79925d30a0156f (refs/bisect/bad)
Author: Jarno Rajahalme 
Date:   Fri Apr 21 16:48:06 2017 -0700

openvswitch: Add eventmask support to CT action.

Add a new optional conntrack action attribute OVS_CT_ATTR_EVENTMASK,
which can be used in conjunction with the commit flag
(OVS_CT_ATTR_COMMIT) to set the mask of bits specifying which
conntrack events (IPCT_*) should be delivered via the Netfilter
netlink multicast groups.  Default behavior depends on the system
configuration, but typically a lot of events are delivered.  This can be
very chatty for the NFNLGRP_CONNTRACK_UPDATE group, even if only some
types of events are of interest.

Netfilter core init_conntrack() adds the event cache extension, so we
only need to set the ctmask value.  However, if the system is
configured without support for events, the setting will be skipped due
to extension not being found.

Signed-off-by: Jarno Rajahalme 
Reviewed-by: Greg Rose 
Acked-by: Joe Stringer 
Signed-off-by: David S. Miller 

Reverting that commit from 5.0 makes the problem go away. I'm not able to
reproduce the crash on x86_64.

...Juerg




Re: [RESEND PATCH] selftests/ftrace: Handle the absence of tput

2019-02-25 Thread Juerg Haefliger
On Mon, 25 Feb 2019 07:51:13 -0700
shuah  wrote:

> On 2/25/19 6:14 AM, Juerg Haefliger wrote:
> > In environments where tput is not available, we get the following
> > error
> > $ ./ftracetest: 163: [: Illegal number:
> > because ncolors is an empty string. Fix that by setting it to 0 if the
> > tput command fails.
> > 
> > Acked-by: Steven Rostedt (VMware) 
> > Acked-by: Masami Hiramatsu 
> > Signed-off-by: Juerg Haefliger 
> > ---
> >   tools/testing/selftests/ftrace/ftracetest | 4 ++--
> >   1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
> > index 75244db70331..fc755e1b50f1 100755
> > --- a/tools/testing/selftests/ftrace/ftracetest
> > +++ b/tools/testing/selftests/ftrace/ftracetest
> > @@ -154,13 +154,13 @@ fi
> >   
> >   # Define text colors
> >   # Check available colors on the terminal, if any
> > -ncolors=`tput colors 2>/dev/null`
> > +ncolors=`tput colors 2>/dev/null || echo 0`
> >   color_reset=
> >   color_red=
> >   color_green=
> >   color_blue=
> >   # If stdout exists and number of colors is eight or more, use them
> > -if [ -t 1 -a "$ncolors" -a "$ncolors" -ge 8 ]; then
> > +if [ -t 1 -a "$ncolors" -ge 8 ]; then
> > color_reset="\e[0m"
> > color_red="\e[31m"
> > color_green="\e[32m"
> >   
> 
> Juerg!
> 
> Thanks for the resend. Applied to linux-kselftest next for 5.1-rc1.
> 
> Since I applied this patch out of order, I had to resolve minor merge
> conflict. Please review.

Looks good.

Thanks Shuah!
...Juerg


> thanks,
> -- Shuah





[RESEND PATCH] selftests/ftrace: Handle the absence of tput

2019-02-25 Thread Juerg Haefliger
In environments where tput is not available, we get the following
error
$ ./ftracetest: 163: [: Illegal number:
because ncolors is an empty string. Fix that by setting it to 0 if the
tput command fails.
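
For example, with dash providing /bin/sh, an empty ncolors reproduces the
failure:

  $ dash -c '[ -t 1 -a "" -ge 8 ]'
  dash: 1: [: Illegal number: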

Acked-by: Steven Rostedt (VMware) 
Acked-by: Masami Hiramatsu 
Signed-off-by: Juerg Haefliger 
---
 tools/testing/selftests/ftrace/ftracetest | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index 75244db70331..fc755e1b50f1 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -154,13 +154,13 @@ fi
 
 # Define text colors
 # Check available colors on the terminal, if any
-ncolors=`tput colors 2>/dev/null`
+ncolors=`tput colors 2>/dev/null || echo 0`
 color_reset=
 color_red=
 color_green=
 color_blue=
 # If stdout exists and number of colors is eight or more, use them
-if [ -t 1 -a "$ncolors" -a "$ncolors" -ge 8 ]; then
+if [ -t 1 -a "$ncolors" -ge 8 ]; then
   color_reset="\e[0m"
   color_red="\e[31m"
   color_green="\e[32m"
-- 
2.19.1



Re: [RESEND PATCH v2 2/2] selftests/ftrace: Replace \e with \033

2019-02-23 Thread Juerg Haefliger
On Fri, 22 Feb 2019 15:46:03 -0700
shuah  wrote:

> On 2/22/19 1:53 PM, Juerg Haefliger wrote:
> > The \e sequence character is not POSIX. Fix that by using \033 instead.
> > 
> > Acked-by: Steven Rostedt (VMware) 
> > Acked-by: Masami Hiramatsu 
> > Signed-off-by: Juerg Haefliger 
> > ---
> >   tools/testing/selftests/ftrace/ftracetest | 8 
> >   1 file changed, 4 insertions(+), 4 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
> > index 20c9c0ad8682..136387422b00 100755
> > --- a/tools/testing/selftests/ftrace/ftracetest
> > +++ b/tools/testing/selftests/ftrace/ftracetest
> > @@ -161,10 +161,10 @@ color_green=
> >   color_blue=
> >   # If stdout exists and number of colors is eight or more, use them
> >   if [ -t 1 -a "$ncolors" -ge 8 ]; then
> > -  color_reset="\e[0m"
> > -  color_red="\e[31m"
> > -  color_green="\e[32m"
> > -  color_blue="\e[34m"
> > +  color_reset="\033[0m"
> > +  color_red="\033[31m"
> > +  color_green="\033[32m"
> > +  color_blue="\033[34m"
> >   fi
> >   
> >   strip_esc() {
> >   
> 
> Which release is this patch based on?
> 
> This one failed to apply to linux-kselftest next - I resolved a minor
> merge conflict and applied it. Please review to make sure it looks good.

Looks good. It didn't apply cleanly because you don't have
https://lore.kernel.org/lkml/20190220153706.24686-1-jue...@canonical.com/T/#u
Sorry about not copying you on that patch. Want me to resend?

...Juerg


> thanks,
> -- Shuah





[RESEND PATCH v2 0/2] selftests/ftrace: Make ftracetest POSIX compliant

2019-02-22 Thread Juerg Haefliger
Add sh...@kernel.org and linux-kselft...@vger.kernel.org.

The recent addition of colored output introduced some non-POSIX-compliant
constructs. Fix that.

Juerg Haefliger (2):
  selftests/ftrace: Replace echo -e with printf
  selftests/ftrace: Replace \e with \033

 tools/testing/selftests/ftrace/ftracetest | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

-- 
2.19.1



[RESEND PATCH v2 2/2] selftests/ftrace: Replace \e with \033

2019-02-22 Thread Juerg Haefliger
The \e sequence character is not POSIX. Fix that by using \033 instead.
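
A quick sanity check of the two forms (assuming dash, whose printf emits
the octal escape but passes the unrecognized \e through literally):

  $ dash -c "printf '\033'" | od -An -tx1
   1b
  $ dash -c "printf '\e'" | od -An -tx1
   5c 65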

Acked-by: Steven Rostedt (VMware) 
Acked-by: Masami Hiramatsu 
Signed-off-by: Juerg Haefliger 
---
 tools/testing/selftests/ftrace/ftracetest | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index 20c9c0ad8682..136387422b00 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -161,10 +161,10 @@ color_green=
 color_blue=
 # If stdout exists and number of colors is eight or more, use them
 if [ -t 1 -a "$ncolors" -ge 8 ]; then
-  color_reset="\e[0m"
-  color_red="\e[31m"
-  color_green="\e[32m"
-  color_blue="\e[34m"
+  color_reset="\033[0m"
+  color_red="\033[31m"
+  color_green="\033[32m"
+  color_blue="\033[34m"
 fi
 
 strip_esc() {
-- 
2.19.1



[RESEND PATCH v2 1/2] selftests/ftrace: Replace echo -e with printf

2019-02-22 Thread Juerg Haefliger
echo -e is not POSIX. Depending on what /bin/sh is, we can get
incorrect output like:
$ -e -n [1] Basic trace file check
$ -e[PASS]

Fix that by using printf instead.
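
For example, with dash as /bin/sh (its echo builtin has no -e option, so
the flag is echoed literally):

  $ dash -c 'echo -e "[PASS]"'
  -e [PASS]
  $ dash -c 'printf "[PASS]\n"'
  [PASS]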

Acked-by: Steven Rostedt (VMware) 
Acked-by: Masami Hiramatsu 
Signed-off-by: Juerg Haefliger 
---
 tools/testing/selftests/ftrace/ftracetest | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index fc755e1b50f1..20c9c0ad8682 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -173,8 +173,13 @@ strip_esc() {
 }
 
 prlog() { # messages
-  echo -e "$@"
-  [ "$LOG_FILE" ] && echo -e "$@" | strip_esc >> $LOG_FILE
+  newline="\n"
+  if [ "$1" = "-n" ] ; then
+newline=
+shift
+  fi
+  printf "$*$newline"
+  [ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE
 }
 catlog() { #file
   cat $1
-- 
2.19.1



[PATCH v2 0/2] selftests/ftrace: Make ftracetest POSIX compliant

2019-02-22 Thread Juerg Haefliger
The recent addition of colored output introduced some non-POSIX-compliant
constructs. Fix that.

Juerg Haefliger (2):
  selftests/ftrace: Replace echo -e with printf
  selftests/ftrace: Replace \e with \033

 tools/testing/selftests/ftrace/ftracetest | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

-- 
2.19.1



[PATCH v2 1/2] selftests/ftrace: Replace echo -e with printf

2019-02-22 Thread Juerg Haefliger
echo -e is not POSIX. Depending on what /bin/sh is, we can get
incorrect output like:
$ -e -n [1] Basic trace file check
$ -e[PASS]

Fix that by using printf instead.

Signed-off-by: Juerg Haefliger 
---
 tools/testing/selftests/ftrace/ftracetest | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index fc755e1b50f1..20c9c0ad8682 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -173,8 +173,13 @@ strip_esc() {
 }
 
 prlog() { # messages
-  echo -e "$@"
-  [ "$LOG_FILE" ] && echo -e "$@" | strip_esc >> $LOG_FILE
+  newline="\n"
+  if [ "$1" = "-n" ] ; then
+newline=
+shift
+  fi
+  printf "$*$newline"
+  [ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE
 }
 catlog() { #file
   cat $1
-- 
2.19.1



[PATCH v2 2/2] selftests/ftrace: Replace \e with \033

2019-02-22 Thread Juerg Haefliger
The \e sequence character is not POSIX. Fix that by using \033 instead.

Signed-off-by: Juerg Haefliger 
---
 tools/testing/selftests/ftrace/ftracetest | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index 20c9c0ad8682..136387422b00 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -161,10 +161,10 @@ color_green=
 color_blue=
 # If stdout exists and number of colors is eight or more, use them
 if [ -t 1 -a "$ncolors" -ge 8 ]; then
-  color_reset="\e[0m"
-  color_red="\e[31m"
-  color_green="\e[32m"
-  color_blue="\e[34m"
+  color_reset="\033[0m"
+  color_red="\033[31m"
+  color_green="\033[32m"
+  color_blue="\033[34m"
 fi
 
 strip_esc() {
-- 
2.19.1



Re: [PATCH] selftests/ftrace: Make the coloring POSIX compliant

2019-02-20 Thread Juerg Haefliger
On Wed, 20 Feb 2019 14:49:34 -0500
Steven Rostedt  wrote:

> On Wed, 20 Feb 2019 17:13:33 +0100
> Juerg Haefliger  wrote:
> 
> > echo -e and \e are not POSIX. Depending on what /bin/sh is, we can get
> > incorrect output like:  
> 
> I'm curious to which shell this is.

Quite frankly I don't know but that's the output we get when we run it in
Jenkins. I'll try to find out.


> > $ -e -n [1] Basic trace file check
> > $ -e[PASS]
> > 
> > Fix that by using \033 instead of \e and printf.
> > 
> > Signed-off-by: Juerg Haefliger 
> > ---
> >  tools/testing/selftests/ftrace/ftracetest | 17 +++--
> >  1 file changed, 11 insertions(+), 6 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
> > index fc755e1b50f1..f200898e3e2c 100755
> > --- a/tools/testing/selftests/ftrace/ftracetest
> > +++ b/tools/testing/selftests/ftrace/ftracetest
> > @@ -161,10 +161,10 @@ color_green=
> >  color_blue=
> >  # If stdout exists and number of colors is eight or more, use them
> >  if [ -t 1 -a "$ncolors" -ge 8 ]; then
> > -  color_reset="\e[0m"
> > -  color_red="\e[31m"
> > -  color_green="\e[32m"
> > -  color_blue="\e[34m"
> > +  color_reset="\033[0m"
> > +  color_red="\033[31m"
> > +  color_green="\033[32m"
> > +  color_blue="\033[34m"
> >  fi
> >  
> >  strip_esc() {
> > @@ -173,8 +173,13 @@ strip_esc() {
> >  }
> >  
> >  prlog() { # messages
> > -  echo -e "$@"
> > -  [ "$LOG_FILE" ] && echo -e "$@" | strip_esc >> $LOG_FILE
> > +  newline="\n"
> > +  if [ "$1" = "-n" ] ; then
> > +newline=
> > +shift
> > +  fi
> > +  printf "$@$newline"
> > +  [ "$LOG_FILE" ] && printf "$@$newline" | strip_esc >> $LOG_FILE
> >  }
> >  catlog() { #file
> >cat $1  
> 
> This should probably be split into two patches, as they are two
> different issues.

Will do.

...Juerg

 
> -- Steve





[PATCH] selftests/ftrace: Make the coloring POSIX compliant

2019-02-20 Thread Juerg Haefliger
echo -e and \e are not POSIX. Depending on what /bin/sh is, we can get
incorrect output like:
$ -e -n [1] Basic trace file check
$ -e[PASS]

Fix that by using \033 instead of \e and printf.

Signed-off-by: Juerg Haefliger 
---
 tools/testing/selftests/ftrace/ftracetest | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index fc755e1b50f1..f200898e3e2c 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -161,10 +161,10 @@ color_green=
 color_blue=
 # If stdout exists and number of colors is eight or more, use them
 if [ -t 1 -a "$ncolors" -ge 8 ]; then
-  color_reset="\e[0m"
-  color_red="\e[31m"
-  color_green="\e[32m"
-  color_blue="\e[34m"
+  color_reset="\033[0m"
+  color_red="\033[31m"
+  color_green="\033[32m"
+  color_blue="\033[34m"
 fi
 
 strip_esc() {
@@ -173,8 +173,13 @@ strip_esc() {
 }
 
 prlog() { # messages
-  echo -e "$@"
-  [ "$LOG_FILE" ] && echo -e "$@" | strip_esc >> $LOG_FILE
+  newline="\n"
+  if [ "$1" = "-n" ] ; then
+newline=
+shift
+  fi
+  printf "$@$newline"
+  [ "$LOG_FILE" ] && printf "$@$newline" | strip_esc >> $LOG_FILE
 }
 catlog() { #file
   cat $1
-- 
2.19.1



[PATCH] selftests/ftrace: Handle the absence of tput

2019-02-20 Thread Juerg Haefliger
In environments where tput is not available, we get the following
error
$ ./ftracetest: 163: [: Illegal number:
because ncolors is an empty string. Fix that by setting it to 0 if the
tput command fails.

Signed-off-by: Juerg Haefliger 
---
 tools/testing/selftests/ftrace/ftracetest | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index 75244db70331..fc755e1b50f1 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -154,13 +154,13 @@ fi
 
 # Define text colors
 # Check available colors on the terminal, if any
-ncolors=`tput colors 2>/dev/null`
+ncolors=`tput colors 2>/dev/null || echo 0`
 color_reset=
 color_red=
 color_green=
 color_blue=
 # If stdout exists and number of colors is eight or more, use them
-if [ -t 1 -a "$ncolors" -a "$ncolors" -ge 8 ]; then
+if [ -t 1 -a "$ncolors" -ge 8 ]; then
   color_reset="\e[0m"
   color_red="\e[31m"
   color_green="\e[32m"
-- 
2.19.1



Re: Redoing eXclusive Page Frame Ownership (XPFO) with isolated CPUs in mind (for KVM to isolate its guests per CPU)

2018-09-13 Thread Juerg Haefliger
On Wed, Sep 12, 2018 at 5:37 PM, Julian Stecklina  wrote:
> Julian Stecklina  writes:
>
>> Linus Torvalds  writes:
>>
>>> On Fri, Aug 31, 2018 at 12:45 AM Julian Stecklina  
>>> wrote:

 I've been spending some cycles on the XPFO patch set this week. For the
 patch set as it was posted for v4.13, the performance overhead of
 compiling a Linux kernel is ~40% on x86_64[1]. The overhead comes almost
 completely from TLB flushing. If we can live with stale TLB entries
 allowing temporary access (which I think is reasonable), we can remove
 all TLB flushing (on x86). This reduces the overhead to 2-3% for
 kernel compile.
>>>
>>> I have to say, even 2-3% for a kernel compile sounds absolutely horrendous.
>>
>> Well, it's at least in a range where it doesn't look hopeless.
>>
>>> Kernel builds are 90% user space at least for me, so a 2-3% slowdown
>>> from a kernel is not some small unnoticeable thing.
>>
>> The overhead seems to come from the hooks that XPFO adds to
>> alloc/free_pages. These hooks add a couple of atomic operations per
>> allocated (4K) page for book keeping. Some of these atomic ops are only
>> for debugging and could be removed. There is also some opportunity to
>> streamline the per-page space overhead of XPFO.
>
> I've updated my XPFO branch[1] to make some of the debugging optional
> and also integrated the XPFO bookkeeping with struct page, instead of
> requiring CONFIG_PAGE_EXTENSION, which removes some checks in the hot
> path.

FWIW, that was my original design but there was some resistance to
adding more to the page struct and page extension was suggested
instead.


> These changes push the overhead down to somewhere between 1.5 and
> 2% for my quad core box in kernel compile. This is close to the
> measurement noise, so I take suggestions for a better benchmark here.
>
> Of course, if you hit contention on the xpfo spinlock then performance
> will suffer. I guess this is what happened on Khalid's large box.
>
> I'll try to remove the spinlocks and add fixup code to the pagefault
> handler to see whether this improves the situation on large boxes. This
> might turn out to be ugly, though.

I'm wondering how much performance we're losing by having to split
hugepages. Any chance this can be quantified somehow? Maybe we can
have a pool of some sorts reserved for userpages and group allocations
so that we can track the XPFO state at the hugepage level instead of
at the 4k level to prevent/reduce page splitting. Not sure if that
causes issues or has any unwanted side effects though...

...Juerg


> Julian
>
> [1] http://git.infradead.org/users/jsteckli/linux-xpfo.git/shortlog/refs/heads/xpfo-master
> --
> Amazon Development Center Germany GmbH
> Berlin - Dresden - Aachen
> main office: Krausenstr. 38, 10117 Berlin
> Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
> Ust-ID: DE289237879
> Eingetragen am Amtsgericht Charlottenburg HRB 149173 B
>


KAISER: kexec triggers a warning

2017-12-01 Thread Juerg Haefliger
Loading a kexec kernel using today's linux-tip master with KAISER=y
triggers the following warning:

[   18.054017] [ cut here ]
[   18.054024] WARNING: CPU: 0 PID: 1183 at
./arch/x86/include/asm/pgtable_64.h:258 native_set_p4d+0x5f/0x80
[   18.054025] Modules linked in: nls_utf8 isofs ppdev nls_iso8859_1
kvm_intel kvm irqbypass input_leds serio_raw i2c_piix4 parport_pc
parport qemu_fw_cfg mac_hid 9p fscache ib_iser rdma_cm iw_cm ib_cm
ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi
9pnet_virtio 9pnet ip_tables x_tables autofs4 btrfs zstd_decompress
zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy
async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath
linear cirrus ttm drm_kms_helper syscopyarea sysfillrect sysimgblt
fb_sys_fops psmouse virtio_blk virtio_net drm floppy pata_acpi
[   18.054047] CPU: 0 PID: 1183 Comm: kexec Not tainted 4.14.0-kaiser+ #2
[   18.054047] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.10.2-1ubuntu1 04/01/2014
[   18.054048] task: 8a53f9d0ab80 task.stack: aba64089
[   18.054049] RIP: 0010:native_set_p4d+0x5f/0x80
[   18.054050] RSP: 0018:aba640893e20 EFLAGS: 00010246
[   18.054051] RAX: 38ac9063 RBX: 3ffda000 RCX:
3ffda000
[   18.054051] RDX: 8a53fd1f6ff8 RSI: 38ac9063 RDI:
8a53f71ba000
[   18.054051] RBP: 3ffda000 R08: 75abc000 R09:
8a53f8ac9000
[   18.054052] R10: 0003 R11: 3ffda000 R12:
8a53f71ba000
[   18.054052] R13: aba640893e78 R14:  R15:
ff80
[   18.054053] FS:  7f0e95188740() GS:8a53ffc0()
knlGS:
[   18.054054] CS:  0010 DS:  ES:  CR0: 80050033
[   18.054054] CR2: 7f0e94bf0fa0 CR3: 3c452000 CR4:
06f0
[   18.054056] DR0:  DR1:  DR2:

[   18.054056] DR3:  DR6: fffe0ff0 DR7:
0400
[   18.054057] Call Trace:
[   18.054065]  kernel_ident_mapping_init+0x147/0x190
[   18.054069]  machine_kexec_prepare+0xc8/0x490
[   18.054071]  ? trace_clock_x86_tsc+0x10/0x10
[   18.054074]  do_kexec_load+0x1d7/0x2d0
[   18.054079]  SyS_kexec_load+0x84/0xc0
[   18.054083]  entry_SYSCALL_64_fastpath+0x1e/0x81
[   18.054087] RIP: 0033:0x7f0e94c9c9f9
[   18.054087] RSP: 002b:7ffe0d1a83e8 EFLAGS: 0246 ORIG_RAX:
00f6
[   18.054088] RAX: ffda RBX: 55a491d71240 RCX:
7f0e94c9c9f9
[   18.054089] RDX: 55a492aaa570 RSI: 0003 RDI:
3ffd1730
[   18.054089] RBP: 7ffe0d1a8510 R08: 0008 R09:
0001
[   18.054089] R10: 003e R11: 0246 R12:
0100
[   18.054090] R13: 0040 R14: 01b1d820 R15:
0007c7e0
[   18.054090] Code: 37 c3 f6 07 04 74 1b 48 89 f8 25 ff 0f 00 00 48 3d
ff 07 00 00 77 18 48 89 f8 80 cc 10 48 89 30 eb dc 83 3d 07 6d ff 00 02
75 d3 <0f> ff eb cf 0f ff eb cb 48 b8 00 00 00 00 00 00 00 80 48 09 c6
[   18.054104] ---[ end trace f206deb161cf8af0 ]---

...Juerg





Re: [Jfs-discussion] [PATCH] jfs: Add missing NULL pointer check in __get_metapage

2017-11-02 Thread Juerg Haefliger


On 11/02/2017 02:15 PM, Dave Kleikamp wrote:
> On 11/02/2017 01:59 AM, Juerg Haefliger wrote:
>>
>>
>> On 10/30/2017 11:13 PM, Dave Kleikamp wrote:
>>> On 10/25/2017 02:50 AM, Juerg Haefliger wrote:
>>>> Is this a patch you might consider?
>>>
>>> Sorry it's taken me so long to respond.
>>>
>>> I don't think this is the right fix. A failed allocation will still
>>> result in a null pointer dereference by the caller, __get_metapage(). I
>>> think the check needs to be put there. Like this:
>>>
>>> --- a/fs/jfs/jfs_metapage.c
>>> +++ b/fs/jfs/jfs_metapage.c
>>> @@ -663,6 +663,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
>>> } else {
>>> INCREMENT(mpStat.pagealloc);
>>> mp = alloc_metapage(GFP_NOFS);
>>> +   if (!mp)
>>> +   goto unlock;
>>> mp->page = page;
>>> mp->sb = inode->i_sb;
>>> mp->flag = 0;
>>
>> I don't understand. This is part of the patch that I sent.
> 
> Doh! How'd I miss that?

:-)


>>
>>
>>>
>>> Furthermore, it looks like all the callers of __get_metapage() check for
>>> a null return, so I'm not sure we need to handle the error at this
>>> point. I might have to look a bit harder at that, since there are many
>>> callers.
>>
>> I don't understand this either :-) Yes, the callers do check for a null
>> pointer but things blow up (in __get_metapage) before that check without
>> the above fix.
> 
> Yeah, the fix to __get_metapage() is necessary. I'm not convinced the
> first part of the patch, to alloc_metapage(), is necessary.

It's not. I just thought it'd be nice to get some sort of notification
in the log when the alloc fails. But if the callers log it then that's fine.

...Juerg


>>
>> ...Juerg
>>
>>
>>>
>>> Thanks,
>>> Shaggy
>>>
>>>>
>>>> Thanks
>>>> ...Juerg
>>>>
>>>>
>>>> On 10/04/2017 10:24 AM, Juerg Haefliger wrote:
>>>>> alloc_metapage can return a NULL pointer so check for that. And also emit
>>>>> an error message if that happens.
>>>>>
>>>>> Signed-off-by: Juerg Haefliger <juerg.haefli...@canonical.com>
>>>>> ---
>>>>>  fs/jfs/jfs_metapage.c | 20 +---
>>>>>  1 file changed, 13 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
>>>>> index 1c4b9ad4d7ab..00f21af66872 100644
>>>>> --- a/fs/jfs/jfs_metapage.c
>>>>> +++ b/fs/jfs/jfs_metapage.c
>>>>> @@ -187,14 +187,18 @@ static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
>>>>>  {
>>>>>   struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask);
>>>>>  
>>>>> - if (mp) {
>>>>> - mp->lid = 0;
>>>>> - mp->lsn = 0;
>>>>> - mp->data = NULL;
>>>>> - mp->clsn = 0;
>>>>> - mp->log = NULL;
>>>>> - init_waitqueue_head(&mp->wait);
>>>>> + if (!mp) {
>>>>> + jfs_err("mempool_alloc failed!\n");
>>>>> + return NULL;
>>>>>   }
>>>>> +
>>>>> + mp->lid = 0;
>>>>> + mp->lsn = 0;
>>>>> + mp->data = NULL;
>>>>> + mp->clsn = 0;
>>>>> + mp->log = NULL;
>>>>> + init_waitqueue_head(&mp->wait);
>>>>> +
>>>>>   return mp;
>>>>>  }
>>>>>  
>>>>> @@ -663,6 +667,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
>>>>>   } else {
>>>>>   INCREMENT(mpStat.pagealloc);
>>>>>   mp = alloc_metapage(GFP_NOFS);
>>>>> + if (!mp)
>>>>> + goto unlock;
>>>>>   mp->page = page;
>>>>>   mp->sb = inode->i_sb;
>>>>>   mp->flag = 0;
>>>>>
>>



signature.asc
Description: OpenPGP digital signature
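
For reference, the caller-side convention discussed in this thread looks
roughly like the sketch below. The function is hypothetical; read_metapage()
is the real wrapper that expands to __get_metapage(..., false, 0):

/* Hypothetical caller, sketching the NULL handling discussed above. */
static int example_read(struct inode *inode, s64 lblock)
{
        struct metapage *mp;

        mp = read_metapage(inode, lblock, PSIZE, 1);
        if (mp == NULL)
                return -EIO;    /* a failed allocation ends up here safely */

        /* ... use mp->data ... */

        release_metapage(mp);
        return 0;
}
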


Re: [Jfs-discussion] [PATCH] jfs: Add missing NULL pointer check in __get_metapage

2017-11-02 Thread Juerg Haefliger


On 10/30/2017 11:13 PM, Dave Kleikamp wrote:
> On 10/25/2017 02:50 AM, Juerg Haefliger wrote:
>> Is this a patch you might consider?
> 
> Sorry it's taken me so long to respond.
> 
> I don't think this is the right fix. A failed allocation will still
> result in a null pointer dereference by the caller, __get_metapage(). I
> think the check needs to be put there. Like this:
> 
> --- a/fs/jfs/jfs_metapage.c
> +++ b/fs/jfs/jfs_metapage.c
> @@ -663,6 +663,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
>   } else {
>   INCREMENT(mpStat.pagealloc);
>   mp = alloc_metapage(GFP_NOFS);
> + if (!mp)
> + goto unlock;
>   mp->page = page;
>   mp->sb = inode->i_sb;
>   mp->flag = 0;

I don't understand. This is part of the patch that I sent.


> 
> Furthermore, it looks like all the callers of __get_metapage() check for
> a null return, so I'm not sure we need to handle the error at this
> point. I might have to look a bit harder at that, since there are many
> callers.

I don't understand this either :-) Yes, the callers do check for a null
pointer but things blow up (in __get_metapage) before that check without
the above fix.

...Juerg


> 
> Thanks,
> Shaggy
> 
>>
>> Thanks
>> ...Juerg
>>
>>
>> On 10/04/2017 10:24 AM, Juerg Haefliger wrote:
>>> alloc_metapage can return a NULL pointer so check for that. And also emit
>>> an error message if that happens.
>>>
>>> Signed-off-by: Juerg Haefliger <juerg.haefli...@canonical.com>
>>> ---
>>>  fs/jfs/jfs_metapage.c | 20 +---
>>>  1 file changed, 13 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
>>> index 1c4b9ad4d7ab..00f21af66872 100644
>>> --- a/fs/jfs/jfs_metapage.c
>>> +++ b/fs/jfs/jfs_metapage.c
>>> @@ -187,14 +187,18 @@ static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
>>>  {
>>> struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask);
>>>  
>>> -   if (mp) {
>>> -   mp->lid = 0;
>>> -   mp->lsn = 0;
>>> -   mp->data = NULL;
>>> -   mp->clsn = 0;
>>> -   mp->log = NULL;
>>> -   init_waitqueue_head(&mp->wait);
>>> +   if (!mp) {
>>> +   jfs_err("mempool_alloc failed!\n");
>>> +   return NULL;
>>> }
>>> +
>>> +   mp->lid = 0;
>>> +   mp->lsn = 0;
>>> +   mp->data = NULL;
>>> +   mp->clsn = 0;
>>> +   mp->log = NULL;
>>> +   init_waitqueue_head(&mp->wait);
>>> +
>>> return mp;
>>>  }
>>>  
>>> @@ -663,6 +667,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
>>> } else {
>>> INCREMENT(mpStat.pagealloc);
>>> mp = alloc_metapage(GFP_NOFS);
>>> +   if (!mp)
>>> +   goto unlock;
>>> mp->page = page;
>>> mp->sb = inode->i_sb;
>>> mp->flag = 0;
>>>



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] jfs: Add missing NULL pointer check in __get_metapage

2017-10-25 Thread Juerg Haefliger
Is this a patch you might consider?

Thanks
...Juerg


On 10/04/2017 10:24 AM, Juerg Haefliger wrote:
> alloc_metapage can return a NULL pointer so check for that. And also emit
> an error message if that happens.
> 
> Signed-off-by: Juerg Haefliger <juerg.haefli...@canonical.com>
> ---
>  fs/jfs/jfs_metapage.c | 20 +---
>  1 file changed, 13 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
> index 1c4b9ad4d7ab..00f21af66872 100644
> --- a/fs/jfs/jfs_metapage.c
> +++ b/fs/jfs/jfs_metapage.c
> @@ -187,14 +187,18 @@ static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
>  {
>   struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask);
>  
> - if (mp) {
> - mp->lid = 0;
> - mp->lsn = 0;
> - mp->data = NULL;
> - mp->clsn = 0;
> - mp->log = NULL;
> - init_waitqueue_head(&mp->wait);
> + if (!mp) {
> + jfs_err("mempool_alloc failed!\n");
> + return NULL;
>   }
> +
> + mp->lid = 0;
> + mp->lsn = 0;
> + mp->data = NULL;
> + mp->clsn = 0;
> + mp->log = NULL;
> + init_waitqueue_head(&mp->wait);
> +
>   return mp;
>  }
>  
> @@ -663,6 +667,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
>   } else {
>   INCREMENT(mpStat.pagealloc);
>   mp = alloc_metapage(GFP_NOFS);
> + if (!mp)
> + goto unlock;
>   mp->page = page;
>   mp->sb = inode->i_sb;
>   mp->flag = 0;
> 



signature.asc
Description: OpenPGP digital signature


[PATCH] jfs: Add missing NULL pointer check in __get_metapage

2017-10-04 Thread Juerg Haefliger
alloc_metapage can return a NULL pointer so check for that. And also emit
an error message if that happens.

Signed-off-by: Juerg Haefliger <juerg.haefli...@canonical.com>
---
 fs/jfs/jfs_metapage.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 1c4b9ad4d7ab..00f21af66872 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -187,14 +187,18 @@ static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
 {
struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask);
 
-   if (mp) {
-   mp->lid = 0;
-   mp->lsn = 0;
-   mp->data = NULL;
-   mp->clsn = 0;
-   mp->log = NULL;
-   init_waitqueue_head(&mp->wait);
+   if (!mp) {
+   jfs_err("mempool_alloc failed!\n");
+   return NULL;
}
+
+   mp->lid = 0;
+   mp->lsn = 0;
+   mp->data = NULL;
+   mp->clsn = 0;
+   mp->log = NULL;
+   init_waitqueue_head(&mp->wait);
+
return mp;
 }
 
@@ -663,6 +667,8 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
} else {
INCREMENT(mpStat.pagealloc);
mp = alloc_metapage(GFP_NOFS);
+   if (!mp)
+   goto unlock;
mp->page = page;
mp->sb = inode->i_sb;
mp->flag = 0;
-- 
2.14.1



Re: [PATCH v6 00/11] Add support for eXclusive Page Frame Ownership

2017-09-12 Thread Juerg Haefliger


On 09/12/2017 09:07 AM, Yisheng Xie wrote:
> Hi Tycho,
> 
> On 2017/9/11 23:02, Tycho Andersen wrote:
>> Hi Yisheng,
>>
>> On Mon, Sep 11, 2017 at 06:34:45PM +0800, Yisheng Xie wrote:
>>> Hi Tycho ,
>>>
>>> On 2017/9/8 1:35, Tycho Andersen wrote:
 Hi all,

 Here is v6 of the XPFO set; see v5 discussion here:
 https://lkml.org/lkml/2017/8/9/803

 Changelogs are in the individual patch notes, but the highlights are:
 * add primitives for ensuring memory areas are mapped (although these are
   quite ugly, using stack allocation; I'm open to better suggestions)
 * instead of not flushing caches, re-map pages using the above
 * TLB flushing is much more correct (i.e. we're always flushing everything
   everywhere). I suspect we may be able to back this off in some cases,
   but I'm still trying to collect performance numbers to prove this is
   worth doing.

 I have no TODOs left for this set myself, other than fixing whatever review
 feedback people have. Thoughts and testing welcome!
>>>
>>> According to the paper of Vasileios P. Kemerlis et al, the mainline kernel
>>> will not set the protection of physmap (the direct map area) to RW(X), so
>>> do we really need XPFO to protect from ret2dir attacks?
>>
>> I guess you're talking about section 4.3? 
> Yes
> 
>> They mention that x86
>> only gets rw, but that aarch64 is still rwx.
> IIRC, in kernel v4.13 aarch64 is not rwx; I will check it.
> 
>>
>> But in either case this still provides access protection, similar to
>> SMAP. Also, if I understand things correctly the protections are
>> unmanaged, so a page that had the +x bit set at some point, it could
>> be used for ret2dir.
> So you mean that the protection of the direct map area may be changed to
> +x, and then a ret2dir attack can use it?

XPFO protects against malicious reads from userspace (potentially
accessing sensitive data). I've also been told by a security expert that
ROP attacks are still possible even if user space memory is
non-executable. XPFO is supposed to prevent that but I haven't been able
to confirm this. It's way out of my comfort zone.

...Juerg


> Thanks
> Yisheng Xie
> 
> 
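
The physmap aliasing that XPFO removes can be pictured with a short
conceptual sketch (illustrative, not code from the patch set):

/* Every user page has a second, kernel-side address: its alias in the
 * direct map (physmap). ret2dir abuses that alias. */
static void physmap_alias_example(struct page *page)
{
        void *kaddr = page_address(page);       /* direct-map alias */

        /* Without XPFO this read succeeds from kernel context (and on
         * arches with an executable direct map the alias could even be
         * jumped to). With XPFO the alias is unmapped while userspace
         * owns the page, so the access faults instead. */
        unsigned long leaked = *(unsigned long *)kaddr;

        (void)leaked;
}
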


Re: [PATCH v6 03/11] mm, x86: Add support for eXclusive Page Frame Ownership (XPFO)

2017-09-11 Thread Juerg Haefliger


On 09/11/2017 04:50 PM, Tycho Andersen wrote:
> Hi Yisheng,
> 
> On Mon, Sep 11, 2017 at 03:24:09PM +0800, Yisheng Xie wrote:
>>> +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
>>> +{
>>> +   int i, flush_tlb = 0;
>>> +   struct xpfo *xpfo;
>>> +
>>> +   if (!static_branch_unlikely(&xpfo_inited))
>>> +   return;
>>> +
>>> +   for (i = 0; i < (1 << order); i++)  {
>>> +   xpfo = lookup_xpfo(page + i);
>>> +   if (!xpfo)
>>> +   continue;
>>> +
>>> +   WARN(test_bit(XPFO_PAGE_UNMAPPED, &xpfo->flags),
>>> +"xpfo: unmapped page being allocated\n");
>>> +
>>> +   /* Initialize the map lock and map counter */
>>> +   if (unlikely(!xpfo->inited)) {
>>> +   spin_lock_init(&xpfo->maplock);
>>> +   atomic_set(&xpfo->mapcount, 0);
>>> +   xpfo->inited = true;
>>> +   }
>>> +   WARN(atomic_read(&xpfo->mapcount),
>>> +"xpfo: already mapped page being allocated\n");
>>> +
>>> +   if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) {
>>> +   /*
>>> +* Tag the page as a user page and flush the TLB if it
>>> +* was previously allocated to the kernel.
>>> +*/
>>> +   if (!test_and_set_bit(XPFO_PAGE_USER, &xpfo->flags))
>>> +   flush_tlb = 1;
>>
>> I'm not sure whether I am missing anything; however, when the page was
>> previously allocated to the kernel, should we unmap it from the physmap
>> (the kernel's page table) here? For we now allocate the page to the user.
>> 
> Yes, I think you're right. Oddly, the XPFO_READ_USER test works
> correctly for me, but I think (?) should not because of this bug...

IIRC, this is an optimization carried forward from the initial
implementation. The assumption is that the kernel will map the user
buffer so it's not unmapped on allocation but only on the first (and
subsequent) call of kunmap. I.e.:
 - alloc  -> noop
 - kmap   -> noop
 - kunmap -> unmapped from the kernel
 - kmap   -> mapped into the kernel
 - kunmap -> unmapped from the kernel
and so on until:
 - free   -> mapped back into the kernel

I'm not sure if that makes sense, though, since it leaves a window (see
the sketch below).

...Juerg



> Tycho
> 
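
A sketch of the accounting described in that sequence (it follows the
shape of the v6 patch but is illustrative, not copied from it; the
cross-CPU flush shown is one possible choice):

/* Map on the first concurrent kmap, unmap on the last kunmap. */
static void xpfo_kmap_sketch(void *kaddr, struct page *page, struct xpfo *xpfo)
{
        spin_lock(&xpfo->maplock);
        if (atomic_inc_return(&xpfo->mapcount) == 1 &&
            test_and_clear_bit(XPFO_PAGE_UNMAPPED, &xpfo->flags))
                set_kpte(page, (unsigned long)kaddr, PAGE_KERNEL);
        spin_unlock(&xpfo->maplock);
}

static void xpfo_kunmap_sketch(void *kaddr, struct page *page, struct xpfo *xpfo)
{
        spin_lock(&xpfo->maplock);
        if (atomic_dec_return(&xpfo->mapcount) == 0) {
                set_bit(XPFO_PAGE_UNMAPPED, &xpfo->flags);
                set_kpte(page, (unsigned long)kaddr, __pgprot(0));
                flush_tlb_kernel_range((unsigned long)kaddr,
                                       (unsigned long)kaddr + PAGE_SIZE);
        }
        spin_unlock(&xpfo->maplock);
}
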


Re: [kernel-hardening] [PATCH v5 04/10] arm64: Add __flush_tlb_one()

2017-08-31 Thread Juerg Haefliger
On 08/30/2017 06:47 PM, Tycho Andersen wrote:
> On Wed, Aug 30, 2017 at 07:31:25AM +0200, Juerg Haefliger wrote:
>>
>>
>> On 08/23/2017 07:04 PM, Mark Rutland wrote:
>>> On Wed, Aug 23, 2017 at 10:58:42AM -0600, Tycho Andersen wrote:
>>>> Hi Mark,
>>>>
>>>> On Mon, Aug 14, 2017 at 05:50:47PM +0100, Mark Rutland wrote:
>>>>> That said, is there any reason not to use flush_tlb_kernel_range()
>>>>> directly?
>>>>
>>>> So it turns out that there is a difference between __flush_tlb_one() and
>>>> flush_tlb_kernel_range() on x86: flush_tlb_kernel_range() flushes all
>>>> the TLBs via on_each_cpu(), whereas __flush_tlb_one() only flushes the
>>>> local TLB (which I think is enough here).
>>>
>>> That sounds suspicious; I don't think that __flush_tlb_one() is
>>> sufficient.
>>>
>>> If you only do local TLB maintenance, then the page is left accessible
>>> to other CPUs via the (stale) kernel mappings. i.e. the page isn't
>>> exclusively mapped by userspace.
>>
>> We flush all CPUs to get rid of stale entries when a new page is
>> allocated to userspace that was previously allocated to the kernel.
>> Is that the scenario you were thinking of?
> 
> I think there are two cases, the one you describe above, where the
> pages are first allocated, and a second one, where e.g. the pages are
> mapped into the kernel because of DMA or whatever. In the case you
> describe above, I think we're doing the right thing (which is why my
> test worked correctly, because it tested this case).
> 
> In the second case, when the pages are unmapped (i.e. the kernel is
> done doing DMA), do we need to flush the other CPUs TLBs? I think the
> current code is not quite correct, because if multiple tasks (CPUs)
> map the pages, only the TLB of the last one is flushed when the
> mapping is cleared, because the tlb is only flushed when ->mapcount
> drops to zero, leaving stale entries in the other TLBs. It's not clear
> to me what to do about this case.

For this to happen, multiple CPUs need to have the same userspace page
mapped at the same time. Is this a valid scenario?

...Juerg


> Thoughts?
> 
> Tycho
> 
>> ...Juerg
>>
>>
>>> Thanks,
>>> Mark.
>>>
>>
> 
> 
> 



signature.asc
Description: OpenPGP digital signature
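
For reference, the x86 distinction discussed in this thread boils down to
roughly the following (a simplified sketch of that era's behaviour, not
the actual implementation):

/* __flush_tlb_one(addr): a single 'invlpg' on the local CPU only. */
static void local_flush_sketch(unsigned long addr)
{
        asm volatile("invlpg (%0)" : : "r" (addr) : "memory");
}

/* flush_tlb_kernel_range(): reaches every CPU via IPIs. */
static void do_flush(void *unused)
{
        __flush_tlb_all();
}

static void global_flush_sketch(void)
{
        on_each_cpu(do_flush, NULL, 1);  /* 1 == wait for completion */
}
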


Re: [kernel-hardening] [PATCH v5 04/10] arm64: Add __flush_tlb_one()

2017-08-29 Thread Juerg Haefliger


On 08/23/2017 07:04 PM, Mark Rutland wrote:
> On Wed, Aug 23, 2017 at 10:58:42AM -0600, Tycho Andersen wrote:
>> Hi Mark,
>>
>> On Mon, Aug 14, 2017 at 05:50:47PM +0100, Mark Rutland wrote:
>>> That said, is there any reason not to use flush_tlb_kernel_range()
>>> directly?
>>
>> So it turns out that there is a difference between __flush_tlb_one() and
>> flush_tlb_kernel_range() on x86: flush_tlb_kernel_range() flushes all
>> the TLBs via on_each_cpu(), whereas __flush_tlb_one() only flushes the
>> local TLB (which I think is enough here).
> 
> That sounds suspicious; I don't think that __flush_tlb_one() is
> sufficient.
> 
> If you only do local TLB maintenance, then the page is left accessible
> to other CPUs via the (stale) kernel mappings. i.e. the page isn't
> exclusively mapped by userspace.

We flush all CPUs to get rid of stale entries when a new page is
allocated to userspace that was previously allocated to the kernel.
Is that the scenario you were thinking of?

...Juerg


> Thanks,
> Mark.
> 



signature.asc
Description: OpenPGP digital signature


[PATCH] lkdtm: Fix Oops when unloading the module

2017-01-19 Thread Juerg Haefliger
No jprobe is registered when the module is loaded without specifying a
crashpoint that uses a jprobe. At the moment, we unconditionally try to
unregister the jprobe on module unload, which results in an Oops. Add a
check to fix this.

Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 drivers/misc/lkdtm_core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c
index 7eeb71a75549..4d44084071d8 100644
--- a/drivers/misc/lkdtm_core.c
+++ b/drivers/misc/lkdtm_core.c
@@ -535,7 +535,9 @@ static void __exit lkdtm_module_exit(void)
/* Handle test-specific clean-up. */
lkdtm_usercopy_exit();
 
-   unregister_jprobe(lkdtm_jprobe);
+   if (lkdtm_jprobe != NULL)
+   unregister_jprobe(lkdtm_jprobe);
+
pr_info("Crash point unregistered\n");
 }
 
-- 
2.11.0



Re: [RFC PATCH v3 1/2] Add support for eXclusive Page Frame Ownership (XPFO)

2016-11-28 Thread Juerg Haefliger
On 11/24/2016 11:56 AM, AKASHI Takahiro wrote:
> Hi,
> 
> I'm trying to give it a spin on arm64, but ...

Thanks for trying this.


>> +/*
>> + * Update a single kernel page table entry
>> + */
>> +static inline void set_kpte(struct page *page, unsigned long kaddr,
>> +pgprot_t prot)
>> +{
>> +unsigned int level;
>> +pte_t *kpte = lookup_address(kaddr, &level);
>> +
>> +/* We only support 4k pages for now */
>> +BUG_ON(!kpte || level != PG_LEVEL_4K);
>> +
>> +set_pte_atomic(kpte, pfn_pte(page_to_pfn(page), canon_pgprot(prot)));
>> +}
> 
> As lookup_address() and set_pte_atomic() (and PG_LEVEL_4K), are arch-specific,
> would it be better to put the whole definition into arch-specific part?

Well, yes, but I haven't really looked into splitting up the arch-specific stuff.


>> +/*
>> + * Map the page back into the kernel if it was previously
>> + * allocated to user space.
>> + */
>> +if (test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED,
>> +   &page_ext->flags)) {
>> +kaddr = (unsigned long)page_address(page + i);
>> +set_kpte(page + i,  kaddr, __pgprot(__PAGE_KERNEL));
> 
> Why not PAGE_KERNEL?

Good catch, thanks!


>> +/*
>> + * The page is to be allocated back to user space, so unmap it from the
>> + * kernel, flush the TLB and tag it as a user page.
>> + */
>> +if (atomic_dec_return(&page_ext->mapcount) == 0) {
>> +BUG_ON(test_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags));
>> +set_bit(PAGE_EXT_XPFO_UNMAPPED, &page_ext->flags);
>> +set_kpte(page, (unsigned long)kaddr, __pgprot(0));
>> +__flush_tlb_one((unsigned long)kaddr);
> 
> Again __flush_tlb_one() is x86-specific.
> flush_tlb_kernel_range() instead?

I'll take a look. If you can tell me what the relevant arm64 equivalents
are for the arch-specific functions, that would help tremendously.

Thanks for the comments!

...Juerg



> Thanks,
> -Takahiro AKASHI


-- 
Juerg Haefliger
Hewlett Packard Enterprise



signature.asc
Description: OpenPGP digital signature
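
One arch-agnostic shape for the unmap-plus-flush step, along the lines
Takahiro suggests (a sketch, assuming a per-arch set_kpte() as in the
patch; flush_tlb_kernel_range() exists on both x86 and arm64):

static void xpfo_unmap_sketch(struct page *page, unsigned long kaddr)
{
        /* Remove the kernel mapping via the per-arch helper... */
        set_kpte(page, kaddr, __pgprot(0));
        /* ...and flush the stale entry on all CPUs. */
        flush_tlb_kernel_range(kaddr, kaddr + PAGE_SIZE);
}
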


Re: [RFC PATCH v3 1/2] Add support for eXclusive Page Frame Ownership (XPFO)

2016-11-15 Thread Juerg Haefliger
On 11/10/2016 08:24 PM, Kees Cook wrote:
> On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefli...@hpe.com> 
> wrote:
>> This patch adds support for XPFO which protects against 'ret2dir' kernel
>> attacks. The basic idea is to enforce exclusive ownership of page frames
>> by either the kernel or userspace, unless explicitly requested by the
>> kernel. Whenever a page destined for userspace is allocated, it is
>> unmapped from physmap (the kernel's page table). When such a page is
>> reclaimed from userspace, it is mapped back to physmap.
>>
>> Additional fields in the page_ext struct are used for XPFO housekeeping.
>> Specifically two flags to distinguish user vs. kernel pages and to tag
>> unmapped pages and a reference counter to balance kmap/kunmap operations
>> and a lock to serialize access to the XPFO fields.
>>
>> Known issues/limitations:
>>   - Only supports x86-64 (for now)
>>   - Only supports 4k pages (for now)
>>   - There are most likely some legitimate use cases where the kernel needs
>> to access userspace which need to be made XPFO-aware
>>   - Performance penalty
>>
>> Reference paper by the original patch authors:
>>   http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf
> 
> Would it be possible to create an lkdtm test that can exercise this 
> protection?

I'll look into it.


>> diff --git a/security/Kconfig b/security/Kconfig
>> index 118f4549404e..4502e15c8419 100644
>> --- a/security/Kconfig
>> +++ b/security/Kconfig
>> @@ -6,6 +6,25 @@ menu "Security options"
>>
>>  source security/keys/Kconfig
>>
>> +config ARCH_SUPPORTS_XPFO
>> +   bool
> 
> Can you include a "help" section here to describe what requirements an
> architecture needs to support XPFO? See HAVE_ARCH_SECCOMP_FILTER and
> HAVE_ARCH_VMAP_STACK for some examples.

Will do.


>> +config XPFO
>> +   bool "Enable eXclusive Page Frame Ownership (XPFO)"
>> +   default n
>> +   depends on ARCH_SUPPORTS_XPFO
>> +   select PAGE_EXTENSION
>> +   help
>> + This option offers protection against 'ret2dir' kernel attacks.
>> + When enabled, every time a page frame is allocated to user space, it
>> + is unmapped from the direct mapped RAM region in kernel space
>> + (physmap). Similarly, when a page frame is freed/reclaimed, it is
>> + mapped back to physmap.
>> +
>> + There is a slight performance impact when this option is enabled.
>> +
>> + If in doubt, say "N".
>> +
>>  config SECURITY_DMESG_RESTRICT
>> bool "Restrict unprivileged access to the kernel syslog"
>> default n
> 
> I've added these patches to my kspp tree on kernel.org, so it should
> get some 0-day testing now...

Very good. Thanks!


> Thanks!

Appreciate the feedback.

...Juerg


> -Kees
> 




signature.asc
Description: OpenPGP digital signature
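
One possible shape for that help text (illustrative wording only, not
taken from a later revision of the series):

config ARCH_SUPPORTS_XPFO
	bool
	help
	  An architecture should select this when it can provide the
	  XPFO primitives: a set_kpte() implementation that can change
	  the protection of a single 4k kernel mapping, and a way to
	  flush stale kernel TLB entries on all CPUs (for example,
	  flush_tlb_kernel_range()).
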


Re: [RFC PATCH v3 1/2] Add support for eXclusive Page Frame Ownership (XPFO)

2016-11-15 Thread Juerg Haefliger
Sorry for the late reply, I just found your email in my cluttered inbox.

On 11/10/2016 08:11 PM, Kees Cook wrote:
> On Fri, Nov 4, 2016 at 7:45 AM, Juerg Haefliger <juerg.haefli...@hpe.com> 
> wrote:
>> This patch adds support for XPFO which protects against 'ret2dir' kernel
>> attacks. The basic idea is to enforce exclusive ownership of page frames
>> by either the kernel or userspace, unless explicitly requested by the
>> kernel. Whenever a page destined for userspace is allocated, it is
>> unmapped from physmap (the kernel's page table). When such a page is
>> reclaimed from userspace, it is mapped back to physmap.
>>
>> Additional fields in the page_ext struct are used for XPFO housekeeping.
>> Specifically two flags to distinguish user vs. kernel pages and to tag
>> unmapped pages and a reference counter to balance kmap/kunmap operations
>> and a lock to serialize access to the XPFO fields.
> 
> Thanks for keeping on this! I'd really like to see it land and then
> get more architectures to support it.

Good to hear :-)


>> Known issues/limitations:
>>   - Only supports x86-64 (for now)
>>   - Only supports 4k pages (for now)
>>   - There are most likely some legitimate use cases where the kernel needs
>> to access userspace which need to be made XPFO-aware
>>   - Performance penalty
> 
> In the Kconfig you say "slight", but I'm curious what kinds of
> benchmarks you've done and if there's a more specific cost we can
> declare, just to give people more of an idea what the hit looks like?
> (What workloads would trigger a lot of XPFO unmapping, for example?)

That 'slight' wording is based on the performance numbers published in the 
referenced paper.

So far I've only run kernel compilation tests. For that workload, the big
performance hit comes from disabling >4k page sizes (around 10%). Adding
XPFO on top causes 'only' another 0.5% performance penalty. I'm currently
looking into adding support for larger page sizes to see what the real
impact is and then generate some more relevant numbers.

...Juerg


> Thanks!
> 
> -Kees
> 




signature.asc
Description: OpenPGP digital signature


[RFC PATCH v3 2/2] xpfo: Only put previous userspace pages into the hot cache

2016-11-04 Thread Juerg Haefliger
Allocating a page to userspace that was previously allocated to the
kernel requires an expensive TLB shootdown. To minimize this, we only
put non-kernel pages into the hot cache to favor their allocation.

Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 include/linux/xpfo.h | 2 ++
 mm/page_alloc.c  | 8 +++-
 mm/xpfo.c| 8 
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
index 77187578ca33..077d1cfadfa2 100644
--- a/include/linux/xpfo.h
+++ b/include/linux/xpfo.h
@@ -24,6 +24,7 @@ extern void xpfo_alloc_page(struct page *page, int order, gfp_t gfp);
 extern void xpfo_free_page(struct page *page, int order);
 
 extern bool xpfo_page_is_unmapped(struct page *page);
+extern bool xpfo_page_is_kernel(struct page *page);
 
 #else /* !CONFIG_XPFO */
 
@@ -33,6 +34,7 @@ static inline void xpfo_alloc_page(struct page *page, int order, gfp_t gfp) { }
 static inline void xpfo_free_page(struct page *page, int order) { }
 
 static inline bool xpfo_page_is_unmapped(struct page *page) { return false; }
+static inline bool xpfo_page_is_kernel(struct page *page) { return false; }
 
 #endif /* CONFIG_XPFO */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 100e80e008e2..09ef4f7cfd14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2440,7 +2440,13 @@ void free_hot_cold_page(struct page *page, bool cold)
}
 
pcp = &this_cpu_ptr(zone->pageset)->pcp;
-   if (!cold)
+   /*
+* XPFO: Allocating a page to userspace that was previously allocated
+* to the kernel requires an expensive TLB shootdown. To minimize this,
+* we only put non-kernel pages into the hot cache to favor their
+* allocation.
+*/
+   if (!cold && !xpfo_page_is_kernel(page))
list_add(&page->lru, &pcp->lists[migratetype]);
else
list_add_tail(&page->lru, &pcp->lists[migratetype]);
diff --git a/mm/xpfo.c b/mm/xpfo.c
index 8e3a6a694b6a..0e447e38008a 100644
--- a/mm/xpfo.c
+++ b/mm/xpfo.c
@@ -204,3 +204,11 @@ inline bool xpfo_page_is_unmapped(struct page *page)
return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags);
 }
 EXPORT_SYMBOL(xpfo_page_is_unmapped);
+
+inline bool xpfo_page_is_kernel(struct page *page)
+{
+   if (!static_branch_unlikely(&xpfo_inited))
+   return false;
+
+   return test_bit(PAGE_EXT_XPFO_KERNEL, &lookup_page_ext(page)->flags);
+}
-- 
2.10.1



[RFC PATCH v3 0/2] Add support for eXclusive Page Frame Ownership (XPFO)

2016-11-04 Thread Juerg Haefliger
Changes from:
  v2 -> v3:
- Removed 'depends on DEBUG_KERNEL' and 'select DEBUG_TLBFLUSH'.
  These are left-overs from the original patch and are not required.
- Make libata XPFO-aware, i.e., properly handle pages that were
  unmapped by XPFO. This takes care of the temporary hack in v2 that
  forced the use of a bounce buffer in block/blk-map.c.
  v1 -> v2:
- Moved the code from arch/x86/mm/ to mm/ since it's (mostly)
  arch-agnostic.
- Moved the config to the generic layer and added ARCH_SUPPORTS_XPFO
  for x86.
- Use page_ext for the additional per-page data.
- Removed the clearing of pages. This can be accomplished by using
  PAGE_POISONING.
- Split up the patch into multiple patches.
- Fixed additional issues identified by reviewers.

This patch series adds support for XPFO which protects against 'ret2dir'
kernel attacks. The basic idea is to enforce exclusive ownership of page
frames by either the kernel or userspace, unless explicitly requested by
the kernel. Whenever a page destined for userspace is allocated, it is
unmapped from physmap (removed from the kernel's page table). When such a
page is reclaimed from userspace, it is mapped back to physmap.
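
A rough sketch of that flow, built around the xpfo_alloc_page() and
xpfo_free_page() entry points declared in the xpfo.h hunks of this thread.
The GFP test follows the author's explanation later in the thread; the two
physmap helpers are placeholders, not the series' real internals:

    #include <linux/gfp.h>
    #include <linux/mm.h>

    /* On allocation: a page destined for userspace loses its physmap alias. */
    void xpfo_alloc_page(struct page *page, int order, gfp_t gfp)
    {
            if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER)
                    xpfo_unmap_from_physmap(page, order);  /* placeholder */
    }

    /* On free: a previously unmapped page is mapped back for the kernel. */
    void xpfo_free_page(struct page *page, int order)
    {
            if (xpfo_page_is_unmapped(page))
                    xpfo_map_to_physmap(page, order);      /* placeholder */
    }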

Additional fields in the page_ext struct are used for XPFO housekeeping.
Specifically, two flags to distinguish user vs. kernel pages and to tag
unmapped pages, a reference counter to balance kmap/kunmap operations,
and a lock to serialize access to the XPFO fields.
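
Pieced together, those fields look roughly like the sketch below. The flag
names and the 'inited' field appear verbatim in the page_ext.h hunks later in
this thread; the counter and lock names are assumptions, since the archive
truncates the struct mid-field:

    #include <linux/atomic.h>
    #include <linux/spinlock.h>

    enum xpfo_page_ext_flags {
            PAGE_EXT_XPFO_KERNEL,   /* Page is a kernel page */
            PAGE_EXT_XPFO_UNMAPPED, /* Page is unmapped */
    };

    struct xpfo_page_ext_fields {
            int inited;         /* Map counter and lock initialized */
            atomic_t mapcount;  /* Balances kmap/kunmap (name assumed) */
            spinlock_t maplock; /* Serializes the XPFO fields (name assumed) */
    };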

Known issues/limitations:
  - Only supports x86-64 (for now)
  - Only supports 4k pages (for now)
  - There are most likely some legitimate use cases where the kernel needs
to access userspace which need to be made XPFO-aware
  - Performance penalty

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Juerg Haefliger (2):
  Add support for eXclusive Page Frame Ownership (XPFO)
  xpfo: Only put previous userspace pages into the hot cache

 arch/x86/Kconfig |   3 +-
 arch/x86/mm/init.c   |   2 +-
 drivers/ata/libata-sff.c |   4 +-
 include/linux/highmem.h  |  15 +++-
 include/linux/page_ext.h |   7 ++
 include/linux/xpfo.h |  41 +
 lib/swiotlb.c|   3 +-
 mm/Makefile  |   1 +
 mm/page_alloc.c  |  10 ++-
 mm/page_ext.c|   4 +
 mm/xpfo.c| 214 +++
 security/Kconfig |  19 +
 12 files changed, 315 insertions(+), 8 deletions(-)
 create mode 100644 include/linux/xpfo.h
 create mode 100644 mm/xpfo.c

-- 
2.10.1



[RFC PATCH v3 1/2] Add support for eXclusive Page Frame Ownership (XPFO)

2016-11-04 Thread Juerg Haefliger
This patch adds support for XPFO which protects against 'ret2dir' kernel
attacks. The basic idea is to enforce exclusive ownership of page frames
by either the kernel or userspace, unless explicitly requested by the
kernel. Whenever a page destined for userspace is allocated, it is
unmapped from physmap (the kernel's page table). When such a page is
reclaimed from userspace, it is mapped back to physmap.

Additional fields in the page_ext struct are used for XPFO housekeeping.
Specifically, two flags to distinguish user vs. kernel pages and to tag
unmapped pages, a reference counter to balance kmap/kunmap operations,
and a lock to serialize access to the XPFO fields.

Known issues/limitations:
  - Only supports x86-64 (for now)
  - Only supports 4k pages (for now)
  - There are most likely some legitimate use cases where the kernel needs
to access userspace which need to be made XPFO-aware
  - Performance penalty

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Suggested-by: Vasileios P. Kemerlis <v...@cs.columbia.edu>
Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 arch/x86/Kconfig |   3 +-
 arch/x86/mm/init.c   |   2 +-
 drivers/ata/libata-sff.c |   4 +-
 include/linux/highmem.h  |  15 +++-
 include/linux/page_ext.h |   7 ++
 include/linux/xpfo.h |  39 +
 lib/swiotlb.c|   3 +-
 mm/Makefile  |   1 +
 mm/page_alloc.c  |   2 +
 mm/page_ext.c|   4 +
 mm/xpfo.c| 206 +++
 security/Kconfig |  19 +
 12 files changed, 298 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/xpfo.h
 create mode 100644 mm/xpfo.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bada636d1065..38b334f8fde5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -165,6 +165,7 @@ config X86
select HAVE_STACK_VALIDATION        if X86_64
select ARCH_USES_HIGH_VMA_FLAGS     if X86_INTEL_MEMORY_PROTECTION_KEYS
select ARCH_HAS_PKEYS               if X86_INTEL_MEMORY_PROTECTION_KEYS
+   select ARCH_SUPPORTS_XPFO   if X86_64
 
 config INSTRUCTION_DECODER
def_bool y
@@ -1361,7 +1362,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config X86_DIRECT_GBPAGES
def_bool y
-   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO
---help---
  Certain kernel features effectively disable kernel
  linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 22af912d66d2..a6fafbae02bb 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,7 +161,7 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
-#if !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO)
/*
 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
 * use small pages.
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 051b6158d1b7..58af734be25d 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -715,7 +715,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
 
DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
 
-   if (PageHighMem(page)) {
+   if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
unsigned long flags;
 
/* FIXME: use a bounce buffer */
@@ -860,7 +860,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, 
unsigned int bytes)
 
DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
 
-   if (PageHighMem(page)) {
+   if (PageHighMem(page) || xpfo_page_is_unmapped(page)) {
unsigned long flags;
 
/* FIXME: use bounce buffer */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index bb3f3297062a..7a17c166532f 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/xpfo.h>
 
 #include <asm/cacheflush.h>
 
@@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr)
 #ifndef ARCH_HAS_KMAP
 static inline void *kmap(struct page *page)
 {
+   void *kaddr;
+
might_sleep();
-   return page_address(page);
+   kaddr = page_address(page);
+   xpfo_kmap(kaddr, page);
+   return kaddr;
 }
 
 static inline void kunmap(struct page *page)
 {
+   xpfo_kunmap(page_address(page), page);
 }
 
 static inline void *kmap_atomic(struct page *page)
 {
+   void *kaddr;
+
preempt_disable();
pagefault_disable();
-   return page_address(page);
+   kaddr = page_address(page);
+   xpfo_kmap(kaddr, page);
+   return kaddr;
 }
#define kmap_atomic_prot(page, prot)   kmap_atomic(page)

Re: [kernel-hardening] [RFC PATCH v2 2/3] xpfo: Only put previous userspace pages into the hot cache

2016-09-20 Thread Juerg Haefliger
On 09/14/2016 04:48 PM, Dave Hansen wrote:
>> On 09/02/2016 10:39 PM, Dave Hansen wrote:
>>> On 09/02/2016 04:39 AM, Juerg Haefliger wrote:
>>> Does this
>>> just mean that kernel allocations usually have to pay the penalty to
>>> convert a page?
>>
>> Only pages that are allocated for userspace (gfp & GFP_HIGHUSER == 
>> GFP_HIGHUSER) which were
>> previously allocated for the kernel (gfp & GFP_HIGHUSER != GFP_HIGHUSER) 
>> have to pay the penalty.
>>
>>> So, what's the logic here?  You're assuming that order-0 kernel
>>> allocations are more rare than allocations for userspace?
>>
>> The logic is to put reclaimed kernel pages into the cold cache to
>> postpone their allocation as long as possible to minimize (potential)
>> TLB flushes.
> 
> OK, but if we put them in the cold area but kernel allocations pull them
> from the hot cache, aren't we virtually guaranteeing that kernel
> allocations will have to do TLB shootdown to convert a page?

No. Allocations for the kernel never require a TLB shootdown. Only
allocations for userspace do, and only if the page was previously a kernel
page.
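
In code terms (the GFP condition is quoted from the earlier reply in this
thread; the helper itself is illustrative and not part of the series):

    #include <linux/gfp.h>
    #include <linux/mm.h>

    /* A shootdown is needed only when a page last owned by the kernel is
     * about to be handed to userspace. */
    static bool xpfo_needs_tlb_flush(struct page *page, gfp_t gfp)
    {
            bool user_alloc = (gfp & GFP_HIGHUSER) == GFP_HIGHUSER;

            return user_alloc && xpfo_page_is_kernel(page);
    }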


> It seems like you also need to convert all kernel allocations to pull
> from the cold area.

Kernel allocations can continue to pull from the hot cache. Maybe introduce
another cache for the userspace pages? But I'm not sure what other
implications this might have.

...Juerg






Re: [kernel-hardening] [RFC PATCH v2 2/3] xpfo: Only put previous userspace pages into the hot cache

2016-09-14 Thread Juerg Haefliger
Hi Dave,

On 09/14/2016 04:33 PM, Dave Hansen wrote:
> On 09/14/2016 12:19 AM, Juerg Haefliger wrote:
>> Allocating a page to userspace that was previously allocated to the
>> kernel requires an expensive TLB shootdown. To minimize this, we only
>> put non-kernel pages into the hot cache to favor their allocation.
> 
> Hi, I had some questions about this the last time you posted it.  Maybe
> you want to address them now.

I did reply: https://lkml.org/lkml/2016/9/5/249

...Juerg


> --
> 
> But kernel allocations do allocate from these pools, right?  Does this
> just mean that kernel allocations usually have to pay the penalty to
> convert a page?
> 
> So, what's the logic here?  You're assuming that order-0 kernel
> allocations are more rare than allocations for userspace?
> 






Re: [RFC PATCH v2 0/3] Add support for eXclusive Page Frame Ownership (XPFO)

2016-09-14 Thread Juerg Haefliger
Resending to include the kernel-hardening list. Sorry, I wasn't subscribed
with the correct email address when I sent this the first time.

...Juerg

On 09/14/2016 09:18 AM, Juerg Haefliger wrote:
> Changes from:
>   v1 -> v2:
> - Moved the code from arch/x86/mm/ to mm/ since it's (mostly)
>   arch-agnostic.
> - Moved the config to the generic layer and added ARCH_SUPPORTS_XPFO
>   for x86.
> - Use page_ext for the additional per-page data.
> - Removed the clearing of pages. This can be accomplished by using
>   PAGE_POISONING.
> - Split up the patch into multiple patches.
> - Fixed additional issues identified by reviewers.
> 
> This patch series adds support for XPFO which protects against 'ret2dir'
> kernel attacks. The basic idea is to enforce exclusive ownership of page
> frames by either the kernel or userspace, unless explicitly requested by
> the kernel. Whenever a page destined for userspace is allocated, it is
> unmapped from physmap (the kernel's page table). When such a page is
> reclaimed from userspace, it is mapped back to physmap.
> 
> Additional fields in the page_ext struct are used for XPFO housekeeping.
> Specifically, two flags to distinguish user vs. kernel pages and to tag
> unmapped pages, a reference counter to balance kmap/kunmap operations,
> and a lock to serialize access to the XPFO fields.
> 
> Known issues/limitations:
>   - Only supports x86-64 (for now)
>   - Only supports 4k pages (for now)
>   - There are most likely some legitimate use cases where the kernel needs
> to access userspace which need to be made XPFO-aware
>   - Performance penalty
> 
> Reference paper by the original patch authors:
>   http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf
> 
> Juerg Haefliger (3):
>   Add support for eXclusive Page Frame Ownership (XPFO)
>   xpfo: Only put previous userspace pages into the hot cache
>   block: Always use a bounce buffer when XPFO is enabled
> 
>  arch/x86/Kconfig |   3 +-
>  arch/x86/mm/init.c   |   2 +-
>  block/blk-map.c  |   2 +-
>  include/linux/highmem.h  |  15 +++-
>  include/linux/page_ext.h |   7 ++
>  include/linux/xpfo.h |  41 +
>  lib/swiotlb.c|   3 +-
>  mm/Makefile  |   1 +
>  mm/page_alloc.c  |  10 ++-
>  mm/page_ext.c|   4 +
>  mm/xpfo.c| 213 +++
>  security/Kconfig |  20 +
>  12 files changed, 314 insertions(+), 7 deletions(-)
>  create mode 100644 include/linux/xpfo.h
>  create mode 100644 mm/xpfo.c
> 


-- 
Juerg Haefliger
Hewlett Packard Enterprise





[RFC PATCH v2 1/3] Add support for eXclusive Page Frame Ownership (XPFO)

2016-09-14 Thread Juerg Haefliger
This patch adds support for XPFO which protects against 'ret2dir' kernel
attacks. The basic idea is to enforce exclusive ownership of page frames
by either the kernel or userspace, unless explicitly requested by the
kernel. Whenever a page destined for userspace is allocated, it is
unmapped from physmap (the kernel's page table). When such a page is
reclaimed from userspace, it is mapped back to physmap.

Additional fields in the page_ext struct are used for XPFO housekeeping.
Specifically, two flags to distinguish user vs. kernel pages and to tag
unmapped pages, a reference counter to balance kmap/kunmap operations,
and a lock to serialize access to the XPFO fields.

Known issues/limitations:
  - Only supports x86-64 (for now)
  - Only supports 4k pages (for now)
  - There are most likely some legitimate use cases where the kernel needs
to access userspace which need to be made XPFO-aware
  - Performance penalty

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Suggested-by: Vasileios P. Kemerlis <v...@cs.columbia.edu>
Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 arch/x86/Kconfig |   3 +-
 arch/x86/mm/init.c   |   2 +-
 include/linux/highmem.h  |  15 +++-
 include/linux/page_ext.h |   7 ++
 include/linux/xpfo.h |  39 +
 lib/swiotlb.c|   3 +-
 mm/Makefile  |   1 +
 mm/page_alloc.c  |   2 +
 mm/page_ext.c|   4 +
 mm/xpfo.c| 205 +++
 security/Kconfig |  20 +
 11 files changed, 296 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/xpfo.h
 create mode 100644 mm/xpfo.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c580d8c33562..dc5604a710c6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -165,6 +165,7 @@ config X86
select HAVE_STACK_VALIDATION        if X86_64
select ARCH_USES_HIGH_VMA_FLAGS     if X86_INTEL_MEMORY_PROTECTION_KEYS
select ARCH_HAS_PKEYS               if X86_INTEL_MEMORY_PROTECTION_KEYS
+   select ARCH_SUPPORTS_XPFO   if X86_64
 
 config INSTRUCTION_DECODER
def_bool y
@@ -1350,7 +1351,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config X86_DIRECT_GBPAGES
def_bool y
-   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO
---help---
  Certain kernel features effectively disable kernel
  linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d28a2d741f9e..426427b54639 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,7 +161,7 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
-#if !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO)
/*
 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
 * use small pages.
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index bb3f3297062a..7a17c166532f 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/xpfo.h>
 
 #include <asm/cacheflush.h>
 
@@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr)
 #ifndef ARCH_HAS_KMAP
 static inline void *kmap(struct page *page)
 {
+   void *kaddr;
+
might_sleep();
-   return page_address(page);
+   kaddr = page_address(page);
+   xpfo_kmap(kaddr, page);
+   return kaddr;
 }
 
 static inline void kunmap(struct page *page)
 {
+   xpfo_kunmap(page_address(page), page);
 }
 
 static inline void *kmap_atomic(struct page *page)
 {
+   void *kaddr;
+
preempt_disable();
pagefault_disable();
-   return page_address(page);
+   kaddr = page_address(page);
+   xpfo_kmap(kaddr, page);
+   return kaddr;
 }
 #define kmap_atomic_prot(page, prot)   kmap_atomic(page)
 
 static inline void __kunmap_atomic(void *addr)
 {
+   xpfo_kunmap(addr, virt_to_page(addr));
pagefault_enable();
preempt_enable();
 }
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 03f2a3e7d76d..fdf63dcc399e 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -27,6 +27,8 @@ enum page_ext_flags {
PAGE_EXT_DEBUG_POISON,  /* Page is poisoned */
PAGE_EXT_DEBUG_GUARD,
PAGE_EXT_OWNER,
+   PAGE_EXT_XPFO_KERNEL,   /* Page is a kernel page */
+   PAGE_EXT_XPFO_UNMAPPED, /* Page is unmapped */
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
@@ -48,6 +50,11 @@ struct page_ext {
int last_migrate_reason;
depot_stack_handle_t handle;
 #endif
+#ifdef CONFIG_XPFO
+   int inited; /* Map counter and lock initialized */
+   atomic_t ma

[RFC PATCH v2 3/3] block: Always use a bounce buffer when XPFO is enabled

2016-09-14 Thread Juerg Haefliger
This is a temporary hack to prevent the use of bio_map_user_iov()
which causes XPFO page faults.

Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 block/blk-map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index b8657fa8dc9a..e889dbfee6fb 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -52,7 +52,7 @@ static int __blk_rq_map_user_iov(struct request *rq,
struct bio *bio, *orig_bio;
int ret;
 
-   if (copy)
+   if (copy || IS_ENABLED(CONFIG_XPFO))
bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
else
bio = bio_map_user_iov(q, iter, gfp_mask);
-- 
2.9.3



[RFC PATCH v2 2/3] xpfo: Only put previous userspace pages into the hot cache

2016-09-14 Thread Juerg Haefliger
Allocating a page to userspace that was previously allocated to the
kernel requires an expensive TLB shootdown. To minimize this, we only
put non-kernel pages into the hot cache to favor their allocation.

Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 include/linux/xpfo.h | 2 ++
 mm/page_alloc.c  | 8 +++-
 mm/xpfo.c| 8 
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
index 77187578ca33..077d1cfadfa2 100644
--- a/include/linux/xpfo.h
+++ b/include/linux/xpfo.h
@@ -24,6 +24,7 @@ extern void xpfo_alloc_page(struct page *page, int order, 
gfp_t gfp);
 extern void xpfo_free_page(struct page *page, int order);
 
 extern bool xpfo_page_is_unmapped(struct page *page);
+extern bool xpfo_page_is_kernel(struct page *page);
 
 #else /* !CONFIG_XPFO */
 
@@ -33,6 +34,7 @@ static inline void xpfo_alloc_page(struct page *page, int 
order, gfp_t gfp) { }
 static inline void xpfo_free_page(struct page *page, int order) { }
 
 static inline bool xpfo_page_is_unmapped(struct page *page) { return false; }
+static inline bool xpfo_page_is_kernel(struct page *page) { return false; }
 
 #endif /* CONFIG_XPFO */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0241c8a7e72a..83404b41e52d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2421,7 +2421,13 @@ void free_hot_cold_page(struct page *page, bool cold)
}
 
pcp = &this_cpu_ptr(zone->pageset)->pcp;
-   if (!cold)
+   /*
+* XPFO: Allocating a page to userspace that was previously allocated
+* to the kernel requires an expensive TLB shootdown. To minimize this,
+* we only put non-kernel pages into the hot cache to favor their
+* allocation.
+*/
+   if (!cold && !xpfo_page_is_kernel(page))
list_add(&page->lru, &pcp->lists[migratetype]);
else
list_add_tail(&page->lru, &pcp->lists[migratetype]);
diff --git a/mm/xpfo.c b/mm/xpfo.c
index ddb1be05485d..f8dffda0c961 100644
--- a/mm/xpfo.c
+++ b/mm/xpfo.c
@@ -203,3 +203,11 @@ inline bool xpfo_page_is_unmapped(struct page *page)
 
return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags);
 }
+
+inline bool xpfo_page_is_kernel(struct page *page)
+{
+   if (!static_branch_unlikely(&xpfo_inited))
+   return false;
+
+   return test_bit(PAGE_EXT_XPFO_KERNEL, &lookup_page_ext(page)->flags);
+}
-- 
2.9.3



[RFC PATCH v2 0/3] Add support for eXclusive Page Frame Ownership (XPFO)

2016-09-14 Thread Juerg Haefliger
Changes from:
  v1 -> v2:
- Moved the code from arch/x86/mm/ to mm/ since it's (mostly)
  arch-agnostic.
- Moved the config to the generic layer and added ARCH_SUPPORTS_XPFO
  for x86.
- Use page_ext for the additional per-page data.
- Removed the clearing of pages. This can be accomplished by using
  PAGE_POISONING.
- Split up the patch into multiple patches.
- Fixed additional issues identified by reviewers.

This patch series adds support for XPFO which protects against 'ret2dir'
kernel attacks. The basic idea is to enforce exclusive ownership of page
frames by either the kernel or userspace, unless explicitly requested by
the kernel. Whenever a page destined for userspace is allocated, it is
unmapped from physmap (the kernel's page table). When such a page is
reclaimed from userspace, it is mapped back to physmap.

Additional fields in the page_ext struct are used for XPFO housekeeping.
Specifically, two flags to distinguish user vs. kernel pages and to tag
unmapped pages, a reference counter to balance kmap/kunmap operations,
and a lock to serialize access to the XPFO fields.

Known issues/limitations:
  - Only supports x86-64 (for now)
  - Only supports 4k pages (for now)
  - There are most likely some legitimate use cases where the kernel needs
to access userspace which need to be made XPFO-aware
  - Performance penalty

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Juerg Haefliger (3):
  Add support for eXclusive Page Frame Ownership (XPFO)
  xpfo: Only put previous userspace pages into the hot cache
  block: Always use a bounce buffer when XPFO is enabled

 arch/x86/Kconfig |   3 +-
 arch/x86/mm/init.c   |   2 +-
 block/blk-map.c  |   2 +-
 include/linux/highmem.h  |  15 +++-
 include/linux/page_ext.h |   7 ++
 include/linux/xpfo.h |  41 +
 lib/swiotlb.c|   3 +-
 mm/Makefile  |   1 +
 mm/page_alloc.c  |  10 ++-
 mm/page_ext.c|   4 +
 mm/xpfo.c| 213 +++
 security/Kconfig |  20 +
 12 files changed, 314 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/xpfo.h
 create mode 100644 mm/xpfo.c

-- 
2.9.3



Re: [PATCH 4.4 0/4] CVE fixes for 4.4

2016-09-07 Thread Juerg Haefliger
Hi Greg,

Did you have a chance to look at the below 4 patches?

Did I do something wrong when submitting them, or are there other reasons
not to include them in the 4.4 kernel?

Btw, they still apply on top of 4.4.20.

Thanks
...Juerg


On 08/29/2016 03:38 PM, Juerg Haefliger wrote:
> This patch series fixes the following CVEs in the 4.4 kernel:
>   - CVE-2016-0758
>   - CVE-2016-5243
>   - CVE-2016-5244
>   - CVE-2016-6130
> 
> David Howells (1):
>   KEYS: Fix ASN.1 indefinite length object parsing
> 
> Kangjie Lu (2):
>   tipc: fix an infoleak in tipc_nl_compat_link_dump
>   rds: fix an infoleak in rds_inc_info_copy
> 
> Martin Schwidefsky (1):
>   s390/sclp_ctl: fix potential information leak with /dev/sclp
> 
>  drivers/s390/char/sclp_ctl.c | 12 +++-
>  lib/asn1_decoder.c   | 16 +---
>  net/rds/recv.c   |  2 ++
>  net/tipc/netlink_compat.c|  3 ++-
>  4 files changed, 20 insertions(+), 13 deletions(-)
> 


-- 
Juerg Haefliger
Hewlett Packard Enterprise





Re: [RFC PATCH v2 2/3] xpfo: Only put previous userspace pages into the hot cache

2016-09-05 Thread Juerg Haefliger
On 09/02/2016 10:39 PM, Dave Hansen wrote:
> On 09/02/2016 04:39 AM, Juerg Haefliger wrote:
>> Allocating a page to userspace that was previously allocated to the
>> kernel requires an expensive TLB shootdown. To minimize this, we only
>> put non-kernel pages into the hot cache to favor their allocation.
> 
> But kernel allocations do allocate from these pools, right?

Yes.


> Does this
> just mean that kernel allocations usually have to pay the penalty to
> convert a page?

Only pages that are allocated for userspace (gfp & GFP_HIGHUSER ==
GFP_HIGHUSER) which were previously allocated for the kernel
(gfp & GFP_HIGHUSER != GFP_HIGHUSER) have to pay the penalty.


> So, what's the logic here?  You're assuming that order-0 kernel
> allocations are more rare than allocations for userspace?

The logic is to put reclaimed kernel pages into the cold cache to postpone
their allocation as long as possible to minimize (potential) TLB flushes.

...Juerg
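
For reference, "hot" and "cold" are just head vs. tail of the per-cpu free
list, and the allocator drains the list from the head. A condensed,
illustrative sketch of the allocation side (the real logic of this era lives
in buffered_rmqueue()):

    #include <linux/list.h>
    #include <linux/mm_types.h>

    /* Hot requests take the head, cold requests the tail, so tail pages --
     * where XPFO parks former kernel pages -- are reused last. */
    static struct page *pcp_take(struct list_head *list, bool cold)
    {
            struct page *page;

            if (cold)
                    page = list_last_entry(list, struct page, lru);
            else
                    page = list_first_entry(list, struct page, lru);
            list_del(&page->lru);
            return page;
    }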






[RFC PATCH v2 2/3] xpfo: Only put previous userspace pages into the hot cache

2016-09-02 Thread Juerg Haefliger
Allocating a page to userspace that was previously allocated to the
kernel requires an expensive TLB shootdown. To minimize this, we only
put non-kernel pages into the hot cache to favor their allocation.

Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 include/linux/xpfo.h | 2 ++
 mm/page_alloc.c  | 8 +++-
 mm/xpfo.c| 8 
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
index 77187578ca33..077d1cfadfa2 100644
--- a/include/linux/xpfo.h
+++ b/include/linux/xpfo.h
@@ -24,6 +24,7 @@ extern void xpfo_alloc_page(struct page *page, int order, 
gfp_t gfp);
 extern void xpfo_free_page(struct page *page, int order);
 
 extern bool xpfo_page_is_unmapped(struct page *page);
+extern bool xpfo_page_is_kernel(struct page *page);
 
 #else /* !CONFIG_XPFO */
 
@@ -33,6 +34,7 @@ static inline void xpfo_alloc_page(struct page *page, int 
order, gfp_t gfp) { }
 static inline void xpfo_free_page(struct page *page, int order) { }
 
 static inline bool xpfo_page_is_unmapped(struct page *page) { return false; }
+static inline bool xpfo_page_is_kernel(struct page *page) { return false; }
 
 #endif /* CONFIG_XPFO */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0241c8a7e72a..83404b41e52d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2421,7 +2421,13 @@ void free_hot_cold_page(struct page *page, bool cold)
}
 
pcp = &this_cpu_ptr(zone->pageset)->pcp;
-   if (!cold)
+   /*
+* XPFO: Allocating a page to userspace that was previously allocated
+* to the kernel requires an expensive TLB shootdown. To minimize this,
+* we only put non-kernel pages into the hot cache to favor their
+* allocation.
+*/
+   if (!cold && !xpfo_page_is_kernel(page))
list_add(&page->lru, &pcp->lists[migratetype]);
else
list_add_tail(&page->lru, &pcp->lists[migratetype]);
diff --git a/mm/xpfo.c b/mm/xpfo.c
index ddb1be05485d..f8dffda0c961 100644
--- a/mm/xpfo.c
+++ b/mm/xpfo.c
@@ -203,3 +203,11 @@ inline bool xpfo_page_is_unmapped(struct page *page)
 
return test_bit(PAGE_EXT_XPFO_UNMAPPED, &lookup_page_ext(page)->flags);
 }
+
+inline bool xpfo_page_is_kernel(struct page *page)
+{
+   if (!static_branch_unlikely(&xpfo_inited))
+   return false;
+
+   return test_bit(PAGE_EXT_XPFO_KERNEL, &lookup_page_ext(page)->flags);
+}
-- 
2.9.3



[RFC PATCH v2 3/3] block: Always use a bounce buffer when XPFO is enabled

2016-09-02 Thread Juerg Haefliger
This is a temporary hack to prevent the use of bio_map_user_iov()
which causes XPFO page faults.

Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 block/blk-map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index b8657fa8dc9a..e889dbfee6fb 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -52,7 +52,7 @@ static int __blk_rq_map_user_iov(struct request *rq,
struct bio *bio, *orig_bio;
int ret;
 
-   if (copy)
+   if (copy || IS_ENABLED(CONFIG_XPFO))
bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
else
bio = bio_map_user_iov(q, iter, gfp_mask);
-- 
2.9.3



[RFC PATCH v2 1/3] Add support for eXclusive Page Frame Ownership (XPFO)

2016-09-02 Thread Juerg Haefliger
This patch adds support for XPFO which protects against 'ret2dir' kernel
attacks. The basic idea is to enforce exclusive ownership of page frames
by either the kernel or userspace, unless explicitly requested by the
kernel. Whenever a page destined for userspace is allocated, it is
unmapped from physmap (the kernel's page table). When such a page is
reclaimed from userspace, it is mapped back to physmap.

Additional fields in the page_ext struct are used for XPFO housekeeping.
Specifically, two flags to distinguish user vs. kernel pages and to tag
unmapped pages, a reference counter to balance kmap/kunmap operations,
and a lock to serialize access to the XPFO fields.

Known issues/limitations:
  - Only supports x86-64 (for now)
  - Only supports 4k pages (for now)
  - There are most likely some legitimate uses cases where the kernel needs
to access userspace which need to be made XPFO-aware
  - Performance penalty

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Suggested-by: Vasileios P. Kemerlis <v...@cs.columbia.edu>
Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 arch/x86/Kconfig |   3 +-
 arch/x86/mm/init.c   |   2 +-
 include/linux/highmem.h  |  15 +++-
 include/linux/page_ext.h |   7 ++
 include/linux/xpfo.h |  39 +
 lib/swiotlb.c|   3 +-
 mm/Makefile  |   1 +
 mm/page_alloc.c  |   2 +
 mm/page_ext.c|   4 +
 mm/xpfo.c| 205 +++
 security/Kconfig |  20 +
 11 files changed, 296 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/xpfo.h
 create mode 100644 mm/xpfo.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c580d8c33562..dc5604a710c6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -165,6 +165,7 @@ config X86
select HAVE_STACK_VALIDATIONif X86_64
select ARCH_USES_HIGH_VMA_FLAGS if 
X86_INTEL_MEMORY_PROTECTION_KEYS
select ARCH_HAS_PKEYS   if 
X86_INTEL_MEMORY_PROTECTION_KEYS
+   select ARCH_SUPPORTS_XPFO   if X86_64
 
 config INSTRUCTION_DECODER
def_bool y
@@ -1350,7 +1351,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config X86_DIRECT_GBPAGES
def_bool y
-   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO
---help---
  Certain kernel features effectively disable kernel
  linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d28a2d741f9e..426427b54639 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,7 +161,7 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
-#if !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO)
/*
 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
 * use small pages.
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index bb3f3297062a..7a17c166532f 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr)
 #ifndef ARCH_HAS_KMAP
 static inline void *kmap(struct page *page)
 {
+   void *kaddr;
+
might_sleep();
-   return page_address(page);
+   kaddr = page_address(page);
+   xpfo_kmap(kaddr, page);
+   return kaddr;
 }
 
 static inline void kunmap(struct page *page)
 {
+   xpfo_kunmap(page_address(page), page);
 }
 
 static inline void *kmap_atomic(struct page *page)
 {
+   void *kaddr;
+
preempt_disable();
pagefault_disable();
-   return page_address(page);
+   kaddr = page_address(page);
+   xpfo_kmap(kaddr, page);
+   return kaddr;
 }
 #define kmap_atomic_prot(page, prot)   kmap_atomic(page)
 
 static inline void __kunmap_atomic(void *addr)
 {
+   xpfo_kunmap(addr, virt_to_page(addr));
pagefault_enable();
preempt_enable();
 }
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 03f2a3e7d76d..fdf63dcc399e 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -27,6 +27,8 @@ enum page_ext_flags {
PAGE_EXT_DEBUG_POISON,  /* Page is poisoned */
PAGE_EXT_DEBUG_GUARD,
PAGE_EXT_OWNER,
+   PAGE_EXT_XPFO_KERNEL,   /* Page is a kernel page */
+   PAGE_EXT_XPFO_UNMAPPED, /* Page is unmapped */
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
@@ -48,6 +50,11 @@ struct page_ext {
int last_migrate_reason;
depot_stack_handle_t handle;
 #endif
+#ifdef CONFIG_XPFO
+   int inited; /* Map counte

[RFC PATCH v2 1/3] Add support for eXclusive Page Frame Ownership (XPFO)

2016-09-02 Thread Juerg Haefliger
This patch adds support for XPFO which protects against 'ret2dir' kernel
attacks. The basic idea is to enforce exclusive ownership of page frames
by either the kernel or userspace, unless explicitly requested by the
kernel. Whenever a page destined for userspace is allocated, it is
unmapped from physmap (the kernel's page table). When such a page is
reclaimed from userspace, it is mapped back to physmap.

Additional fields in the page_ext struct are used for XPFO housekeeping.
Specifically, two flags to distinguish user vs. kernel pages and to tag
unmapped pages, a reference counter to balance kmap/kunmap operations, and
a lock to serialize access to the XPFO fields.

Known issues/limitations:
  - Only supports x86-64 (for now)
  - Only supports 4k pages (for now)
  - There are most likely some legitimate use cases where the kernel needs
to access userspace which need to be made XPFO-aware
  - Performance penalty

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Suggested-by: Vasileios P. Kemerlis 
Signed-off-by: Juerg Haefliger 
---
 arch/x86/Kconfig |   3 +-
 arch/x86/mm/init.c   |   2 +-
 include/linux/highmem.h  |  15 +++-
 include/linux/page_ext.h |   7 ++
 include/linux/xpfo.h |  39 +
 lib/swiotlb.c|   3 +-
 mm/Makefile  |   1 +
 mm/page_alloc.c  |   2 +
 mm/page_ext.c|   4 +
 mm/xpfo.c| 205 +++
 security/Kconfig |  20 +
 11 files changed, 296 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/xpfo.h
 create mode 100644 mm/xpfo.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c580d8c33562..dc5604a710c6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -165,6 +165,7 @@ config X86
select HAVE_STACK_VALIDATIONif X86_64
select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS
select ARCH_HAS_PKEYS   if X86_INTEL_MEMORY_PROTECTION_KEYS
+   select ARCH_SUPPORTS_XPFO   if X86_64
 
 config INSTRUCTION_DECODER
def_bool y
@@ -1350,7 +1351,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config X86_DIRECT_GBPAGES
def_bool y
-   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO
---help---
  Certain kernel features effectively disable kernel
  linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d28a2d741f9e..426427b54639 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -161,7 +161,7 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
-#if !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_XPFO)
/*
 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
 * use small pages.
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index bb3f3297062a..7a17c166532f 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/xpfo.h>
 
 #include <asm/cacheflush.h>
 
@@ -55,24 +56,34 @@ static inline struct page *kmap_to_page(void *addr)
 #ifndef ARCH_HAS_KMAP
 static inline void *kmap(struct page *page)
 {
+   void *kaddr;
+
might_sleep();
-   return page_address(page);
+   kaddr = page_address(page);
+   xpfo_kmap(kaddr, page);
+   return kaddr;
 }
 
 static inline void kunmap(struct page *page)
 {
+   xpfo_kunmap(page_address(page), page);
 }
 
 static inline void *kmap_atomic(struct page *page)
 {
+   void *kaddr;
+
preempt_disable();
pagefault_disable();
-   return page_address(page);
+   kaddr = page_address(page);
+   xpfo_kmap(kaddr, page);
+   return kaddr;
 }
 #define kmap_atomic_prot(page, prot)   kmap_atomic(page)
 
 static inline void __kunmap_atomic(void *addr)
 {
+   xpfo_kunmap(addr, virt_to_page(addr));
pagefault_enable();
preempt_enable();
 }
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 03f2a3e7d76d..fdf63dcc399e 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -27,6 +27,8 @@ enum page_ext_flags {
PAGE_EXT_DEBUG_POISON,  /* Page is poisoned */
PAGE_EXT_DEBUG_GUARD,
PAGE_EXT_OWNER,
+   PAGE_EXT_XPFO_KERNEL,   /* Page is a kernel page */
+   PAGE_EXT_XPFO_UNMAPPED, /* Page is unmapped */
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
@@ -48,6 +50,11 @@ struct page_ext {
int last_migrate_reason;
depot_stack_handle_t handle;
 #endif
+#ifdef CONFIG_XPFO
+   int inited; /* Map counter and lock initialized */
+   atomic_t ma

[RFC PATCH v2 0/3] Add support for eXclusive Page Frame Ownership (XPFO)

2016-09-02 Thread Juerg Haefliger
Changes from:
  v1 -> v2:
- Moved the code from arch/x86/mm/ to mm/ since it's (mostly)
  arch-agnostic.
- Moved the config to the generic layer and added ARCH_SUPPORTS_XPFO
  for x86.
- Use page_ext for the additional per-page data.
- Removed the clearing of pages. This can be accomplished by using
  PAGE_POISONING.
- Split up the patch into multiple patches.
- Fixed additional issues identified by reviewers.

This patch series adds support for XPFO which protects against 'ret2dir'
kernel attacks. The basic idea is to enforce exclusive ownership of page
frames by either the kernel or userspace, unless explicitly requested by
the kernel. Whenever a page destined for userspace is allocated, it is
unmapped from physmap (the kernel's page table). When such a page is
reclaimed from userspace, it is mapped back to physmap.

Additional fields in the page_ext struct are used for XPFO housekeeping.
Specifically, two flags to distinguish user vs. kernel pages and to tag
unmapped pages, a reference counter to balance kmap/kunmap operations, and
a lock to serialize access to the XPFO fields.
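
For illustration, a minimal sketch of what that kmap-side balancing could look
like, assuming page_ext fields named mapcount/maplock and an illustrative
helper set_kpte_present() that restores the physmap PTE (these names are
assumptions, not taken from the patch):

  /* Hedged sketch, not the patch's code: the first mapper of a previously
   * unmapped user page restores its physmap PTE; nested kmaps just count. */
  static void xpfo_kmap_sketch(void *kaddr, struct page *page)
  {
          struct page_ext *ext = lookup_page_ext(page);
          unsigned long flags;

          spin_lock_irqsave(&ext->maplock, flags);        /* assumed field */
          if (atomic_inc_return(&ext->mapcount) == 1 &&   /* assumed field */
              test_and_clear_bit(PAGE_EXT_XPFO_UNMAPPED, &ext->flags))
                  set_kpte_present(kaddr, page);          /* assumed helper */
          spin_unlock_irqrestore(&ext->maplock, flags);
  }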

Known issues/limitations:
  - Only supports x86-64 (for now)
  - Only supports 4k pages (for now)
  - There are most likely some legitimate use cases where the kernel needs
to access userspace which need to be made XPFO-aware
  - Performance penalty

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Juerg Haefliger (3):
  Add support for eXclusive Page Frame Ownership (XPFO)
  xpfo: Only put previous userspace pages into the hot cache
  block: Always use a bounce buffer when XPFO is enabled

 arch/x86/Kconfig |   3 +-
 arch/x86/mm/init.c   |   2 +-
 block/blk-map.c  |   2 +-
 include/linux/highmem.h  |  15 +++-
 include/linux/page_ext.h |   7 ++
 include/linux/xpfo.h |  41 +
 lib/swiotlb.c|   3 +-
 mm/Makefile  |   1 +
 mm/page_alloc.c  |  10 ++-
 mm/page_ext.c|   4 +
 mm/xpfo.c| 213 +++
 security/Kconfig |  20 +
 12 files changed, 314 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/xpfo.h
 create mode 100644 mm/xpfo.c

-- 
2.9.3



[PATCH] hyperv: Fix compilation issue with 4.4.19-rt27

2016-09-01 Thread Juerg Haefliger
Fix a compilation issue introduced by upstream commit
4b44f2d18a330565227a7348844493c59366171e

The upstream commit exports the symbol add_interrupt_randomness()
which is now being used in the Hyper-V driver. The RT patch adds another
argument to that function so its usage in the Hyper-V driver needs to
be fixed.

This patch should be merged with:
patches/0216-random-Make-it-work-on-rt.patch
commit 20985550b01a21ba00a587d17d7c26da61e01acc random: Make it work on rt

Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
Reviewed-by: T Makphaibulchoke <t...@hpe.com>
---
 drivers/hv/vmbus_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 9b5440f6b3b4..c9c4899834f2 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -828,7 +828,7 @@ static void vmbus_isr(void)
tasklet_schedule(&msg_dpc);
}
 
-   add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
+   add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, 0);
 }
 
 
-- 
2.9.3



[PATCH] net/ixgbe: Allow resetting VF admin mac to zero

2016-07-01 Thread Juerg Haefliger
The VF administrative mac addresses (stored in the PF driver) are
initialized to zero when the PF driver starts up.

These addresses may be modified in the PF driver through ndo calls
initiated by iproute2 or libvirt.

While we allow the PF/host to change the VF admin mac address from zero
to a valid unicast mac, we do not allow restoring the VF admin mac to
zero. We currently only allow changing this mac to a different unicast mac.

This leads to problems when libvirt scripts are used to deal with
VF mac addresses, and libvirt attempts to revoke the mac so this
host will not use it anymore.

Fix this by allowing resetting a VF administrative MAC back to zero.
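
To see why flipping the check is sufficient, here is a small stand-alone C
sketch that mirrors the etherdevice.h helpers (the driver itself uses the
kernel versions; this program is only illustrative): the all-zero address
fails is_valid_ether_addr() but is not multicast, so the new test accepts it
while still rejecting multicast/broadcast addresses.

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  #define ETH_ALEN 6

  /* Mirrors is_multicast_ether_addr(): I/G bit (LSB of octet 0) set. */
  static bool is_multicast(const unsigned char *a) { return a[0] & 0x01; }

  /* Mirrors is_zero_ether_addr(). */
  static bool is_zero(const unsigned char *a)
  {
          static const unsigned char z[ETH_ALEN];
          return memcmp(a, z, ETH_ALEN) == 0;
  }

  /* Mirrors is_valid_ether_addr(): unicast and non-zero. */
  static bool is_valid(const unsigned char *a)
  {
          return !is_multicast(a) && !is_zero(a);
  }

  int main(void)
  {
          const unsigned char zero[ETH_ALEN] = { 0 };

          printf("old check rejects zero mac: %d\n", !is_valid(zero));      /* 1 */
          printf("new check rejects zero mac: %d\n", is_multicast(zero));   /* 0 */
          return 0;
  }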

Implementation and commit message shamelessly stolen from:
commit 6e5224224faa ("net/mlx4_core: Allow resetting VF admin mac to zero")

Signed-off-by: Juerg Haefliger <juerg.haefli...@hpe.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index c5caacd..d075387 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -1280,7 +1280,7 @@ void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter)
 int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
 {
struct ixgbe_adapter *adapter = netdev_priv(netdev);
-   if (!is_valid_ether_addr(mac) || (vf >= adapter->num_vfs))
+   if (is_multicast_ether_addr(mac) || (vf >= adapter->num_vfs))
return -EINVAL;
adapter->vfinfo[vf].pf_set_mac = true;
dev_info(&adapter->pdev->dev, "setting MAC %pM on VF %d\n", mac, vf);
-- 
2.8.1



Re: [RFC PATCH] Add support for eXclusive Page Frame Ownership (XPFO)

2016-03-21 Thread Juerg Haefliger
Hi Balbir,

Apologies for the slow reply.


On 03/01/2016 03:10 AM, Balbir Singh wrote:
> 
> 
> On 27/02/16 01:21, Juerg Haefliger wrote:
>> This patch adds support for XPFO which protects against 'ret2dir' kernel
>> attacks. The basic idea is to enforce exclusive ownership of page frames
>> by either the kernel or userland, unless explicitly requested by the
>> kernel. Whenever a page destined for userland is allocated, it is
>> unmapped from physmap. When such a page is reclaimed from userland, it is
>> mapped back to physmap.
> physmap == xen physmap? Please clarify

No, it's not XEN related. I might have the terminology wrong. Physmap is the
term the original authors used to describe a large, contiguous virtual memory
region inside the kernel address space that contains a direct mapping of part
or all (depending on the architecture) of physical memory.
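
As a concrete illustration of that direct mapping (a hedged, minimal kernel
module sketch, not part of the patch):

  #include <linux/module.h>
  #include <linux/gfp.h>
  #include <linux/mm.h>

  static int __init physmap_demo_init(void)
  {
          struct page *page = alloc_page(GFP_KERNEL);

          if (!page)
                  return -ENOMEM;

          /* Every frame has a linear-map ("physmap") alias reachable via
           * page_address(); this alias is what XPFO unmaps for user pages. */
          pr_info("pfn %lx is aliased at %p in the direct map\n",
                  page_to_pfn(page), page_address(page));

          __free_page(page);
          return 0;
  }

  static void __exit physmap_demo_exit(void) { }

  module_init(physmap_demo_init);
  module_exit(physmap_demo_exit);
  MODULE_LICENSE("GPL");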


>> Mapping/unmapping from physmap is accomplished by modifying the PTE
>> permission bits to allow/disallow access to the page.
>>
>> Additional fields are added to the page struct for XPFO housekeeping.
>> Specifically a flags field to distinguish user vs. kernel pages, a
>> reference counter to track physmap map/unmap operations and a lock to
>> protect the XPFO fields.
>>
>> Known issues/limitations:
>>   - Only supported on x86-64.
> Is it due to lack of porting or a design limitation?

Lack of porting. Support for other architectures will come later.


>>   - Only supports 4k pages.
>>   - Adds additional data to the page struct.
>>   - There are most likely some additional and legitimate use cases where
>> the kernel needs to access userspace. Those need to be identified and
>> made XPFO-aware.
> Why not build an audit mode for it?

Can you elaborate what you mean by this?


>>   - There's a performance impact if XPFO is turned on. Per the paper
>> referenced below it's in the 1-3% ballpark. More performance testing
>> wouldn't hurt. What tests to run though?
>>
>> Reference paper by the original patch authors:
>>   http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf
>>
>> Suggested-by: Vasileios P. Kemerlis 
>> Signed-off-by: Juerg Haefliger 
> This patch needs to be broken down into smaller patches - a series

Agreed.


>> ---
>>  arch/x86/Kconfig |   2 +-
>>  arch/x86/Kconfig.debug   |  17 +
>>  arch/x86/mm/Makefile |   2 +
>>  arch/x86/mm/init.c   |   3 +-
>>  arch/x86/mm/xpfo.c   | 176 +++
>>  block/blk-map.c  |   7 +-
>>  include/linux/highmem.h  |  23 +--
>>  include/linux/mm_types.h |   4 ++
>>  include/linux/xpfo.h |  88 
>>  lib/swiotlb.c|   3 +-
>>  mm/page_alloc.c  |   7 +-
>>  11 files changed, 323 insertions(+), 9 deletions(-)
>>  create mode 100644 arch/x86/mm/xpfo.c
>>  create mode 100644 include/linux/xpfo.h
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index c46662f..9d32b4a 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -1343,7 +1343,7 @@ config ARCH_DMA_ADDR_T_64BIT
>>  
>>  config X86_DIRECT_GBPAGES
>>  def_bool y
>> -depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
>> +depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO
>>  ---help---
>>Certain kernel features effectively disable kernel
>>linear 1 GB mappings (even if the CPU otherwise
>> diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
>> index 9b18ed9..1331da5 100644
>> --- a/arch/x86/Kconfig.debug
>> +++ b/arch/x86/Kconfig.debug
>> @@ -5,6 +5,23 @@ config TRACE_IRQFLAGS_SUPPORT
>>  
>>  source "lib/Kconfig.debug"
>>  
>> +config XPFO
>> +bool "Enable eXclusive Page Frame Ownership (XPFO)"
>> +default n
>> +depends on DEBUG_KERNEL
>> +depends on X86_64
>> +select DEBUG_TLBFLUSH
>> +---help---
>> +  This option offers protection against 'ret2dir' (kernel) attacks.
>> +  When enabled, every time a page frame is allocated to user space, it
>> +  is unmapped from the direct mapped RAM region in kernel space
>> +  (physmap). Similarly, whenever page frames are freed/reclaimed, they
>> +  are mapped back to physmap. Special care is taken to minimize the
>> +  impact on performance by reducing TLB shootdowns and unnecessary page
>> +  zero fills.
>> +
>> +  If in doubt, say "N".
>> +
>>  config X

Re: [RFC PATCH] Add support for eXclusive Page Frame Ownership (XPFO)

2016-03-21 Thread Juerg Haefliger
Hi Laura,

Sorry for the late reply. I was on FTO and then traveling for the past couple of
days.


On 03/01/2016 02:31 AM, Laura Abbott wrote:
> On 02/26/2016 06:21 AM, Juerg Haefliger wrote:
>> This patch adds support for XPFO which protects against 'ret2dir' kernel
>> attacks. The basic idea is to enforce exclusive ownership of page frames
>> by either the kernel or userland, unless explicitly requested by the
>> kernel. Whenever a page destined for userland is allocated, it is
>> unmapped from physmap. When such a page is reclaimed from userland, it is
>> mapped back to physmap.
>>
>> Mapping/unmapping from physmap is accomplished by modifying the PTE
>> permission bits to allow/disallow access to the page.
>>
>> Additional fields are added to the page struct for XPFO housekeeping.
>> Specifically a flags field to distinguish user vs. kernel pages, a
>> reference counter to track physmap map/unmap operations and a lock to
>> protect the XPFO fields.
>>
>> Known issues/limitations:
>>- Only supported on x86-64.
>>- Only supports 4k pages.
>>- Adds additional data to the page struct.
>>- There are most likely some additional and legitimate use cases where
>>  the kernel needs to access userspace. Those need to be identified and
>>  made XPFO-aware.
>>- There's a performance impact if XPFO is turned on. Per the paper
>>  referenced below it's in the 1-3% ballpark. More performance testing
>>  wouldn't hurt. What tests to run though?
>>
>> Reference paper by the original patch authors:
>>http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf
>>
> 
> General note: Make sure to cc the x86 maintainers on the next version of
> the patch. I'd also recommend ccing the kernel hardening list (see the wiki
> page http://kernsec.org/wiki/index.php/Kernel_Self_Protection_Project for
> details)

Good idea. Thanks for the suggestion.


> If you can find a way to break this up into x86 specific vs. generic patches
> that would be better. Perhaps move the Kconfig for XPFO to the generic
> Kconfig layer and make it depend on ARCH_HAS_XPFO? x86 can then select
> ARCH_HAS_XPFO as the last option.

Good idea.


> There also isn't much that's actually x86 specific here except for
> some of the page table manipulation functions and even those can probably
> be abstracted away. It would be good to get more of this out of x86 to
> let other arches take advantage of it. The arm64 implementation would
> look pretty similar if you save the old kernel mapping and restore
> it on free.

OK. I need to familiarize myself with ARM to figure out which pieces can move
out of the arch subdir.


>  
>> Suggested-by: Vasileios P. Kemerlis 
>> Signed-off-by: Juerg Haefliger 
>> ---
>>   arch/x86/Kconfig |   2 +-
>>   arch/x86/Kconfig.debug   |  17 +
>>   arch/x86/mm/Makefile |   2 +
>>   arch/x86/mm/init.c   |   3 +-
>>   arch/x86/mm/xpfo.c   | 176 +++
>>   block/blk-map.c  |   7 +-
>>   include/linux/highmem.h  |  23 +--
>>   include/linux/mm_types.h |   4 ++
>>   include/linux/xpfo.h |  88 
>>   lib/swiotlb.c|   3 +-
>>   mm/page_alloc.c  |   7 +-
>>   11 files changed, 323 insertions(+), 9 deletions(-)
>>   create mode 100644 arch/x86/mm/xpfo.c
>>   create mode 100644 include/linux/xpfo.h
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index c46662f..9d32b4a 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -1343,7 +1343,7 @@ config ARCH_DMA_ADDR_T_64BIT
>>
>>   config X86_DIRECT_GBPAGES
>>   def_bool y
>> -depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
>> +depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO
>>   ---help---
>> Certain kernel features effectively disable kernel
>> linear 1 GB mappings (even if the CPU otherwise
>> diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
>> index 9b18ed9..1331da5 100644
>> --- a/arch/x86/Kconfig.debug
>> +++ b/arch/x86/Kconfig.debug
>> @@ -5,6 +5,23 @@ config TRACE_IRQFLAGS_SUPPORT
>>
>>   source "lib/Kconfig.debug"
>>
>> +config XPFO
>> +bool "Enable eXclusive Page Frame Ownership (XPFO)"
>> +default n
>> +depends on DEBUG_KERNEL
>> +depends on X86_64
>> +select DEBUG_TLBFLUSH
>> +---help---
>> +  This option offers protection against 'ret2dir' (kernel)

[RFC PATCH] Add support for eXclusive Page Frame Ownership (XPFO)

2016-02-26 Thread Juerg Haefliger
This patch adds support for XPFO which protects against 'ret2dir' kernel
attacks. The basic idea is to enforce exclusive ownership of page frames
by either the kernel or userland, unless explicitly requested by the
kernel. Whenever a page destined for userland is allocated, it is
unmapped from physmap. When such a page is reclaimed from userland, it is
mapped back to physmap.

Mapping/unmapping from physmap is accomplished by modifying the PTE
permission bits to allow/disallow access to the page.
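
A rough sketch of that PTE flip on x86-64 (4k mappings only; the function
name and shape are illustrative, not lifted from this patch):

  static void xpfo_flip_kpte(struct page *page, bool present)
  {
          unsigned long kaddr = (unsigned long)page_address(page);
          unsigned int level;
          pte_t *pte = lookup_address(kaddr, &level);

          if (WARN_ON(!pte || level != PG_LEVEL_4K))
                  return;

          if (present)
                  set_pte_atomic(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
          else
                  set_pte_atomic(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));

          /* Flush the local TLB; remote CPUs need a shootdown, which is
           * where the performance cost mentioned below comes from. */
          __flush_tlb_one(kaddr);
  }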

Additional fields are added to the page struct for XPFO housekeeping.
Specifically, a flags field to distinguish user vs. kernel pages, a
reference counter to track physmap map/unmap operations, and a lock to
protect the XPFO fields.

Known issues/limitations:
  - Only supported on x86-64.
  - Only supports 4k pages.
  - Adds additional data to the page struct.
  - There are most likely some additional and legitimate use cases where
the kernel needs to access userspace. Those need to be identified and
made XPFO-aware.
  - There's a performance impact if XPFO is turned on. Per the paper
referenced below it's in the 1-3% ballpark. More performance testing
wouldn't hurt. What tests to run though?

Reference paper by the original patch authors:
  http://www.cs.columbia.edu/~vpk/papers/ret2dir.sec14.pdf

Suggested-by: Vasileios P. Kemerlis 
Signed-off-by: Juerg Haefliger 
---
 arch/x86/Kconfig |   2 +-
 arch/x86/Kconfig.debug   |  17 +
 arch/x86/mm/Makefile |   2 +
 arch/x86/mm/init.c   |   3 +-
 arch/x86/mm/xpfo.c   | 176 +++
 block/blk-map.c  |   7 +-
 include/linux/highmem.h  |  23 +--
 include/linux/mm_types.h |   4 ++
 include/linux/xpfo.h |  88 
 lib/swiotlb.c|   3 +-
 mm/page_alloc.c  |   7 +-
 11 files changed, 323 insertions(+), 9 deletions(-)
 create mode 100644 arch/x86/mm/xpfo.c
 create mode 100644 include/linux/xpfo.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c46662f..9d32b4a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1343,7 +1343,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config X86_DIRECT_GBPAGES
def_bool y
-   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+   depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK && !XPFO
---help---
  Certain kernel features effectively disable kernel
  linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 9b18ed9..1331da5 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,6 +5,23 @@ config TRACE_IRQFLAGS_SUPPORT
 
 source "lib/Kconfig.debug"
 
+config XPFO
+   bool "Enable eXclusive Page Frame Ownership (XPFO)"
+   default n
+   depends on DEBUG_KERNEL
+   depends on X86_64
+   select DEBUG_TLBFLUSH
+   ---help---
+ This option offers protection against 'ret2dir' (kernel) attacks.
+ When enabled, every time a page frame is allocated to user space, it
+ is unmapped from the direct mapped RAM region in kernel space
+ (physmap). Similarly, whenever page frames are freed/reclaimed, they
+ are mapped back to physmap. Special care is taken to minimize the
+ impact on performance by reducing TLB shootdowns and unnecessary page
+ zero fills.
+
+ If in doubt, say "N".
+
 config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f9d38a4..8bf52b6 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -34,3 +34,5 @@ obj-$(CONFIG_ACPI_NUMA)   += srat.o
 obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
 
 obj-$(CONFIG_X86_INTEL_MPX)+= mpx.o
+
+obj-$(CONFIG_XPFO) += xpfo.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 493f541..27fc8a6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -150,7 +150,8 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
-#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) && \
+   !defined(CONFIG_XPFO)
/*
 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
 * This will simplify cpa(), which otherwise needs to support splitting
diff --git a/arch/x86/mm/xpfo.c b/arch/x86/mm/xpfo.c
new file mode 100644
index 000..6bc24d3
--- /dev/null
+++ b/arch/x86/mm/xpfo.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2016 Brown University. All rights reserved.
+ * Copyright (C) 2016 Hewlett Packard Enterprise Development, L.P.
+ *
+ * Authors:
+ *   Vasileios P. Kemerlis 
+ *   Juerg Haefliger 
+ *
+ * This program is free software; you can redistribute it and/or modify 

Re: [PATCH v2] scripts/sign-file.c: Add support for signing with a raw signature

2016-02-18 Thread Juerg Haefliger
On 02/10/2016 02:24 PM, Juerg Haefliger wrote:
> On 02/10/2016 11:12 AM, David Howells wrote:
>> Juerg Haefliger <juerg.haefli...@hpe.com> wrote:
>>
>>> This patch adds support for signing a kernel module with a raw
>>> detached PKCS#7 signature/message.
>>>
>>> The signature is not converted and is simply appended to the module so
>>> it needs to be in the right format. Using openssl, a valid signature can
>>> be generated like this:
>>>   $ openssl smime -sign -nocerts -noattr -binary -in <module> -inkey <key> \
>>>  -signer <x509> -outform der -out <raw sig>
>>>
>>> The resulting raw signature from the above command is (more or less)
>>> identical to the raw signature that sign-file itself can produce like
>>> this:
>>>   $ scripts/sign-file -d
>>
>> What's the usage case for this?  Can it be done instead with openssl PKCS#11?
> 
> Our internal signing service doesn't support PKCS#11. I have to submit the
> blobs and get detached PKCS#7 messages back. I don't claim I fully understand
> all the different signing mechanisms but everything worked just fine until
> support for signing with a detached signature was removed. IMO that's a
> regression, which I'm trying to fix with this patch.

Any comments?

Thanks
...Juerg



Re: [PATCH v2] scripts/sign-file.c: Add support for signing with a raw signature

2016-02-10 Thread Juerg Haefliger
On 02/10/2016 11:12 AM, David Howells wrote:
> Juerg Haefliger  wrote:
> 
>> This patch adds support for signing a kernel module with a raw
>> detached PKCS#7 signature/message.
>>
>> The signature is not converted and is simply appended to the module so
>> it needs to be in the right format. Using openssl, a valid signature can
>> be generated like this:
>>   $ openssl smime -sign -nocerts -noattr -binary -in <module> -inkey <key> \
>>  -signer <x509> -outform der -out <raw sig>
>>
>> The resulting raw signature from the above command is (more or less)
>> identical to the raw signature that sign-file itself can produce like
>> this:
>>   $ scripts/sign-file -d
> 
> What's the usage case for this?  Can it be done instead with openssl PKCS#11?

Our internal signing service doesn't support PKCS#11. I have to submit the blobs
and get detached PKCS#7 messages back. I don't claim I fully understand all the
different signing mechanisms but everything worked just fine until support for
signing with a detached signature was removed. IMO that's a regression, which
I'm trying to fix with this patch.

...Juerg



> David
> 



[PATCH v2] scripts/sign-file.c: Add support for signing with a raw signature

2016-02-04 Thread Juerg Haefliger
This patch adds support for signing a kernel module with a raw
detached PKCS#7 signature/message.

The signature is not converted and is simply appended to the module so
it needs to be in the right format. Using openssl, a valid signature can
be generated like this:
  $ openssl smime -sign -nocerts -noattr -binary -in <module> -inkey <key> \
     -signer <x509> -outform der -out <raw sig>

The resulting raw signature from the above command is (more or less)
identical to the raw signature that sign-file itself can produce like
this:
  $ scripts/sign-file -d
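
Concretely, "appended" follows the kernel's module signature format: the raw
PKCS#7 blob goes right after the module payload, followed by a struct
module_signature describing it and the magic marker string. A hedged
user-space sketch of just that append step (error handling trimmed):

  #include <stdint.h>
  #include <stdio.h>
  #include <arpa/inet.h>  /* htonl() for the big-endian sig_len */

  /* Layout as used by the kernel's module signature verification
   * (id_type 2 == PKEY_ID_PKCS7). */
  struct module_signature {
          uint8_t  algo, hash, id_type, signer_len, key_id_len, pad[3];
          uint32_t sig_len;       /* big-endian */
  };

  static const char magic[] = "~Module signature appended~\n";

  /* Append a raw detached PKCS#7 signature to dest (module already copied). */
  int append_raw_sig(FILE *dest, const uint8_t *sig, uint32_t sig_len)
  {
          struct module_signature ms = { .id_type = 2 /* PKEY_ID_PKCS7 */ };

          ms.sig_len = htonl(sig_len);
          if (fwrite(sig, 1, sig_len, dest) != sig_len ||
              fwrite(&ms, 1, sizeof(ms), dest) != sizeof(ms) ||
              fwrite(magic, 1, sizeof(magic) - 1, dest) != sizeof(magic) - 1)
                  return -1;      /* magic is written without its NUL */
          return 0;
  }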

Signed-off-by: Juerg Haefliger 
---
 scripts/sign-file.c | 235 
 1 file changed, 146 insertions(+), 89 deletions(-)

diff --git a/scripts/sign-file.c b/scripts/sign-file.c
index 250a7a6..c78ddd7 100755
--- a/scripts/sign-file.c
+++ b/scripts/sign-file.c
@@ -2,9 +2,11 @@
  *
  * Copyright © 2014-2015 Red Hat, Inc. All Rights Reserved.
  * Copyright © 2015  Intel Corporation.
+ * Copyright © 2016  Hewlett Packard Enterprise Development LP
  *
  * Authors: David Howells 
  *  David Woodhouse 
+ *  Juerg Haefliger 
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -67,6 +69,8 @@ void format(void)
 {
fprintf(stderr,
"Usage: scripts/sign-file [-dp]
 []\n");
+   fprintf(stderr,
+   "   scripts/sign-file -s
 []\n");
exit(2);
 }
 
@@ -126,26 +130,84 @@ static int pem_pw_cb(char *buf, int len, int w, void *v)
return pwlen;
 }
 
+static EVP_PKEY *read_private_key(const char *private_key_name)
+{
+   EVP_PKEY *private_key;
+
+   if (!strncmp(private_key_name, "pkcs11:", 7)) {
+   ENGINE *e;
+
+   ENGINE_load_builtin_engines();
+   drain_openssl_errors();
+   e = ENGINE_by_id("pkcs11");
+   ERR(!e, "Load PKCS#11 ENGINE");
+   if (ENGINE_init(e))
+   drain_openssl_errors();
+   else
+   ERR(1, "ENGINE_init");
+   if (key_pass)
+   ERR(!ENGINE_ctrl_cmd_string(e, "PIN", key_pass, 0),
+   "Set PKCS#11 PIN");
+   private_key = ENGINE_load_private_key(e, private_key_name,
+ NULL, NULL);
+   ERR(!private_key, "%s", private_key_name);
+   } else {
+   BIO *b;
+
+   b = BIO_new_file(private_key_name, "rb");
+   ERR(!b, "%s", private_key_name);
+   private_key = PEM_read_bio_PrivateKey(b, NULL, pem_pw_cb,
+ NULL);
+   ERR(!private_key, "%s", private_key_name);
+   BIO_free(b);
+   }
+
+   return private_key;
+}
+
+static X509 *read_x509(const char *x509_name)
+{
+   X509 *x509;
+   BIO *b;
+
+   b = BIO_new_file(x509_name, "rb");
+   ERR(!b, "%s", x509_name);
+   x509 = d2i_X509_bio(b, NULL); /* Binary encoded X.509 */
+   if (!x509) {
+   ERR(BIO_reset(b) != 1, "%s", x509_name);
+   x509 = PEM_read_bio_X509(b, NULL, NULL,
+NULL); /* PEM encoded X.509 */
+   if (x509)
+   drain_openssl_errors();
+   }
+   BIO_free(b);
+   ERR(!x509, "%s", x509_name);
+
+   return x509;
+}
+
 int main(int argc, char **argv)
 {
struct module_signature sig_info = { .id_type = PKEY_ID_PKCS7 };
char *hash_algo = NULL;
-   char *private_key_name, *x509_name, *module_name, *dest_name;
+   char *private_key_name = NULL, *raw_sig_name = NULL;
+   char *x509_name, *module_name, *dest_name;
bool save_sig = false, replace_orig;
bool sign_only = false;
+   bool raw_sig = false;
unsigned char buf[4096];
unsigned long module_size, sig_size;
unsigned int use_signed_attrs;
const EVP_MD *digest_algo;
EVP_PKEY *private_key;
 #ifndef USE_PKCS7
-   CMS_ContentInfo *cms;
+   CMS_ContentInfo *cms = NULL;
unsigned int use_keyid = 0;
 #else
-   PKCS7 *pkcs7;
+   PKCS7 *pkcs7 = NULL;
 #endif
X509 *x509;
-   BIO *b, *bd = NULL, *bm;
+   BIO *bd, *bm;
int opt, n;
OpenSSL_add_all_algorithms();
ERR_load_crypto_strings();
@@ -160,8 +222,9 @@ int main(int argc, char **argv)
 #endif
 
do {
-   opt = getopt(argc, argv, "dpk");
+   opt = getopt(argc, argv, "sdpk");
switch (opt) {
+   case 's': raw_sig = true; break;
case 'p': save_sig = true; break;
case 'd': sign_only = t
