Re: Linux 4.4.73

2017-06-16 Thread Greg KH
diff --git a/Makefile b/Makefile
index 94d663c935c0..ba5a70b6e32c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 4
 PATCHLEVEL = 4
-SUBLEVEL = 72
+SUBLEVEL = 73
 EXTRAVERSION =
 NAME = Blurry Fish Butt
 
@@ -789,7 +789,7 @@ KBUILD_CFLAGS   += $(call cc-option,-Werror=date-time)
 KBUILD_ARFLAGS := $(call ar-option,D)
 
 # check for 'asm goto'
-ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
+ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLAGS)), y)
KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
 endif
diff --git a/arch/arm/boot/dts/imx6dl.dtsi b/arch/arm/boot/dts/imx6dl.dtsi
index 4b0ec0703825..8ca9217204a0 100644
--- a/arch/arm/boot/dts/imx6dl.dtsi
+++ b/arch/arm/boot/dts/imx6dl.dtsi
@@ -30,7 +30,7 @@
		/* kHz	uV */
		996000	1250000
		792000	1175000
-		396000	1075000
+		396000	1150000
>;
fsl,soc-operating-points = <
/* ARM kHz  SOC-PU uV */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 7460df3eec6b..4612ed7ec2e5 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -229,12 +229,17 @@ ENTRY(sie64a)
lctlg   %c1,%c1,__LC_USER_ASCE  # load primary asce
 .Lsie_done:
 # some program checks are suppressing. C code (e.g. do_protection_exception)
-# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
-# instructions between sie64a and .Lsie_done should not cause program
-# interrupts. So lets use a nop (47 00 00 00) as a landing pad.
+# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
+# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
+# Other instructions between sie64a and .Lsie_done should not cause program
+# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
 # See also .Lcleanup_sie
-.Lrewind_pad:
-   nop 0
+.Lrewind_pad6:
+	nopr	7
+.Lrewind_pad4:
+	nopr	7
+.Lrewind_pad2:
+	nopr	7
.globl sie_exit
 sie_exit:
lg  %r14,__SF_EMPTY+8(%r15) # load guest register save area
@@ -247,7 +252,9 @@ sie_exit:
stg %r14,__SF_EMPTY+16(%r15)# set exit reason code
j   sie_exit
 
-   EX_TABLE(.Lrewind_pad,.Lsie_fault)
+   EX_TABLE(.Lrewind_pad6,.Lsie_fault)
+   EX_TABLE(.Lrewind_pad4,.Lsie_fault)
+   EX_TABLE(.Lrewind_pad2,.Lsie_fault)
EX_TABLE(sie_exit,.Lsie_fault)
 #endif
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index ef7d6c8fea66..f354fd84adeb 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -372,7 +372,7 @@ void __init vmem_map_init(void)
ro_end = (unsigned long)&_eshared & PAGE_MASK;
for_each_memblock(memory, reg) {
start = reg->base;
-   end = reg->base + reg->size - 1;
+   end = reg->base + reg->size;
if (start >= ro_end || end <= ro_start)
vmem_add_mem(start, end - start, 0);
else if (start >= ro_start && end <= ro_end)
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index d21cd625c0de..cc97a43268ee 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -85,7 +85,7 @@ static void dump_tl1_traplog(struct tl1_traplog *p)
 
 void bad_trap(struct pt_regs *regs, long lvl)
 {
-   char buffer[32];
+   char buffer[36];
siginfo_t info;
 
if (notify_die(DIE_TRAP, "bad trap", regs,
@@ -116,7 +116,7 @@ void bad_trap(struct pt_regs *regs, long lvl)
 
 void bad_trap_tl1(struct pt_regs *regs, long lvl)
 {
-   char buffer[32];
+   char buffer[36];

if (notify_die(DIE_TRAP_TL1, "bad trap tl1", regs,
   0, lvl, SIGTRAP) == NOTIFY_STOP)
diff --git a/arch/xtensa/include/asm/irq.h b/arch/xtensa/include/asm/irq.h
index f71f88ea7646..19707db966f1 100644
--- a/arch/xtensa/include/asm/irq.h
+++ b/arch/xtensa/include/asm/irq.h
@@ -29,7 +29,8 @@ static inline void variant_irq_disable(unsigned int irq) { }
 # define PLATFORM_NR_IRQS 0
 #endif
 #define XTENSA_NR_IRQS XCHAL_NUM_INTERRUPTS
-#define NR_IRQS (XTENSA_NR_IRQS + VARIANT_NR_IRQS + PLATFORM_NR_IRQS)
+#define NR_IRQS (XTENSA_NR_IRQS + VARIANT_NR_IRQS + PLATFORM_NR_IRQS + 1)
+#define XTENSA_PIC_LINUX_IRQ(hwirq) ((hwirq) + 1)
 
 #if VARIANT_NR_IRQS == 0
 static inline void variant_init_irq(void) { }
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index 4ac3d23161cf..441694464b1e 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -34,11 +34,6 @@ asmlinkage void do_IRQ(int hwirq, struct pt_regs *regs)
 {
int irq = irq_find_mapping(NULL, hwirq);
 
-   if (hwirq >= NR_IRQS) {

Re: Linux 4.4.73

2017-06-16 Thread Greg KH
diff --git a/Makefile b/Makefile
index 94d663c935c0..ba5a70b6e32c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 4
 PATCHLEVEL = 4
-SUBLEVEL = 72
+SUBLEVEL = 73
 EXTRAVERSION =
 NAME = Blurry Fish Butt
 
@@ -789,7 +789,7 @@ KBUILD_CFLAGS   += $(call cc-option,-Werror=date-time)
 KBUILD_ARFLAGS := $(call ar-option,D)
 
 # check for 'asm goto'
-ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
+ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLAGS)), y)
KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
 endif
diff --git a/arch/arm/boot/dts/imx6dl.dtsi b/arch/arm/boot/dts/imx6dl.dtsi
index 4b0ec0703825..8ca9217204a0 100644
--- a/arch/arm/boot/dts/imx6dl.dtsi
+++ b/arch/arm/boot/dts/imx6dl.dtsi
@@ -30,7 +30,7 @@
		/* kHz	uV */
		996000	1250000
		792000	1175000
-		396000	1075000
+		396000	1150000
>;
fsl,soc-operating-points = <
/* ARM kHz  SOC-PU uV */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 7460df3eec6b..4612ed7ec2e5 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -229,12 +229,17 @@ ENTRY(sie64a)
lctlg   %c1,%c1,__LC_USER_ASCE  # load primary asce
 .Lsie_done:
 # some program checks are suppressing. C code (e.g. do_protection_exception)
-# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
-# instructions between sie64a and .Lsie_done should not cause program
-# interrupts. So lets use a nop (47 00 00 00) as a landing pad.
+# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
+# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
+# Other instructions between sie64a and .Lsie_done should not cause program
+# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
 # See also .Lcleanup_sie
-.Lrewind_pad:
-   nop 0
+.Lrewind_pad6:
+	nopr	7
+.Lrewind_pad4:
+	nopr	7
+.Lrewind_pad2:
+	nopr	7
.globl sie_exit
 sie_exit:
lg  %r14,__SF_EMPTY+8(%r15) # load guest register save area
@@ -247,7 +252,9 @@ sie_exit:
stg %r14,__SF_EMPTY+16(%r15)# set exit reason code
j   sie_exit
 
-   EX_TABLE(.Lrewind_pad,.Lsie_fault)
+   EX_TABLE(.Lrewind_pad6,.Lsie_fault)
+   EX_TABLE(.Lrewind_pad4,.Lsie_fault)
+   EX_TABLE(.Lrewind_pad2,.Lsie_fault)
EX_TABLE(sie_exit,.Lsie_fault)
 #endif
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index ef7d6c8fea66..f354fd84adeb 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -372,7 +372,7 @@ void __init vmem_map_init(void)
ro_end = (unsigned long)&_eshared & PAGE_MASK;
for_each_memblock(memory, reg) {
start = reg->base;
-   end = reg->base + reg->size - 1;
+   end = reg->base + reg->size;
if (start >= ro_end || end <= ro_start)
vmem_add_mem(start, end - start, 0);
else if (start >= ro_start && end <= ro_end)
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index d21cd625c0de..cc97a43268ee 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -85,7 +85,7 @@ static void dump_tl1_traplog(struct tl1_traplog *p)
 
 void bad_trap(struct pt_regs *regs, long lvl)
 {
-   char buffer[32];
+   char buffer[36];
siginfo_t info;
 
if (notify_die(DIE_TRAP, "bad trap", regs,
@@ -116,7 +116,7 @@ void bad_trap(struct pt_regs *regs, long lvl)
 
 void bad_trap_tl1(struct pt_regs *regs, long lvl)
 {
-   char buffer[32];
+   char buffer[36];

if (notify_die(DIE_TRAP_TL1, "bad trap tl1", regs,
   0, lvl, SIGTRAP) == NOTIFY_STOP)
diff --git a/arch/xtensa/include/asm/irq.h b/arch/xtensa/include/asm/irq.h
index f71f88ea7646..19707db966f1 100644
--- a/arch/xtensa/include/asm/irq.h
+++ b/arch/xtensa/include/asm/irq.h
@@ -29,7 +29,8 @@ static inline void variant_irq_disable(unsigned int irq) { }
 # define PLATFORM_NR_IRQS 0
 #endif
 #define XTENSA_NR_IRQS XCHAL_NUM_INTERRUPTS
-#define NR_IRQS (XTENSA_NR_IRQS + VARIANT_NR_IRQS + PLATFORM_NR_IRQS)
+#define NR_IRQS (XTENSA_NR_IRQS + VARIANT_NR_IRQS + PLATFORM_NR_IRQS + 1)
+#define XTENSA_PIC_LINUX_IRQ(hwirq) ((hwirq) + 1)
 
 #if VARIANT_NR_IRQS == 0
 static inline void variant_init_irq(void) { }
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index 4ac3d23161cf..441694464b1e 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -34,11 +34,6 @@ asmlinkage void do_IRQ(int hwirq, struct pt_regs *regs)
 {
int irq = irq_find_mapping(NULL, hwirq);
 
-   if (hwirq >= NR_IRQS) {

Linux 4.4.73

2017-06-16 Thread Greg KH
I'm announcing the release of the 4.4.73 kernel.

All users of the 4.4 kernel series must upgrade.

The updated 4.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git 
linux-4.4.y
and can be browsed at the normal kernel.org git web browser:

http://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary

thanks,

greg k-h



 Makefile |4 
 arch/arm/boot/dts/imx6dl.dtsi|2 
 arch/s390/kernel/entry.S |   19 +
 arch/s390/mm/vmem.c  |2 
 arch/sparc/kernel/traps_64.c |4 
 arch/xtensa/include/asm/irq.h|3 
 arch/xtensa/kernel/irq.c |5 
 arch/xtensa/platforms/xtfpga/include/platform/hardware.h |6 
 arch/xtensa/platforms/xtfpga/setup.c |   10 
 block/partitions/msdos.c |2 
 drivers/base/power/runtime.c |   11 -
 drivers/gpu/drm/ast/ast_drv.h|1 
 drivers/gpu/drm/ast/ast_main.c   |  157 +++
 drivers/gpu/drm/ast/ast_post.c   |   18 +
 drivers/gpu/drm/nouveau/nouveau_display.c|3 
 drivers/gpu/drm/nouveau/nouveau_drm.c|5 
 drivers/gpu/drm/nouveau/nouveau_fence.h  |1 
 drivers/gpu/drm/nouveau/nouveau_usif.c   |3 
 drivers/gpu/drm/nouveau/nv84_fence.c |6 
 drivers/i2c/busses/i2c-piix4.c   |2 
 drivers/irqchip/irq-xtensa-mx.c  |2 
 drivers/irqchip/irq-xtensa-pic.c |2 
 drivers/net/ethernet/adaptec/starfire.c  |   45 
 drivers/net/ethernet/freescale/gianfar.c |4 
 drivers/net/ethernet/hisilicon/hns/hns_enet.c|2 
 drivers/net/ethernet/mellanox/mlx4/catas.c   |2 
 drivers/net/ethernet/mellanox/mlx4/intf.c|   12 +
 drivers/net/ethernet/mellanox/mlx4/mlx4.h|1 
 drivers/net/ethernet/renesas/ravb_main.c |  112 ++
 drivers/net/ethernet/xilinx/xilinx_emaclite.c|  126 ++--
 drivers/net/hamradio/mkiss.c |4 
 drivers/net/usb/r8152.c  |   13 +
 drivers/net/usb/sierra_net.c |  111 ++
 drivers/parport/parport_gsc.c|8 
 drivers/pinctrl/berlin/berlin-bg4ct.c|2 
 drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c   |   13 -
 fs/cifs/connect.c|   24 +-
 fs/fscache/cookie.c  |5 
 fs/fscache/netfs.c   |1 
 fs/fscache/object.c  |   32 ++-
 fs/nfs/nfs4state.c   |1 
 fs/proc/base.c   |2 
 fs/romfs/super.c |   23 ++
 include/linux/fscache-cache.h|1 
 include/linux/log2.h |   13 +
 include/net/ipv6.h   |5 
 mm/kasan/report.c|3 
 net/core/ethtool.c   |9 
 net/ipv4/arp.c   |   12 -
 net/ipv6/addrconf.c  |   10 
 net/ipv6/datagram.c  |   14 -
 net/ipv6/ip6_output.c|3 
 net/ipv6/tcp_ipv6.c  |   11 -
 net/ipv6/udp.c   |4 
 net/sctp/socket.c|6 
 net/tipc/server.c|   13 -
 56 files changed, 600 insertions(+), 315 deletions(-)

Alexey Khoroshilov (1):
  net: adaptec: starfire: add checks for dma mapping errors

Anssi Hannula (2):
  net: xilinx_emaclite: fix freezes due to unordered I/O
  net: xilinx_emaclite: fix receive buffer overflow

Ard Biesheuvel (1):
  log2: make order_base_2() behave correctly on const input value zero

Arseny Solokha (1):
  gianfar: synchronize DMA API usage by free_skb_rx_queue w/ gfar_new_page

Ben Skeggs (2):
  drm/nouveau: prevent userspace from deleting client object
  drm/nouveau/fence/g84-: protect against concurrent access to semaphore 
buffers

Christian Borntraeger (1):
  s390/kvm: do not rely on the ILC on kvm host protection fauls

Chuck Lever (1):
  nfs: Fix "Don't increment lock sequence ID after NFS4ERR_MOVED"

Linux 4.4.73

2017-06-16 Thread Greg KH
I'm announcing the release of the 4.4.73 kernel.

All users of the 4.4 kernel series must upgrade.

The updated 4.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git 
linux-4.4.y
and can be browsed at the normal kernel.org git web browser:

http://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary

thanks,

greg k-h



 Makefile |4 
 arch/arm/boot/dts/imx6dl.dtsi|2 
 arch/s390/kernel/entry.S |   19 +
 arch/s390/mm/vmem.c  |2 
 arch/sparc/kernel/traps_64.c |4 
 arch/xtensa/include/asm/irq.h|3 
 arch/xtensa/kernel/irq.c |5 
 arch/xtensa/platforms/xtfpga/include/platform/hardware.h |6 
 arch/xtensa/platforms/xtfpga/setup.c |   10 
 block/partitions/msdos.c |2 
 drivers/base/power/runtime.c |   11 -
 drivers/gpu/drm/ast/ast_drv.h|1 
 drivers/gpu/drm/ast/ast_main.c   |  157 +++
 drivers/gpu/drm/ast/ast_post.c   |   18 +
 drivers/gpu/drm/nouveau/nouveau_display.c|3 
 drivers/gpu/drm/nouveau/nouveau_drm.c|5 
 drivers/gpu/drm/nouveau/nouveau_fence.h  |1 
 drivers/gpu/drm/nouveau/nouveau_usif.c   |3 
 drivers/gpu/drm/nouveau/nv84_fence.c |6 
 drivers/i2c/busses/i2c-piix4.c   |2 
 drivers/irqchip/irq-xtensa-mx.c  |2 
 drivers/irqchip/irq-xtensa-pic.c |2 
 drivers/net/ethernet/adaptec/starfire.c  |   45 
 drivers/net/ethernet/freescale/gianfar.c |4 
 drivers/net/ethernet/hisilicon/hns/hns_enet.c|2 
 drivers/net/ethernet/mellanox/mlx4/catas.c   |2 
 drivers/net/ethernet/mellanox/mlx4/intf.c|   12 +
 drivers/net/ethernet/mellanox/mlx4/mlx4.h|1 
 drivers/net/ethernet/renesas/ravb_main.c |  112 ++
 drivers/net/ethernet/xilinx/xilinx_emaclite.c|  126 ++--
 drivers/net/hamradio/mkiss.c |4 
 drivers/net/usb/r8152.c  |   13 +
 drivers/net/usb/sierra_net.c |  111 ++
 drivers/parport/parport_gsc.c|8 
 drivers/pinctrl/berlin/berlin-bg4ct.c|2 
 drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c   |   13 -
 fs/cifs/connect.c|   24 +-
 fs/fscache/cookie.c  |5 
 fs/fscache/netfs.c   |1 
 fs/fscache/object.c  |   32 ++-
 fs/nfs/nfs4state.c   |1 
 fs/proc/base.c   |2 
 fs/romfs/super.c |   23 ++
 include/linux/fscache-cache.h|1 
 include/linux/log2.h |   13 +
 include/net/ipv6.h   |5 
 mm/kasan/report.c|3 
 net/core/ethtool.c   |9 
 net/ipv4/arp.c   |   12 -
 net/ipv6/addrconf.c  |   10 
 net/ipv6/datagram.c  |   14 -
 net/ipv6/ip6_output.c|3 
 net/ipv6/tcp_ipv6.c  |   11 -
 net/ipv6/udp.c   |4 
 net/sctp/socket.c|6 
 net/tipc/server.c|   13 -
 56 files changed, 600 insertions(+), 315 deletions(-)

Alexey Khoroshilov (1):
  net: adaptec: starfire: add checks for dma mapping errors

Anssi Hannula (2):
  net: xilinx_emaclite: fix freezes due to unordered I/O
  net: xilinx_emaclite: fix receive buffer overflow

Ard Biesheuvel (1):
  log2: make order_base_2() behave correctly on const input value zero

Arseny Solokha (1):
  gianfar: synchronize DMA API usage by free_skb_rx_queue w/ gfar_new_page

Ben Skeggs (2):
  drm/nouveau: prevent userspace from deleting client object
  drm/nouveau/fence/g84-: protect against concurrent access to semaphore 
buffers

Christian Borntraeger (1):
  s390/kvm: do not rely on the ILC on kvm host protection fauls

Chuck Lever (1):
  nfs: Fix "Don't increment lock sequence ID after NFS4ERR_MOVED"

Re: [PATCH v2 3/4] KVM: async_pf: Force a nested vmexit if the injected #PF is async_pf

2017-06-16 Thread Wanpeng Li
2017-06-16 23:38 GMT+08:00 Radim Krčmář :
> 2017-06-16 22:24+0800, Wanpeng Li:
>> 2017-06-16 21:37 GMT+08:00 Radim Krčmář :
>> > 2017-06-14 19:26-0700, Wanpeng Li:
>> >> From: Wanpeng Li 
>> >>
>> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> >> page fault, and constructs the expected vm-exit information fields. Force
>> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> >> is async page fault.
>> >>
>> >> Cc: Paolo Bonzini 
>> >> Cc: Radim Krčmář 
>> >> Signed-off-by: Wanpeng Li 
>> >> ---
>> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> >>  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception 
>> >> *fault)
>> >>  {
>> >>   ++vcpu->stat.pf_guest;
>> >> - vcpu->arch.cr2 = fault->address;
>> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>> >
>> > I think we need to act as if arch.exception.async_page_fault was not
>> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events().  Otherwise, if we
>> > migrate with pending async_page_fault exception, we'd inject it as a
>> > normal #PF, which could confuse/kill the nested guest.
>> >
>> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
>> > sanity as well.
>>
>> Do you mean we should add a field like async_page_fault to
>> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
>> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
>> restores events->exception.async_page_fault to
>> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
>
> No, I thought we could get away with a disgusting hack of hiding the
> exception from userspace, which would work for migration, but not if
> local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
>
> Extending the userspace interface would work, but I'd do it as a last
> resort, after all conservative solutions have failed.
> async_pf migration is very crude, so exposing the exception is just an
> ugly workaround for the local case.  Adding the flag would also require
> userspace configuration of async_pf features for the guest to keep
> compatibility.
>
> I see two options that might be simpler than adding the userspace flag:
>
>  1) do the nested VM exit sooner, at the place where we now queue #PF,
>  2) queue the #PF later, save the async_pf in some intermediate
> structure and consume it at the place where you proposed the nested
> VM exit.

How about something like this to not get exception events if it is
"is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
vcpu->arch.exception.async_page_fault" since lost a reschedule
optimization is not that important in L1.

@@ -3072,13 +3074,16 @@ static void
kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
 {
 process_nmi(vcpu);
-events->exception.injected =
-vcpu->arch.exception.pending &&
-!kvm_exception_is_soft(vcpu->arch.exception.nr);
-events->exception.nr = vcpu->arch.exception.nr;
-events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-events->exception.pad = 0;
-events->exception.error_code = vcpu->arch.exception.error_code;
+if (!(is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
+vcpu->arch.exception.async_page_fault)) {
+events->exception.injected =
+vcpu->arch.exception.pending &&
+!kvm_exception_is_soft(vcpu->arch.exception.nr);
+events->exception.nr = vcpu->arch.exception.nr;
+events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+events->exception.pad = 0;
+events->exception.error_code = vcpu->arch.exception.error_code;
+}

Regards,
Wanpeng Li


Re: [PATCH v2 3/4] KVM: async_pf: Force a nested vmexit if the injected #PF is async_pf

2017-06-16 Thread Wanpeng Li
2017-06-16 23:38 GMT+08:00 Radim Krčmář :
> 2017-06-16 22:24+0800, Wanpeng Li:
>> 2017-06-16 21:37 GMT+08:00 Radim Krčmář :
>> > 2017-06-14 19:26-0700, Wanpeng Li:
>> >> From: Wanpeng Li 
>> >>
>> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> >> page fault, and constructs the expected vm-exit information fields. Force
>> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> >> is async page fault.
>> >>
>> >> Cc: Paolo Bonzini 
>> >> Cc: Radim Krčmář 
>> >> Signed-off-by: Wanpeng Li 
>> >> ---
>> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> >>  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception 
>> >> *fault)
>> >>  {
>> >>   ++vcpu->stat.pf_guest;
>> >> - vcpu->arch.cr2 = fault->address;
>> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>> >
>> > I think we need to act as if arch.exception.async_page_fault was not
>> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events().  Otherwise, if we
>> > migrate with pending async_page_fault exception, we'd inject it as a
>> > normal #PF, which could confuse/kill the nested guest.
>> >
>> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
>> > sanity as well.
>>
>> Do you mean we should add a field like async_page_fault to
>> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
>> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
>> restores events->exception.async_page_fault to
>> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
>
> No, I thought we could get away with a disgusting hack of hiding the
> exception from userspace, which would work for migration, but not if
> local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
>
> Extending the userspace interface would work, but I'd do it as a last
> resort, after all conservative solutions have failed.
> async_pf migration is very crude, so exposing the exception is just an
> ugly workaround for the local case.  Adding the flag would also require
> userspace configuration of async_pf features for the guest to keep
> compatibility.
>
> I see two options that might be simpler than adding the userspace flag:
>
>  1) do the nested VM exit sooner, at the place where we now queue #PF,
>  2) queue the #PF later, save the async_pf in some intermediate
> structure and consume it at the place where you proposed the nested
> VM exit.

How about something like this to not get exception events if it is
"is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
vcpu->arch.exception.async_page_fault" since lost a reschedule
optimization is not that important in L1.

@@ -3072,13 +3074,16 @@ static void
kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
 {
 process_nmi(vcpu);
-events->exception.injected =
-vcpu->arch.exception.pending &&
-!kvm_exception_is_soft(vcpu->arch.exception.nr);
-events->exception.nr = vcpu->arch.exception.nr;
-events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-events->exception.pad = 0;
-events->exception.error_code = vcpu->arch.exception.error_code;
+if (!(is_guest_mode(vcpu) && vcpu->arch.exception.nr == PF_VECTOR &&
+vcpu->arch.exception.async_page_fault)) {
+events->exception.injected =
+vcpu->arch.exception.pending &&
+!kvm_exception_is_soft(vcpu->arch.exception.nr);
+events->exception.nr = vcpu->arch.exception.nr;
+events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+events->exception.pad = 0;
+events->exception.error_code = vcpu->arch.exception.error_code;
+}

Regards,
Wanpeng Li


Re: endian bitshift defects [ was: staging: fusb302: don't bitshift __le16 type ]

2017-06-16 Thread Joe Perches
On Sat, 2017-06-17 at 07:23 +0200, Julia Lawall wrote:
> On Fri, 16 Jun 2017, Joe Perches wrote:
> > On Fri, 2017-06-16 at 19:45 +0200, Frans Klaver wrote:
> > > The header field in struct pd_message is declared as an __le16 type. The
> > > data in the message is supposed to be little endian. This means we don't
> > > have to go and shift the individual bytes into position when we're
> > > filling the buffer, we can just copy the contents right away. As an
> > > added benefit we don't get fishy results on big endian systems anymore.
> > 
> > Thanks for pointing this out.
> > 
> > There are several instances of this class of error.
> > 
> > Here's a cocci script to find them.
> > 
> > This is best used with cocci's --all-includes option like:
> > 
> > $ spatch --all-includes --very-quiet --sp-file lebe_bitshifts.cocci .
> > [ many defects...]

Probably would have been better as [ many possible defects... ]

> > $ cat lebe_bitshifts.cocci
> > @@
> > typedef __le16, __le32, __le64,  __be16, __be32, __be64;
> > { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> > expression b;
> > @@
> > 
> > *   a << b

[etc...]

> Is this always a problem?

No, not always.

If the CPU is the equivalent endian, the bitshift is fine.
It can't be known if the code is only compiled on a
single cpu type.  It is rather odd though to use endian
notation if the code is compiled for a single cpu type.

> Would it be useful to add this to the scripts
> in the kernel?

Maybe.

btw: is there a way for the operators to be surrounded by
some \( \| \) or some other bracket style so it could
be written with a single test?

Something like:

@@
typedef __le16, __le32, __le64,  __be16, __be32, __be64;
{ __le16, __le32, __le64,  __be16, __be32, __be64 } a;
expression b;
@@

*   a [<<|<<=|>>|>>=] b



Re: endian bitshift defects [ was: staging: fusb302: don't bitshift __le16 type ]

2017-06-16 Thread Joe Perches
On Sat, 2017-06-17 at 07:23 +0200, Julia Lawall wrote:
> On Fri, 16 Jun 2017, Joe Perches wrote:
> > On Fri, 2017-06-16 at 19:45 +0200, Frans Klaver wrote:
> > > The header field in struct pd_message is declared as an __le16 type. The
> > > data in the message is supposed to be little endian. This means we don't
> > > have to go and shift the individual bytes into position when we're
> > > filling the buffer, we can just copy the contents right away. As an
> > > added benefit we don't get fishy results on big endian systems anymore.
> > 
> > Thanks for pointing this out.
> > 
> > There are several instances of this class of error.
> > 
> > Here's a cocci script to find them.
> > 
> > This is best used with cocci's --all-includes option like:
> > 
> > $ spatch --all-includes --very-quiet --sp-file lebe_bitshifts.cocci .
> > [ many defects...]

Probably would have been better as [ many possible defects... ]

> > $ cat lebe_bitshifts.cocci
> > @@
> > typedef __le16, __le32, __le64,  __be16, __be32, __be64;
> > { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> > expression b;
> > @@
> > 
> > *   a << b

[etc...]

> Is this always a problem?

No, not always.

If the CPU is the equivalent endian, the bitshift is fine.
It can't be known if the code is only compiled on a
single cpu type.  It is rather odd though to use endian
notation if the code is compiled for a single cpu type.

> Would it be useful to add this to the scripts
> in the kernel?

Maybe.

btw: is there a way for the operators to be surrounded by
some \( \| \) or some other bracket style so it could
be written with a single test?

Something like:

@@
typedef __le16, __le32, __le64,  __be16, __be32, __be64;
{ __le16, __le32, __le64,  __be16, __be32, __be64 } a;
expression b;
@@

*   a [<<|<<=|>>|>>=] b



Re: [RFC PATCH 1/3] atmel-hlcdc: add support for 8-bit color lookup table mode

2017-06-16 Thread Boris Brezillon
Le Sat, 17 Jun 2017 00:46:12 +0200,
Peter Rosin  a écrit :

>  Hm, it's probably too late to do it here. Planes have already been
>  enabled and the engine may have started to fetch data and do the
>  composition. You could do that in ->update_plane() [1], and make it a
>  per-plane thing.
> 
>  I'm not sure, but I think you can get the new crtc_state from
>  plane->crtc->state in this context (state have already been swapped,
>  and new state is being applied, which means relevant locks are held).
> >>>
> >>> Ok, I can move it there. My plan is to just copy the default .update_plane
> >>> function and insert 
> >>>
> >>>   if (crtc->state->color_mgmt_changed && crtc->state->gamma_lut) {
> >>>   ...
> >>>   }
> >>>
> >>> just before the drm_atomic_commit(state) call. Sounds ok?  
> >>
> >> Why would you copy the default ->update_plane() when we already have
> >> our own ->atomic_update_plane() implementation [1]? Just put it there
> >> (before the atmel_hlcdc_layer_update_commit() call) and we should be
> >> good.  
> > 
> > Ahh, but you said ->update_plane() and I took that as .update_plane in
> > layer_plane_funcs, not ->atomic_update() in 
> > atmel_hlcdc_layer_plane_helper_funcs.
> > 
> > Makes sense now, and much neater too.  
> 
> No, it doesn't make sense. There's no atmel_hlcdc_layer_update_commit call
> anywhere, and no such function. You seem to have some further changes that
> are not even in -next. Where am I getting those changes and why are they
> not upstream yet?

My bad, this part as been reworked in 4.12 and I was reading 4.11 code.
Indeed, atmel_hlcdc_layer_update_commit() no longer exists, but
atmel_hlcdc_plane_atomic_update() does.

Just add a function called atmel_hlcdc_plane_update_clut() in
atmel_hlcdc_plane.c and call it just after [1].

[1]http://elixir.free-electrons.com/linux/v4.12-rc5/source/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c#L770



Re: [RFC PATCH 1/3] atmel-hlcdc: add support for 8-bit color lookup table mode

2017-06-16 Thread Boris Brezillon
Le Sat, 17 Jun 2017 00:46:12 +0200,
Peter Rosin  a écrit :

>  Hm, it's probably too late to do it here. Planes have already been
>  enabled and the engine may have started to fetch data and do the
>  composition. You could do that in ->update_plane() [1], and make it a
>  per-plane thing.
> 
>  I'm not sure, but I think you can get the new crtc_state from
>  plane->crtc->state in this context (state have already been swapped,
>  and new state is being applied, which means relevant locks are held).
> >>>
> >>> Ok, I can move it there. My plan is to just copy the default .update_plane
> >>> function and insert 
> >>>
> >>>   if (crtc->state->color_mgmt_changed && crtc->state->gamma_lut) {
> >>>   ...
> >>>   }
> >>>
> >>> just before the drm_atomic_commit(state) call. Sounds ok?  
> >>
> >> Why would you copy the default ->update_plane() when we already have
> >> our own ->atomic_update_plane() implementation [1]? Just put it there
> >> (before the atmel_hlcdc_layer_update_commit() call) and we should be
> >> good.  
> > 
> > Ahh, but you said ->update_plane() and I took that as .update_plane in
> > layer_plane_funcs, not ->atomic_update() in 
> > atmel_hlcdc_layer_plane_helper_funcs.
> > 
> > Makes sense now, and much neater too.  
> 
> No, it doesn't make sense. There's no atmel_hlcdc_layer_update_commit call
> anywhere, and no such function. You seem to have some further changes that
> are not even in -next. Where am I getting those changes and why are they
> not upstream yet?

My bad, this part as been reworked in 4.12 and I was reading 4.11 code.
Indeed, atmel_hlcdc_layer_update_commit() no longer exists, but
atmel_hlcdc_plane_atomic_update() does.

Just add a function called atmel_hlcdc_plane_update_clut() in
atmel_hlcdc_plane.c and call it just after [1].

[1]http://elixir.free-electrons.com/linux/v4.12-rc5/source/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c#L770



Re: endian bitshift defects [ was: staging: fusb302: don't bitshift __le16 type ]

2017-06-16 Thread Julia Lawall


On Fri, 16 Jun 2017, Joe Perches wrote:

> On Fri, 2017-06-16 at 19:45 +0200, Frans Klaver wrote:
> > The header field in struct pd_message is declared as an __le16 type. The
> > data in the message is supposed to be little endian. This means we don't
> > have to go and shift the individual bytes into position when we're
> > filling the buffer, we can just copy the contents right away. As an
> > added benefit we don't get fishy results on big endian systems anymore.
>
> Thanks for pointing this out.
>
> There are several instances of this class of error.
>
> Here's a cocci script to find them.
>
> This is best used with cocci's --all-includes option like:
>
> $ spatch --all-includes --very-quiet --sp-file lebe_bitshifts.cocci .
> [ many defects...]
>
> $ cat lebe_bitshifts.cocci
> @@
> typedef __le16, __le32, __le64,  __be16, __be32, __be64;
> { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> expression b;
> @@
>
> * a << b
>
> @@
> { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> expression b;
> @@
>
> * a <<= b
>
> @@
> { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> expression b;
> @@
>
> * a >> b
>
> @@
> { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> expression b;
> @@
>
> * a >>= b

Is this always a problem?  Would it be useful to add this to the scripts
in the kernel?

julia

Re: endian bitshift defects [ was: staging: fusb302: don't bitshift __le16 type ]

2017-06-16 Thread Julia Lawall


On Fri, 16 Jun 2017, Joe Perches wrote:

> On Fri, 2017-06-16 at 19:45 +0200, Frans Klaver wrote:
> > The header field in struct pd_message is declared as an __le16 type. The
> > data in the message is supposed to be little endian. This means we don't
> > have to go and shift the individual bytes into position when we're
> > filling the buffer, we can just copy the contents right away. As an
> > added benefit we don't get fishy results on big endian systems anymore.
>
> Thanks for pointing this out.
>
> There are several instances of this class of error.
>
> Here's a cocci script to find them.
>
> This is best used with cocci's --all-includes option like:
>
> $ spatch --all-includes --very-quiet --sp-file lebe_bitshifts.cocci .
> [ many defects...]
>
> $ cat lebe_bitshifts.cocci
> @@
> typedef __le16, __le32, __le64,  __be16, __be32, __be64;
> { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> expression b;
> @@
>
> * a << b
>
> @@
> { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> expression b;
> @@
>
> * a <<= b
>
> @@
> { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> expression b;
> @@
>
> * a >> b
>
> @@
> { __le16, __le32, __le64,  __be16, __be32, __be64 } a;
> expression b;
> @@
>
> * a >>= b

Is this always a problem?  Would it be useful to add this to the scripts
in the kernel?

julia

[PATCH 2/2][RFC] PM / hibernate: Utilize the original e820 map for consistent check

2017-06-16 Thread Chen Yu
Use the e820_table_ori instead of e820_table_firmware to check
the consistence of memory layout provided by BIOS, because the
e820_table_firmware might be modified by the kernel such as efi
boot stub. To be more specific, during bootup, the efi boot stub
might allocate memory via efi service for the PCI device
information structure, then e820_reserve_setup_data() reserved
these dynamically allocated structures(AKA, setup_data) in
e820_table_firmware accordingly, changing their attribute from
E820_TYPE_RAM to E820_TYPE_RESERVED_KERN. So e820_table_firmware
is not the original BIOS-provided memory layout anymore.

As a result, we might get false-positive report that the memory layout
is inconsistent across hibernation:

The suspend kernel:
[0.00] e820: update [mem 0x76671018-0x76679457] usable ==> usable

The resume kernel:
[0.00] e820: update [mem 0x7666f018-0x76677457] usable ==> usable
...
[   15.752088] PM: Using 3 thread(s) for decompression.
[   15.752088] PM: Loading and decompressing image data (471870 pages)...
[   15.764971] Hibernate inconsistent memory map detected!
[   15.770833] PM: Image mismatch: architecture specific data

Actually it is safe to restore these pages because E820_TYPE_RAM and
E820_TYPE_RESERVED_KERN are treated the same during hibernation.

Reported-by: James Ren 
Reported-by: "Mejia, Leonidas A" 
Cc: Rafael J. Wysocki 
Cc: Len Brown 
Cc: Matt Fleming 
Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: linux...@vger.kernel.org
Signed-off-by: Chen Yu 
---
 arch/x86/power/hibernate_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fe..4bf087d 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -250,7 +250,7 @@ static int get_e820_md5(struct e820_table *table, void *buf)
 
 static void hibernation_e820_save(void *buf)
 {
-   get_e820_md5(e820_table_firmware, buf);
+   get_e820_md5(e820_table_ori, buf);
 }
 
 static bool hibernation_e820_mismatch(void *buf)
@@ -263,7 +263,7 @@ static bool hibernation_e820_mismatch(void *buf)
if (!memcmp(result, buf, MD5_DIGEST_SIZE))
return false;
 
-   ret = get_e820_md5(e820_table_firmware, result);
+   ret = get_e820_md5(e820_table_ori, result);
if (ret)
return true;
 
-- 
2.7.4



[PATCH 2/2][RFC] PM / hibernate: Utilize the original e820 map for consistent check

2017-06-16 Thread Chen Yu
Use the e820_table_ori instead of e820_table_firmware to check
the consistency of the memory layout provided by the BIOS, because the
e820_table_firmware might be modified by the kernel such as efi
boot stub. To be more specific, during bootup, the efi boot stub
might allocate memory via efi service for the PCI device
information structure, then e820_reserve_setup_data() reserved
these dynamically allocated structures(AKA, setup_data) in
e820_table_firmware accordingly, changing their attribute from
E820_TYPE_RAM to E820_TYPE_RESERVED_KERN. So e820_table_firmware
is not the original BIOS-provided memory layout anymore.

As a result, we might get false-positive report that the memory layout
is inconsistent across hibernation:

The suspend kernel:
[0.00] e820: update [mem 0x76671018-0x76679457] usable ==> usable

The resume kernel:
[0.00] e820: update [mem 0x7666f018-0x76677457] usable ==> usable
...
[   15.752088] PM: Using 3 thread(s) for decompression.
[   15.752088] PM: Loading and decompressing image data (471870 pages)...
[   15.764971] Hibernate inconsistent memory map detected!
[   15.770833] PM: Image mismatch: architecture specific data

Actually it is safe to restore these pages because E820_TYPE_RAM and
E820_TYPE_RESERVED_KERN are treated the same during hibernation.

Reported-by: James Ren 
Reported-by: "Mejia, Leonidas A" 
Cc: Rafael J. Wysocki 
Cc: Len Brown 
Cc: Matt Fleming 
Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: linux...@vger.kernel.org
Signed-off-by: Chen Yu 
---
 arch/x86/power/hibernate_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fe..4bf087d 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -250,7 +250,7 @@ static int get_e820_md5(struct e820_table *table, void *buf)
 
 static void hibernation_e820_save(void *buf)
 {
-   get_e820_md5(e820_table_firmware, buf);
+   get_e820_md5(e820_table_ori, buf);
 }
 
 static bool hibernation_e820_mismatch(void *buf)
@@ -263,7 +263,7 @@ static bool hibernation_e820_mismatch(void *buf)
if (!memcmp(result, buf, MD5_DIGEST_SIZE))
return false;
 
-   ret = get_e820_md5(e820_table_firmware, result);
+   ret = get_e820_md5(e820_table_ori, result);
if (ret)
return true;
 
-- 
2.7.4



Re: [RFC PATCH 1/2] mm: introduce bmap_walk()

2017-06-16 Thread Christoph Hellwig
On Fri, Jun 16, 2017 at 06:15:29PM -0700, Dan Williams wrote:
> Refactor the core of generic_swapfile_activate() into bmap_walk() so
> that it can be used by a new daxfile_activate() helper (to be added).

No way in hell!  generic_swapfile_activate needs to die and no new users
of ->bmap over my dead body.  It's guaranteed to fuck up your data left,
right and center.


[PATCH 1/2][RFC] x86/boot/e820: Introduce e820_table_ori to represent the real original e820 layout

2017-06-16 Thread Chen Yu
Currently we try to have e820_table_firmware to represent the
original firmware memory layout passed to us by the bootloader,
however it is not the case, the e820_table_firmware might still
be modified by linux:
1. During bootup, the efi boot stub might allocate memory via
   efi service for the PCI device information structure, then
   later e820_reserve_setup_data() reserved these dynamically
   allocated structures(AKA, setup_data) in e820_table_firmware
   accordingly.
2. The kexec might also modify the e820_table_firmware.

This brings problem to the memory layout checking logic during
hibernation, because in theory that code expects to get input
from the original memory layout passed by the BIOS, otherwise
we get a false-positive failure.

So introduce a new variable e820_table_ori to record the original
BIOS provided memory layout, which is composed of two parts:
the BIOS-provided physical RAM map and extended physical RAM map.

Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: Rafael J. Wysocki 
Cc: Len Brown 
Cc: Ying Huang 
Cc: x...@kernel.org
Signed-off-by: Chen Yu 
---
 arch/x86/include/asm/e820/api.h |  1 +
 arch/x86/kernel/e820.c  | 24 
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 8e0f8b8..d30114b 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -5,6 +5,7 @@
 
 extern struct e820_table *e820_table;
 extern struct e820_table *e820_table_firmware;
+extern struct e820_table *e820_table_ori;
 
 extern unsigned long pci_mem_start;
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d78a586..29dcb4c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -20,7 +20,7 @@
 #include 
 
 /*
- * We organize the E820 table into two main data structures:
+ * We organize the E820 table into three main data structures:
  *
  * - 'e820_table_firmware': the original firmware version passed to us by the
  *   bootloader - not modified by the kernel. We use this to:
@@ -28,14 +28,18 @@
  *   - inform the user about the firmware's notion of memory layout
  * via /sys/firmware/memmap
  *
- *   - the hibernation code uses it to generate a kernel-independent MD5
- * fingerprint of the physical memory layout of a system.
- *
  *   - kexec, which is a bootloader in disguise, uses the original E820
  * layout to pass to the kexec-ed kernel. This way the original kernel
  * can have a restricted E820 map while the kexec()-ed kexec-kernel
  * can have access to full memory - etc.
  *
+ * - 'e820_table_ori': the original firmware version passed to us by the
+ *   bootloader - not modified by the kernel or the efi boot stub.
+ *   We use this to:
+ *
+ *   - the hibernation code uses it to generate a kernel-independent MD5
+ * fingerprint of the physical memory layout of a system.
+ *
  * - 'e820_table': this is the main E820 table that is massaged by the
  *   low level x86 platform code, or modified by boot parameters, before
  *   passed on to higher level MM layers.
@@ -47,9 +51,11 @@
  */
 static struct e820_table e820_table_init   __initdata;
 static struct e820_table e820_table_firmware_init  __initdata;
+static struct e820_table e820_table_ori_init   __initdata;
 
 struct e820_table *e820_table __refdata= 
_table_init;
 struct e820_table *e820_table_firmware __refdata   = 
_table_firmware_init;
+struct e820_table *e820_table_ori __refdata= _table_ori_init;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -648,6 +654,12 @@ __init void e820__reallocate_tables(void)
BUG_ON(!n);
memcpy(n, e820_table_firmware, size);
e820_table_firmware = n;
+
+   size = offsetof(struct e820_table, entries) + sizeof(struct 
e820_entry)*e820_table_ori->nr_entries;
+   n = kmalloc(size, GFP_KERNEL);
+   BUG_ON(!n);
+   memcpy(n, e820_table_ori, size);
+   e820_table_ori = n;
 }
 
 /*
@@ -669,6 +681,9 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 
data_len)
__append_e820_table(extmap, entries);
e820__update_table(e820_table);
 
+   /* Update the original table if there's any extended memory. */
+   memcpy(e820_table_ori, e820_table, sizeof(*e820_table_ori));
+
early_memunmap(sdata, data_len);
pr_info("e820: extended physical RAM map:\n");
e820__print_table("extended");
@@ -1176,6 +1191,7 @@ void __init e820__memory_setup(void)
who = x86_init.resources.memory_setup();
 
memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+   memcpy(e820_table_ori, e820_table, sizeof(*e820_table_ori));
 
pr_info("e820: BIOS-provided 

Re: [RFC PATCH 1/2] mm: introduce bmap_walk()

2017-06-16 Thread Christoph Hellwig
On Fri, Jun 16, 2017 at 06:15:29PM -0700, Dan Williams wrote:
> Refactor the core of generic_swapfile_activate() into bmap_walk() so
> that it can be used by a new daxfile_activate() helper (to be added).

No way in hell!  generic_swapfile_activate needs to die and no new users
of ->bmap over my dead body.  It's guaranteed to fuck up your data left,
right and center.


[PATCH 1/2][RFC] x86/boot/e820: Introduce e820_table_ori to represent the real original e820 layout

2017-06-16 Thread Chen Yu
Currently we try to have e820_table_firmware to represent the
original firmware memory layout passed to us by the bootloader,
however it is not the case, the e820_table_firmware might still
be modified by linux:
1. During bootup, the efi boot stub might allocate memory via
   efi service for the PCI device information structure, then
   later e820_reserve_setup_data() reserved these dynamically
   allocated structures(AKA, setup_data) in e820_table_firmware
   accordingly.
2. The kexec might also modify the e820_table_firmware.

This brings problem to the memory layout checking logic during
hibernation, because in theory that code expects to get input
from the original memory layout passed by the BIOS, otherwise
we get a false-positive failure.

So introduce a new variable e820_table_ori to record the original
BIOS provided memory layout, which is composed of two parts:
the BIOS-provided physical RAM map and extended physical RAM map.

Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: Rafael J. Wysocki 
Cc: Len Brown 
Cc: Ying Huang 
Cc: x...@kernel.org
Signed-off-by: Chen Yu 
---
 arch/x86/include/asm/e820/api.h |  1 +
 arch/x86/kernel/e820.c  | 24 
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 8e0f8b8..d30114b 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -5,6 +5,7 @@
 
 extern struct e820_table *e820_table;
 extern struct e820_table *e820_table_firmware;
+extern struct e820_table *e820_table_ori;
 
 extern unsigned long pci_mem_start;
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d78a586..29dcb4c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -20,7 +20,7 @@
 #include 
 
 /*
- * We organize the E820 table into two main data structures:
+ * We organize the E820 table into three main data structures:
  *
  * - 'e820_table_firmware': the original firmware version passed to us by the
  *   bootloader - not modified by the kernel. We use this to:
@@ -28,14 +28,18 @@
  *   - inform the user about the firmware's notion of memory layout
  * via /sys/firmware/memmap
  *
- *   - the hibernation code uses it to generate a kernel-independent MD5
- * fingerprint of the physical memory layout of a system.
- *
  *   - kexec, which is a bootloader in disguise, uses the original E820
  * layout to pass to the kexec-ed kernel. This way the original kernel
  * can have a restricted E820 map while the kexec()-ed kexec-kernel
  * can have access to full memory - etc.
  *
+ * - 'e820_table_ori': the original firmware version passed to us by the
+ *   bootloader - not modified by the kernel or the efi boot stub.
+ *   We use this to:
+ *
+ *   - the hibernation code uses it to generate a kernel-independent MD5
+ * fingerprint of the physical memory layout of a system.
+ *
  * - 'e820_table': this is the main E820 table that is massaged by the
  *   low level x86 platform code, or modified by boot parameters, before
  *   passed on to higher level MM layers.
@@ -47,9 +51,11 @@
  */
 static struct e820_table e820_table_init   __initdata;
 static struct e820_table e820_table_firmware_init  __initdata;
+static struct e820_table e820_table_ori_init   __initdata;
 
 struct e820_table *e820_table __refdata= 
_table_init;
 struct e820_table *e820_table_firmware __refdata   = 
_table_firmware_init;
+struct e820_table *e820_table_ori __refdata= _table_ori_init;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -648,6 +654,12 @@ __init void e820__reallocate_tables(void)
BUG_ON(!n);
memcpy(n, e820_table_firmware, size);
e820_table_firmware = n;
+
+   size = offsetof(struct e820_table, entries) + sizeof(struct 
e820_entry)*e820_table_ori->nr_entries;
+   n = kmalloc(size, GFP_KERNEL);
+   BUG_ON(!n);
+   memcpy(n, e820_table_ori, size);
+   e820_table_ori = n;
 }
 
 /*
@@ -669,6 +681,9 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 
data_len)
__append_e820_table(extmap, entries);
e820__update_table(e820_table);
 
+   /* Update the original table if there's any extended memory. */
+   memcpy(e820_table_ori, e820_table, sizeof(*e820_table_ori));
+
early_memunmap(sdata, data_len);
pr_info("e820: extended physical RAM map:\n");
e820__print_table("extended");
@@ -1176,6 +1191,7 @@ void __init e820__memory_setup(void)
who = x86_init.resources.memory_setup();
 
memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+   memcpy(e820_table_ori, e820_table, sizeof(*e820_table_ori));
 
pr_info("e820: BIOS-provided physical RAM map:\n");
e820__print_table(who);
-- 
2.7.4



[PATCH 0/2][RFC] Introduce e820_table_ori to fix the memory inconsistent problem during hibernation

2017-06-16 Thread Chen Yu
This is a patch set to fix the issue found during hibernation restore that,
the MD5 fingerprint of the physical memory layout checking code has reported a
false-positive failure due to incorrect input from the e820 map.

Chen Yu (2):
  x86/boot/e820: Introduce e820_table_ori to represent the real original
e820 layout
  PM / hibernate: Utilize the original e820 map for consistent check

 arch/x86/include/asm/e820/api.h |  1 +
 arch/x86/kernel/e820.c  | 24 
 arch/x86/power/hibernate_64.c   |  4 ++--
 3 files changed, 23 insertions(+), 6 deletions(-)

-- 
2.7.4



[PATCH 0/2][RFC] Introduce e820_table_ori to fix the memory inconsistent problem during hibernation

2017-06-16 Thread Chen Yu
This is a patch set to fix the issue found during hibernation restore that,
the MD5 fingerprint of the physical memory layout checking code has reported a
false-positive failure due to incorrect input from the e820 map.

Chen Yu (2):
  x86/boot/e820: Introduce e820_table_ori to represent the real original
e820 layout
  PM / hibernate: Utilize the original e820 map for consistent check

 arch/x86/include/asm/e820/api.h |  1 +
 arch/x86/kernel/e820.c  | 24 
 arch/x86/power/hibernate_64.c   |  4 ++--
 3 files changed, 23 insertions(+), 6 deletions(-)

-- 
2.7.4



[PATCH] mm,oom_kill: Close race window of needlessly selecting new victims.

2017-06-16 Thread Tetsuo Handa
Michal Hocko wrote:
> On Fri 16-06-17 21:22:20, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > OK, could you play with the patch/idea suggested in
> > > http://lkml.kernel.org/r/20170615122031.gl1...@dhcp22.suse.cz?
> > 
> > I think we don't need to worry about mmap_sem dependency inside __mmput().
> > Since the OOM killer checks for !MMF_OOM_SKIP mm rather than TIF_MEMDIE 
> > thread,
> > we can keep the OOM killer disabled until we set MMF_OOM_SKIP to the 
> > victim's mm.
> > That is, elevating mm_users throughout the reaping procedure does not cause
> > premature victim selection, even after TIF_MEMDIE is cleared from the 
> > victim's
> > thread. Then, we don't need to use down_write()/up_write() for non OOM 
> > victim's mm
> > (nearly 100% of exit_mmap() calls), and can force partial reaping of OOM 
> > victim's mm
> > (nearly 0% of exit_mmap() calls) before __mmput() starts doing exit_aio() 
> > etc.
> > Patch is shown below. Only compile tested.
> 
> Yes, that would be another approach.
>  
> >  include/linux/sched/coredump.h |  1 +
> >  mm/oom_kill.c  | 80 
> > --
> >  2 files changed, 40 insertions(+), 41 deletions(-)
> > 
> > diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
> > index 98ae0d0..6b6237b 100644
> > --- a/include/linux/sched/coredump.h
> > +++ b/include/linux/sched/coredump.h
> > @@ -62,6 +62,7 @@ static inline int get_dumpable(struct mm_struct *mm)
> >   * on NFS restore
> >   */
> >  //#define MMF_EXE_FILE_CHANGED 18  /* see prctl_set_mm_exe_file() 
> > */
> > +#define MMF_OOM_REAPING18  /* mm is supposed to be reaped 
> > */
> 
> A new flag is not really needed. We can increase it for _each_ reapable
> oom victim.

Yes if based on an assumption that number of mark_oom_victim() calls and
wake_oom_reaper() calls matches...

> 
> > @@ -658,6 +643,13 @@ static void mark_oom_victim(struct task_struct *tsk)
> > if (!cmpxchg(>signal->oom_mm, NULL, mm))
> > mmgrab(tsk->signal->oom_mm);
> >  
> > +#ifdef CONFIG_MMU
> > +   if (!test_bit(MMF_OOM_REAPING, >flags)) {
> > +   set_bit(MMF_OOM_REAPING, >flags);
> > +   mmget(mm);
> > +   }
> > +#endif
> 
> This would really need a big fat warning explaining why we do not need
> mmget_not_zero. We rely on exit_mm doing both mmput and tsk->mm = NULL
> under the task_lock and mark_oom_victim is called under this lock as
> well and task_will_free_mem resp. find_lock_task_mm makes sure we do not
> even consider tasks wihout mm.
> 
> I agree that a solution which is fully contained inside the oom proper
> would be preferable to touching __mmput path.

OK. Updated patch shown below.


>From 5ed8922bd281456793408328c8b27899ebdd298b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa 
Date: Sat, 17 Jun 2017 14:04:09 +0900
Subject: [PATCH] mm,oom_kill: Close race window of needlessly selecting new
 victims.

David Rientjes has reported that the OOM killer can select next OOM victim
when existing OOM victims called __mmput() before the OOM reaper starts
trying to unmap pages. In his testing, 4.12-rc kernels are killing 1-4
processes unnecessarily for each OOM condition.

--
  One oom kill shows the system to be oom:

  [22999.488705] Node 0 Normal free:90484kB min:90500kB ...
  [22999.488711] Node 1 Normal free:91536kB min:91948kB ...

  followed up by one or more unnecessary oom kills showing the oom killer
  racing with memory freeing of the victim:

  [22999.510329] Node 0 Normal free:229588kB min:90500kB ...
  [22999.510334] Node 1 Normal free:600036kB min:91948kB ...
--

This is because commit e5e3f4c4f0e95ecb ("mm, oom_reaper: make sure that
mmput_async is called only when memory was reaped") kept not to set
MMF_OOM_REAPED flag when the OOM reaper found that mm_users == 0 but then
commit 26db62f179d112d3 ("oom: keep mm of the killed task available") by
error changed to always set MMF_OOM_REAPED flag. As a result, MMF_OOM_SKIP
flag is immediately set without waiting for __mmput() because __mmput()
might get stuck before setting MMF_OOM_SKIP flag, and led to above report.

A workaround is to let the OOM reaper wait for a while and give up via
timeout as if the OOM reaper was unable to take mmap_sem for read. But
we want to avoid timeout based approach if possible. Therefore, this
patch takes a different approach.

This patch elevates mm_users of an OOM victim's mm, and prevents the OOM
victim from calling __mmput() before the OOM reaper starts trying to unmap
pages. In this way, we can force the OOM reaper to try to reclaim some
memory before setting MMF_OOM_SKIP flag.

Since commit 862e3073b3eed13f ("mm, oom: get rid of
signal_struct::oom_victims") changed to keep the OOM killer disabled
until MMF_OOM_SKIP is set on the victim's mm rather than until TIF_MEMDIE
is cleared from the victim's thread, we can keep the OOM killer 

[PATCH] mm,oom_kill: Close race window of needlessly selecting new victims.

2017-06-16 Thread Tetsuo Handa
Michal Hocko wrote:
> On Fri 16-06-17 21:22:20, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > OK, could you play with the patch/idea suggested in
> > > http://lkml.kernel.org/r/20170615122031.gl1...@dhcp22.suse.cz?
> > 
> > I think we don't need to worry about mmap_sem dependency inside __mmput().
> > Since the OOM killer checks for !MMF_OOM_SKIP mm rather than TIF_MEMDIE 
> > thread,
> > we can keep the OOM killer disabled until we set MMF_OOM_SKIP to the 
> > victim's mm.
> > That is, elevating mm_users throughout the reaping procedure does not cause
> > premature victim selection, even after TIF_MEMDIE is cleared from the 
> > victim's
> > thread. Then, we don't need to use down_write()/up_write() for non OOM 
> > victim's mm
> > (nearly 100% of exit_mmap() calls), and can force partial reaping of OOM 
> > victim's mm
> > (nearly 0% of exit_mmap() calls) before __mmput() starts doing exit_aio() 
> > etc.
> > Patch is shown below. Only compile tested.
> 
> Yes, that would be another approach.
>  
> >  include/linux/sched/coredump.h |  1 +
> >  mm/oom_kill.c  | 80 
> > --
> >  2 files changed, 40 insertions(+), 41 deletions(-)
> > 
> > diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
> > index 98ae0d0..6b6237b 100644
> > --- a/include/linux/sched/coredump.h
> > +++ b/include/linux/sched/coredump.h
> > @@ -62,6 +62,7 @@ static inline int get_dumpable(struct mm_struct *mm)
> >   * on NFS restore
> >   */
> >  //#define MMF_EXE_FILE_CHANGED 18  /* see prctl_set_mm_exe_file() 
> > */
> > +#define MMF_OOM_REAPING18  /* mm is supposed to be reaped 
> > */
> 
> A new flag is not really needed. We can increase it for _each_ reapable
> oom victim.

Yes if based on an assumption that number of mark_oom_victim() calls and
wake_oom_reaper() calls matches...

> 
> > @@ -658,6 +643,13 @@ static void mark_oom_victim(struct task_struct *tsk)
> > if (!cmpxchg(>signal->oom_mm, NULL, mm))
> > mmgrab(tsk->signal->oom_mm);
> >  
> > +#ifdef CONFIG_MMU
> > +   if (!test_bit(MMF_OOM_REAPING, >flags)) {
> > +   set_bit(MMF_OOM_REAPING, >flags);
> > +   mmget(mm);
> > +   }
> > +#endif
> 
> This would really need a big fat warning explaining why we do not need
> mmget_not_zero. We rely on exit_mm doing both mmput and tsk->mm = NULL
> under the task_lock and mark_oom_victim is called under this lock as
> well and task_will_free_mem resp. find_lock_task_mm makes sure we do not
> even consider tasks wihout mm.
> 
> I agree that a solution which is fully contained inside the oom proper
> would be preferable to touching __mmput path.

OK. Updated patch shown below.


>From 5ed8922bd281456793408328c8b27899ebdd298b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa 
Date: Sat, 17 Jun 2017 14:04:09 +0900
Subject: [PATCH] mm,oom_kill: Close race window of needlessly selecting new
 victims.

David Rientjes has reported that the OOM killer can select next OOM victim
when existing OOM victims called __mmput() before the OOM reaper starts
trying to unmap pages. In his testing, 4.12-rc kernels are killing 1-4
processes unnecessarily for each OOM condition.

--
  One oom kill shows the system to be oom:

  [22999.488705] Node 0 Normal free:90484kB min:90500kB ...
  [22999.488711] Node 1 Normal free:91536kB min:91948kB ...

  followed up by one or more unnecessary oom kills showing the oom killer
  racing with memory freeing of the victim:

  [22999.510329] Node 0 Normal free:229588kB min:90500kB ...
  [22999.510334] Node 1 Normal free:600036kB min:91948kB ...
--

This is because commit e5e3f4c4f0e95ecb ("mm, oom_reaper: make sure that
mmput_async is called only when memory was reaped") kept not to set
MMF_OOM_REAPED flag when the OOM reaper found that mm_users == 0 but then
commit 26db62f179d112d3 ("oom: keep mm of the killed task available") by
error changed to always set MMF_OOM_REAPED flag. As a result, MMF_OOM_SKIP
flag is immediately set without waiting for __mmput() because __mmput()
might get stuck before setting MMF_OOM_SKIP flag, and led to above report.

A workaround is to let the OOM reaper wait for a while and give up via
timeout as if the OOM reaper was unable to take mmap_sem for read. But
we want to avoid timeout based approach if possible. Therefore, this
patch takes a different approach.

This patch elevates mm_users of an OOM victim's mm, and prevents the OOM
victim from calling __mmput() before the OOM reaper starts trying to unmap
pages. In this way, we can force the OOM reaper to try to reclaim some
memory before setting MMF_OOM_SKIP flag.

Since commit 862e3073b3eed13f ("mm, oom: get rid of
signal_struct::oom_victims") changed to keep the OOM killer disabled
until MMF_OOM_SKIP is set on the victim's mm rather than until TIF_MEMDIE
is cleared from the victim's thread, we can keep the OOM killer disabled
until __oom_reap_task_mm() or 

Re: [RFC PATCH 06/13] switchtec_ntb: initialize hardware for memory windows

2017-06-16 Thread Greg Kroah-Hartman
On Thu, Jun 15, 2017 at 02:37:22PM -0600, Logan Gunthorpe wrote:
> This commit adds the code to initialize the memory windows in the
> hardware. This includes setting up the requester ID table, and
> figuring out which bar corresponds to which memory window. (Seeing
> the switch can be configured with any number of bars.)
> 
> Also, seeing the device doesn't have hardware for scratchpads or
> determining the link status, we create a shared memory window that has
> these features. A magic number with a version component will be used
> to determine if the other side's driver is actually up.
> 
> Signed-off-by: Logan Gunthorpe 
> Reviewed-by: Stephen Bates 
> Reviewed-by: Kurt Schwemmer 
> ---
>  drivers/ntb/hw/mscc/switchtec_ntb.c | 296 
> 
>  1 file changed, 296 insertions(+)
> 
> diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c 
> b/drivers/ntb/hw/mscc/switchtec_ntb.c
> index 1f094216aa1c..756307d1a8a3 100644
> --- a/drivers/ntb/hw/mscc/switchtec_ntb.c
> +++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
> @@ -15,37 +15,332 @@
>  
>  #include 
>  #include 
> +#include 
>  
>  MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
>  MODULE_VERSION("0.1");
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Microsemi Corporation");
>  
> +#ifndef ioread64
> +#ifdef readq
> +#define ioread64 readq
> +#else
> +#define ioread64 _ioread64
> +static inline u64 _ioread64(void __iomem *mmio)
> +{
> + u64 low, high;
> +
> + low = ioread32(mmio);
> + high = ioread32(mmio + sizeof(u32));
> + return low | (high << 32);
> +}
> +#endif
> +#endif

Really?  Don't we have ioread64 in generic code for all arches?  If not,
that should be fixed, don't hide this in a random driver please.  Or
just restrict your driver to only building on those arches that do
provide this API.

thanks,

greg k-h


Re: [RFC PATCH 06/13] switchtec_ntb: initialize hardware for memory windows

2017-06-16 Thread Greg Kroah-Hartman
On Thu, Jun 15, 2017 at 02:37:22PM -0600, Logan Gunthorpe wrote:
> This commit adds the code to initialize the memory windows in the
> hardware. This includes setting up the requester ID table, and
> figuring out which bar corresponds to which memory window. (Seeing
> the switch can be configured with any number of bars.)
> 
> Also, seeing the device doesn't have hardware for scratchpads or
> determining the link status, we create a shared memory window that has
> these features. A magic number with a version component will be used
> to determine if the other side's driver is actually up.
> 
> Signed-off-by: Logan Gunthorpe 
> Reviewed-by: Stephen Bates 
> Reviewed-by: Kurt Schwemmer 
> ---
>  drivers/ntb/hw/mscc/switchtec_ntb.c | 296 
> 
>  1 file changed, 296 insertions(+)
> 
> diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c 
> b/drivers/ntb/hw/mscc/switchtec_ntb.c
> index 1f094216aa1c..756307d1a8a3 100644
> --- a/drivers/ntb/hw/mscc/switchtec_ntb.c
> +++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
> @@ -15,37 +15,332 @@
>  
>  #include 
>  #include 
> +#include 
>  
>  MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
>  MODULE_VERSION("0.1");
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Microsemi Corporation");
>  
> +#ifndef ioread64
> +#ifdef readq
> +#define ioread64 readq
> +#else
> +#define ioread64 _ioread64
> +static inline u64 _ioread64(void __iomem *mmio)
> +{
> + u64 low, high;
> +
> + low = ioread32(mmio);
> + high = ioread32(mmio + sizeof(u32));
> + return low | (high << 32);
> +}
> +#endif
> +#endif

Really?  Don't we have ioread64 in generic code for all arches?  If not,
that should be fixed, don't hide this in a random driver please.  Or
just restrict your driver to only building on those arches that do
provide this API.

thanks,

greg k-h


Re: [RFC PATCH 04/13] switchtec: add link event notifier block

2017-06-16 Thread Greg Kroah-Hartman
On Thu, Jun 15, 2017 at 02:37:20PM -0600, Logan Gunthorpe wrote:
> In order for the switchtec NTB code to handle link change events we
> create a notifier block in the switchtec code which gets called
> whenever an appropriate event interrupt occurs.
> 
> In order to preserve userspace's ability to follow these events,
> we compare the event count with a stored copy from last time we
> checked.
> 
> Signed-off-by: Logan Gunthorpe 
> Reviewed-by: Stephen Bates 
> Reviewed-by: Kurt Schwemmer 
> ---
>  drivers/pci/switch/switchtec.c | 53 
> ++
>  include/linux/switchtec.h  |  5 
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> index e9bf17b1934e..63e305b24fb9 100644
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -972,6 +972,50 @@ static const struct file_operations switchtec_fops = {
>   .compat_ioctl = switchtec_dev_ioctl,
>  };
>  
> +static void link_event_work(struct work_struct *work)
> +{
> + struct switchtec_dev *stdev;
> +
> + stdev = container_of(work, struct switchtec_dev, link_event_work);
> +
> + dev_dbg(>dev, "%s\n", __func__);

You do know about ftrace, right?  It's good to drop debugging code like
this for "final" versions.

> +
> + blocking_notifier_call_chain(>link_notifier, 0, stdev);
> +}

Do you really need a notifier call chain?  How many different things are
going to "hook up" to this?  I ask as they tend to get really messy over
time while direct callbacks are easier to handle and manage.

thanks,

greg k-h


Re: [RFC PATCH 04/13] switchtec: add link event notifier block

2017-06-16 Thread Greg Kroah-Hartman
On Thu, Jun 15, 2017 at 02:37:20PM -0600, Logan Gunthorpe wrote:
> In order for the switchtec NTB code to handle link change events we
> create a notifier block in the switchtec code which gets called
> whenever an appropriate event interrupt occurs.
> 
> In order to preserve userspace's ability to follow these events,
> we compare the event count with a stored copy from last time we
> checked.
> 
> Signed-off-by: Logan Gunthorpe 
> Reviewed-by: Stephen Bates 
> Reviewed-by: Kurt Schwemmer 
> ---
>  drivers/pci/switch/switchtec.c | 53 
> ++
>  include/linux/switchtec.h  |  5 
>  2 files changed, 58 insertions(+)
> 
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> index e9bf17b1934e..63e305b24fb9 100644
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -972,6 +972,50 @@ static const struct file_operations switchtec_fops = {
>   .compat_ioctl = switchtec_dev_ioctl,
>  };
>  
> +static void link_event_work(struct work_struct *work)
> +{
> + struct switchtec_dev *stdev;
> +
> + stdev = container_of(work, struct switchtec_dev, link_event_work);
> +
> + dev_dbg(>dev, "%s\n", __func__);

You do know about ftrace, right?  It's good to drop debugging code like
this for "final" versions.

> +
> + blocking_notifier_call_chain(>link_notifier, 0, stdev);
> +}

Do you really need a notifier call chain?  How many different things are
going to "hook up" to this?  I ask as they tend to get really messy over
time while direct callbacks are easier to handle and manage.

thanks,

greg k-h


Re: [RFC PATCH 01/13] switchtec: move structure definitions into a common header

2017-06-16 Thread Greg Kroah-Hartman
On Thu, Jun 15, 2017 at 02:37:17PM -0600, Logan Gunthorpe wrote:
> Create the switchtec.h header in include/linux with hardware defines
> and the switchtec_dev structure moved directly from switchtec.c.
This is a prep patch for creating an NTB driver for switchtec.
> 
> Signed-off-by: Logan Gunthorpe 
> Reviewed-by: Stephen Bates 
> Reviewed-by: Kurt Schwemmer 

Acked-by: Greg Kroah-Hartman 


Re: [RFC PATCH 01/13] switchtec: move structure definitions into a common header

2017-06-16 Thread Greg Kroah-Hartman
On Thu, Jun 15, 2017 at 02:37:17PM -0600, Logan Gunthorpe wrote:
> Create the switchtec.h header in include/linux with hardware defines
> and the switchtec_dev structure moved directly from switchtec.c.
This is a prep patch for creating an NTB driver for switchtec.
> 
> Signed-off-by: Logan Gunthorpe 
> Reviewed-by: Stephen Bates 
> Reviewed-by: Kurt Schwemmer 

Acked-by: Greg Kroah-Hartman 


Re: [RFC PATCH 02/13] switchtec: export class symbol for use in upper layer driver

2017-06-16 Thread Greg Kroah-Hartman
On Thu, Jun 15, 2017 at 02:37:18PM -0600, Logan Gunthorpe wrote:
> We switch to class_register/unregister and a declared class which
> is exported for use in the switchtec_ntb driver.
> 
> Signed-off-by: Logan Gunthorpe 
> Reviewed-by: Stephen Bates 
> Reviewed-by: Kurt Schwemmer 
> ---
>  drivers/pci/switch/switchtec.c | 21 +++--
>  include/linux/switchtec.h  |  2 ++
>  2 files changed, 13 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> index c4369ba7bbc1..e9bf17b1934e 100644
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -21,8 +21,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
> -#include 
>  #include 
>  
>  MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver");
> @@ -35,9 +33,14 @@ module_param(max_devices, int, 0644);
>  MODULE_PARM_DESC(max_devices, "max number of switchtec device instances");
>  
>  static dev_t switchtec_devt;
> -static struct class *switchtec_class;
>  static DEFINE_IDA(switchtec_minor_ida);
>  
> +struct class switchtec_class = {
> + .owner = THIS_MODULE,
> + .name = "switchtec",
> +};
> +EXPORT_SYMBOL(switchtec_class);

EXPORT_SYMBOL_GPL()?

And do you really have to move from a dynamic class to a static one?  I
know it will work just the same, but I hate seeing static structures
that have reference counts on them :)

thanks,

greg k-h

> +
>  enum mrpc_state {
>   MRPC_IDLE = 0,
>   MRPC_QUEUED,
> @@ -1026,7 +1029,7 @@ static struct switchtec_dev *stdev_create(struct 
> pci_dev *pdev)
>  
>   dev = >dev;
>   device_initialize(dev);
> - dev->class = switchtec_class;
> + dev->class = _class;
>   dev->parent = >dev;
>   dev->groups = switchtec_device_groups;
>   dev->release = stdev_release;
> @@ -1313,11 +1316,9 @@ static int __init switchtec_init(void)
>   if (rc)
>   return rc;
>  
> - switchtec_class = class_create(THIS_MODULE, "switchtec");
> - if (IS_ERR(switchtec_class)) {
> - rc = PTR_ERR(switchtec_class);
> + rc = class_register(_class);
> + if (rc)
>   goto err_create_class;
> - }
>  
>   rc = pci_register_driver(_pci_driver);
>   if (rc)
> @@ -1328,7 +1329,7 @@ static int __init switchtec_init(void)
>   return 0;
>  
>  err_pci_register:
> - class_destroy(switchtec_class);
> + class_unregister(_class);
>  
>  err_create_class:
>   unregister_chrdev_region(switchtec_devt, max_devices);
> @@ -1340,7 +1341,7 @@ module_init(switchtec_init);
>  static void __exit switchtec_exit(void)
>  {
>   pci_unregister_driver(_pci_driver);
> - class_destroy(switchtec_class);
> + class_unregister(_class);
>   unregister_chrdev_region(switchtec_devt, max_devices);
>   ida_destroy(_minor_ida);
>  
> diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
> index 508cda78a430..3b87618fc42f 100644
> --- a/include/linux/switchtec.h
> +++ b/include/linux/switchtec.h
> @@ -267,4 +267,6 @@ static inline struct switchtec_dev *to_stdev(struct 
> device *dev)
>   return container_of(dev, struct switchtec_dev, dev);
>  }
>  
> +extern struct class switchtec_class;
> +
>  #endif
> -- 
> 2.11.0


Re: [RFC PATCH 02/13] switchtec: export class symbol for use in upper layer driver

2017-06-16 Thread Greg Kroah-Hartman
On Thu, Jun 15, 2017 at 02:37:18PM -0600, Logan Gunthorpe wrote:
> We switch to class_register/unregister and a declared class which
> is exported for use in the switchtec_ntb driver.
> 
> Signed-off-by: Logan Gunthorpe 
> Reviewed-by: Stephen Bates 
> Reviewed-by: Kurt Schwemmer 
> ---
>  drivers/pci/switch/switchtec.c | 21 +++--
>  include/linux/switchtec.h  |  2 ++
>  2 files changed, 13 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> index c4369ba7bbc1..e9bf17b1934e 100644
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -21,8 +21,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
> -#include 
>  #include 
>  
>  MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver");
> @@ -35,9 +33,14 @@ module_param(max_devices, int, 0644);
>  MODULE_PARM_DESC(max_devices, "max number of switchtec device instances");
>  
>  static dev_t switchtec_devt;
> -static struct class *switchtec_class;
>  static DEFINE_IDA(switchtec_minor_ida);
>  
> +struct class switchtec_class = {
> + .owner = THIS_MODULE,
> + .name = "switchtec",
> +};
> +EXPORT_SYMBOL(switchtec_class);

EXPORT_SYMBOL_GPL()?

And do you really have to move from a dynamic class to a static one?  I
know it will work just the same, but I hate seeing static structures
that have reference counts on them :)

thanks,

greg k-h

> +
>  enum mrpc_state {
>   MRPC_IDLE = 0,
>   MRPC_QUEUED,
> @@ -1026,7 +1029,7 @@ static struct switchtec_dev *stdev_create(struct 
> pci_dev *pdev)
>  
>   dev = >dev;
>   device_initialize(dev);
> - dev->class = switchtec_class;
> + dev->class = _class;
>   dev->parent = >dev;
>   dev->groups = switchtec_device_groups;
>   dev->release = stdev_release;
> @@ -1313,11 +1316,9 @@ static int __init switchtec_init(void)
>   if (rc)
>   return rc;
>  
> - switchtec_class = class_create(THIS_MODULE, "switchtec");
> - if (IS_ERR(switchtec_class)) {
> - rc = PTR_ERR(switchtec_class);
> + rc = class_register(_class);
> + if (rc)
>   goto err_create_class;
> - }
>  
>   rc = pci_register_driver(_pci_driver);
>   if (rc)
> @@ -1328,7 +1329,7 @@ static int __init switchtec_init(void)
>   return 0;
>  
>  err_pci_register:
> - class_destroy(switchtec_class);
> + class_unregister(_class);
>  
>  err_create_class:
>   unregister_chrdev_region(switchtec_devt, max_devices);
> @@ -1340,7 +1341,7 @@ module_init(switchtec_init);
>  static void __exit switchtec_exit(void)
>  {
>   pci_unregister_driver(_pci_driver);
> - class_destroy(switchtec_class);
> + class_unregister(_class);
>   unregister_chrdev_region(switchtec_devt, max_devices);
>   ida_destroy(_minor_ida);
>  
> diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
> index 508cda78a430..3b87618fc42f 100644
> --- a/include/linux/switchtec.h
> +++ b/include/linux/switchtec.h
> @@ -267,4 +267,6 @@ static inline struct switchtec_dev *to_stdev(struct 
> device *dev)
>   return container_of(dev, struct switchtec_dev, dev);
>  }
>  
> +extern struct class switchtec_class;
> +
>  #endif
> -- 
> 2.11.0


Re: [RFC PATCH 00/13] Switchtec NTB Support

2017-06-16 Thread 'Greg Kroah-Hartman'
On Fri, Jun 16, 2017 at 11:21:00PM +0300, Serge Semin wrote:
> On Fri, Jun 16, 2017 at 01:34:59PM -0600, Logan Gunthorpe 
>  wrote:
> > Now, if you'd like to actually review the code I'd be happy to address
> > any concerns you find. I won't be responding to any more philosophical
> > arguments or bike-shedding over the format of the patch.
> > 
> 
> I don't want to review a patchset, which isn't properly formatted.

Ah, but the patchset does seem to be properly formatted.  At least it's
easy for me to review as-published, while a much smaller number of
patches, making much larger individual patches, would be much much
harder to review.

But what do I know...

Oh wait, I review more kernel patches than anyone else :)

Logan, given that you need to rebase these on the "new" ntb api (and why
the hell is that tree on github? We can't take kernel git pulls from
github), is it worth reviewing this patch series as-is, or do you want
us to wait?

thanks,

greg k-h


Re: [RFC PATCH 00/13] Switchtec NTB Support

2017-06-16 Thread 'Greg Kroah-Hartman'
On Fri, Jun 16, 2017 at 11:21:00PM +0300, Serge Semin wrote:
> On Fri, Jun 16, 2017 at 01:34:59PM -0600, Logan Gunthorpe 
>  wrote:
> > Now, if you'd like to actually review the code I'd be happy to address
> > any concerns you find. I won't be responding to any more philosophical
> > arguments or bike-shedding over the format of the patch.
> > 
> 
> I don't want to review a patchset, which isn't properly formatted.

Ah, but the patchset does seem to be properly formatted.  At least it's
easy for me to review as-published, while a much smaller number of
patches, making much larger individual patches, would be much much
harder to review.

But what do I know...

Oh wait, I review more kernel patches than anyone else :)

Logan, given that you need to rebase these on the "new" ntb api (and why
the hell is that tree on github? We can't take kernel git pulls from
github), is it worth reviewing this patch series as-is, or do you want
us to wait?

thanks,

greg k-h


Re: LTS testing with latest kselftests - some failures

2017-06-16 Thread Greg Kroah-Hartman
On Fri, Jun 16, 2017 at 09:47:21PM +0200, Luis R. Rodriguez wrote:
> Some of the knobs however are for extending tests for
> existing APIs in older kernels, the async and custom fallback one are an
> example.  There are a series of test cases later added which could help
> test LTS kernels. Would Linaro pick these test driver enhancements to help
> increase coverage of tests? Or is it not worth it? If its worth it then
> what I was curious was how to help make this easier for this process to
> bloom.

I don't understand, what do you mean by "pick these test driver
enhancements"?  What kind of "knobs" are there in tests?  Shouldn't the
tests "just work" with no kind of special configuration of the tests be
needed?  No user is going to know to enable something special.

Make the tests "just work" please, because given the large number of
them, no one is going to know to look for special things.

thanks,

greg k-h


Re: LTS testing with latest kselftests - some failures

2017-06-16 Thread Greg Kroah-Hartman
On Fri, Jun 16, 2017 at 09:47:21PM +0200, Luis R. Rodriguez wrote:
> Some of the knobs however are for extending tests for
> existing APIs in older kernels, the async and custom fallback one are an
> example.  There are a series of test cases later added which could help
> test LTS kernels. Would Linaro pick these test driver enhancements to help
> increase coverage of tests? Or is it not worth it? If its worth it then
> what I was curious was how to help make this easier for this process to
> bloom.

I don't understand, what do you mean by "pick these test driver
enhancements"?  What kind of "knobs" are there in tests?  Shouldn't the
tests "just work" with no kind of special configuration of the tests be
needed?  No user is going to know to enable something special.

Make the tests "just work" please, because given the large number of
them, no one is going to know to look for special things.

thanks,

greg k-h


Re: [PATCH v2 1/3] mm: add vm_insert_mixed_mkwrite()

2017-06-16 Thread Ross Zwisler
On Thu, Jun 15, 2017 at 04:42:04PM +0200, Jan Kara wrote:
> On Wed 14-06-17 11:22:09, Ross Zwisler wrote:
> > To be able to use the common 4k zero page in DAX we need to have our PTE
> > fault path look more like our PMD fault path where a PTE entry can be
> > marked as dirty and writeable as it is first inserted, rather than waiting
> > for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault() call.
> > 
> > Right now we can rely on having a dax_pfn_mkwrite() call because we can
> > distinguish between these two cases in do_wp_page():
> > 
> > case 1: 4k zero page => writable DAX storage
> > case 2: read-only DAX storage => writeable DAX storage
> > 
> > This distinction is made via vm_normal_page().  vm_normal_page() returns
> > false for the common 4k zero page, though, just as it does for DAX ptes.
> > Instead of special casing the DAX + 4k zero page case, we will simplify our
> > DAX PTE page fault sequence so that it matches our DAX PMD sequence, and
> > get rid of dax_pfn_mkwrite() completely.
> > 
> > This means that insert_pfn() needs to follow the lead of insert_pfn_pmd()
> > and allow us to pass in a 'mkwrite' flag.  If 'mkwrite' is set insert_pfn()
> > will do the work that was previously done by wp_page_reuse() as part of the
> > dax_pfn_mkwrite() call path.
> > 
> > Signed-off-by: Ross Zwisler 
> 
> So I agree that getting rid of dax_pfn_mkwrite() and using fault handler in
> that case is a way to go. However I somewhat dislike the
> vm_insert_mixed_mkwrite() thing - it looks like a hack - and I'm aware that
> we have a similar thing for PMD which is ugly as well. Besides being ugly
> I'm also concerned that when 'mkwrite' is set, we just silently overwrite
> whatever PTE was installed at that position. Not that I'd see how that
> could screw us for DAX but still a concern that e.g. some PTE flag could
> get discarded by this is there... In fact, for !HAVE_PTE_SPECIAL
> architectures, you will leak zero page references by just overwriting the
> PTE - for those archs you really need to unmap zero page before replacing
> PTE (and the same for PMD I suppose).
> 
> So how about some vmf_insert_pfn(vmf, pe_size, pfn) helper that would
> properly detect PTE / PMD case, read / write case etc., check that PTE did
> not change from orig_pte, and handle all the nasty details instead of
> messing with insert_pfn?

I played around with this some today, and I wasn't super happy with the
results.  Here were some issues I encountered:

1) The pte_mkyoung(), maybe_mkwrite() and pte_mkdirty() calls need to happen
with the PTE locked, and I'm currently able to piggy-back on the locking done
in insert_pfn().  If I keep those steps out of insert_pfn() I either have to
essentially duplicate all the work done by insert_pfn() into another function
so I can do everything I need under one lock, or I have to insert the PFN via
insert_pfn() (which as you point out, will just leave the pfn alone if it's
already present), then for writes I have to re-grab the PTE lock and do
the mkwrite steps.

Either of these work, but they both also seem kind of gross...

2) Combining the PTE and PMD cases into a common function will require
mm/memory.c to call vmf_insert_pfn_pmd(), which depends on
CONFIG_TRANSPARENT_HUGEPAGE being defined.  This works, it just means some
more #ifdef CONFIG_TRANSPARENT_HUGEPAGE hackery in mm/memory.c.

I agree that unconditionally overwriting the PTE when mkwrite is set is
undesirable, and should be fixed.  My implementation of the wrapper just
didn't seem that natural, which usually tells me I'm headed down the wrong
path.  Maybe I'm just not fully understanding what you intended?

In any case, my current favorite solution for this issue is still what I had
in v1:

https://patchwork.kernel.org/patch/9772809/

with perhaps the removal of the new vm_insert_mixed_mkwrite() symbol, and just
adding a 'write' flag to vm_insert_mixed() and updating all the call sites,
and fixing the flow where mkwrite unconditionally overwrites the PTE?

If not, can you help me understand what you think is ugly about the 'write'
flag to vm_insert_mixed() and vmf_insert_pfn_pmd()?


Re: [PATCH v2 1/3] mm: add vm_insert_mixed_mkwrite()

2017-06-16 Thread Ross Zwisler
On Thu, Jun 15, 2017 at 04:42:04PM +0200, Jan Kara wrote:
> On Wed 14-06-17 11:22:09, Ross Zwisler wrote:
> > To be able to use the common 4k zero page in DAX we need to have our PTE
> > fault path look more like our PMD fault path where a PTE entry can be
> > marked as dirty and writeable as it is first inserted, rather than waiting
> > for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault() call.
> > 
> > Right now we can rely on having a dax_pfn_mkwrite() call because we can
> > distinguish between these two cases in do_wp_page():
> > 
> > case 1: 4k zero page => writable DAX storage
> > case 2: read-only DAX storage => writeable DAX storage
> > 
> > This distinction is made via vm_normal_page().  vm_normal_page() returns
> > false for the common 4k zero page, though, just as it does for DAX ptes.
> > Instead of special casing the DAX + 4k zero page case, we will simplify our
> > DAX PTE page fault sequence so that it matches our DAX PMD sequence, and
> > get rid of dax_pfn_mkwrite() completely.
> > 
> > This means that insert_pfn() needs to follow the lead of insert_pfn_pmd()
> > and allow us to pass in a 'mkwrite' flag.  If 'mkwrite' is set insert_pfn()
> > will do the work that was previously done by wp_page_reuse() as part of the
> > dax_pfn_mkwrite() call path.
> > 
> > Signed-off-by: Ross Zwisler 
> 
> So I agree that getting rid of dax_pfn_mkwrite() and using fault handler in
> that case is a way to go. However I somewhat dislike the
> vm_insert_mixed_mkwrite() thing - it looks like a hack - and I'm aware that
> we have a similar thing for PMD which is ugly as well. Besides being ugly
> I'm also concerned that when 'mkwrite' is set, we just silently overwrite
> whatever PTE was installed at that position. Not that I'd see how that
> could screw us for DAX but still a concern that e.g. some PTE flag could
> get discarded by this is there... In fact, for !HAVE_PTE_SPECIAL
> architectures, you will leak zero page references by just overwriting the
> PTE - for those archs you really need to unmap zero page before replacing
> PTE (and the same for PMD I suppose).
> 
> So how about some vmf_insert_pfn(vmf, pe_size, pfn) helper that would
> properly detect PTE / PMD case, read / write case etc., check that PTE did
> not change from orig_pte, and handle all the nasty details instead of
> messing with insert_pfn?

I played around with this some today, and I wasn't super happy with the
results.  Here were some issues I encountered:

1) The pte_mkyoung(), maybe_mkwrite() and pte_mkdirty() calls need to happen
with the PTE locked, and I'm currently able to piggy-back on the locking done
in insert_pfn().  If I keep those steps out of insert_pfn() I either have to
essentially duplicate all the work done by insert_pfn() into another function
so I can do everything I need under one lock, or I have to insert the PFN via
insert_pfn() (which as you point out, will just leave the pfn alone if it's
already present), then for writes I have to re-grab the PTE lock and do
the mkwrite steps.

Either of these work, but they both also seem kind of gross...

2) Combining the PTE and PMD cases into a common function will require
mm/memory.c to call vmf_insert_pfn_pmd(), which depends on
CONFIG_TRANSPARENT_HUGEPAGE being defined.  This works, it just means some
more #ifdef CONFIG_TRANSPARENT_HUGEPAGE hackery in mm/memory.c.

I agree that unconditionally overwriting the PTE when mkwrite is set is
undesirable, and should be fixed.  My implementation of the wrapper just
didn't seem that natural, which usually tells me I'm headed down the wrong
path.  Maybe I'm just not fully understanding what you intended?

In any case, my current favorite solution for this issue is still what I had
in v1:

https://patchwork.kernel.org/patch/9772809/

with perhaps the removal of the new vm_insert_mixed_mkwrite() symbol, and just
adding a 'write' flag to vm_insert_mixed() and updating all the call sites,
and fixing the flow where mkwrite unconditionally overwrites the PTE?

If not, can you help me understand what you think is ugly about the 'write'
flag to vm_insert_mixed() and vmf_insert_pfn_pmd()?


[RFC v2 03/12] powerpc: Implement sys_pkey_alloc and sys_pkey_free system call.

2017-06-16 Thread Ram Pai
Sys_pkey_alloc() allocates and returns available pkey
Sys_pkey_free()  frees up the pkey.

Total 32 keys are supported on powerpc. However pkey 0,1 and 31
are reserved. So effectively we have 29 pkeys.

Signed-off-by: Ram Pai 
---
 arch/powerpc/Kconfig |  15 
 arch/powerpc/include/asm/book3s/64/mmu.h |  10 +++
 arch/powerpc/include/asm/book3s/64/pgtable.h |  62 ++
 arch/powerpc/include/asm/pkeys.h | 124 +++
 arch/powerpc/include/asm/systbl.h|   2 +
 arch/powerpc/include/asm/unistd.h|   4 +-
 arch/powerpc/include/uapi/asm/unistd.h   |   2 +
 arch/powerpc/mm/Makefile |   1 +
 arch/powerpc/mm/mmu_context_book3s64.c   |   5 ++
 arch/powerpc/mm/pkeys.c  |  88 +++
 include/linux/mm.h   |  31 ---
 include/uapi/asm-generic/mman-common.h   |   2 +-
 12 files changed, 331 insertions(+), 15 deletions(-)
 create mode 100644 arch/powerpc/include/asm/pkeys.h
 create mode 100644 arch/powerpc/mm/pkeys.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f7c8f99..b6960617 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -871,6 +871,21 @@ config SECCOMP
 
  If unsure, say Y. Only embedded should say N here.
 
+config PPC64_MEMORY_PROTECTION_KEYS
+   prompt "PowerPC Memory Protection Keys"
+   def_bool y
+   # Note: only available in 64-bit mode
+   depends on PPC64 && PPC_64K_PAGES
+   select ARCH_USES_HIGH_VMA_FLAGS
+   select ARCH_HAS_PKEYS
+   ---help---
+ Memory Protection Keys provides a mechanism for enforcing
+ page-based protections, but without requiring modification of the
+ page tables when an application changes protection domains.
+
+ For details, see Documentation/powerpc/protection-keys.txt
+
+ If unsure, say y.
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 77529a3..0c0a2a8 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -108,6 +108,16 @@ struct patb_entry {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
struct list_head iommu_group_mem_list;
 #endif
+
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   /*
+* Each bit represents one protection key.
+* bit set   -> key allocated
+* bit unset -> key available for allocation
+*/
+   u32 pkey_allocation_map;
+   s16 execute_only_pkey; /* key holding execute-only protection */
+#endif
 } mm_context_t;
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 85bc987..87e9a89 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -428,6 +428,68 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1);
 }
 
+
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+
+#include 
+static inline u64 read_amr(void)
+{
+   return mfspr(SPRN_AMR);
+}
+static inline void write_amr(u64 value)
+{
+   mtspr(SPRN_AMR, value);
+}
+static inline u64 read_iamr(void)
+{
+   return mfspr(SPRN_IAMR);
+}
+static inline void write_iamr(u64 value)
+{
+   mtspr(SPRN_IAMR, value);
+}
+static inline u64 read_uamor(void)
+{
+   return mfspr(SPRN_UAMOR);
+}
+static inline void write_uamor(u64 value)
+{
+   mtspr(SPRN_UAMOR, value);
+}
+
+#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+static inline u64 read_amr(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_amr(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+static inline u64 read_uamor(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_uamor(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+static inline u64 read_iamr(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_iamr(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
new file mode 100644
index 000..7bc8746
--- /dev/null
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -0,0 +1,124 @@
+#ifndef _ASM_PPC64_PKEYS_H
+#define _ASM_PPC64_PKEYS_H
+
+
+#define arch_max_pkey()  32
+
+#define 

[RFC v2 03/12] powerpc: Implement sys_pkey_alloc and sys_pkey_free system call.

2017-06-16 Thread Ram Pai
Sys_pkey_alloc() allocates and returns available pkey
Sys_pkey_free()  frees up the pkey.

Total 32 keys are supported on powerpc. However pkey 0,1 and 31
are reserved. So effectively we have 29 pkeys.

Signed-off-by: Ram Pai 
---
 arch/powerpc/Kconfig |  15 
 arch/powerpc/include/asm/book3s/64/mmu.h |  10 +++
 arch/powerpc/include/asm/book3s/64/pgtable.h |  62 ++
 arch/powerpc/include/asm/pkeys.h | 124 +++
 arch/powerpc/include/asm/systbl.h|   2 +
 arch/powerpc/include/asm/unistd.h|   4 +-
 arch/powerpc/include/uapi/asm/unistd.h   |   2 +
 arch/powerpc/mm/Makefile |   1 +
 arch/powerpc/mm/mmu_context_book3s64.c   |   5 ++
 arch/powerpc/mm/pkeys.c  |  88 +++
 include/linux/mm.h   |  31 ---
 include/uapi/asm-generic/mman-common.h   |   2 +-
 12 files changed, 331 insertions(+), 15 deletions(-)
 create mode 100644 arch/powerpc/include/asm/pkeys.h
 create mode 100644 arch/powerpc/mm/pkeys.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f7c8f99..b6960617 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -871,6 +871,21 @@ config SECCOMP
 
  If unsure, say Y. Only embedded should say N here.
 
+config PPC64_MEMORY_PROTECTION_KEYS
+   prompt "PowerPC Memory Protection Keys"
+   def_bool y
+   # Note: only available in 64-bit mode
+   depends on PPC64 && PPC_64K_PAGES
+   select ARCH_USES_HIGH_VMA_FLAGS
+   select ARCH_HAS_PKEYS
+   ---help---
+ Memory Protection Keys provides a mechanism for enforcing
+ page-based protections, but without requiring modification of the
+ page tables when an application changes protection domains.
+
+ For details, see Documentation/powerpc/protection-keys.txt
+
+ If unsure, say y.
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 77529a3..0c0a2a8 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -108,6 +108,16 @@ struct patb_entry {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
struct list_head iommu_group_mem_list;
 #endif
+
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   /*
+* Each bit represents one protection key.
+* bit set   -> key allocated
+* bit unset -> key available for allocation
+*/
+   u32 pkey_allocation_map;
+   s16 execute_only_pkey; /* key holding execute-only protection */
+#endif
 } mm_context_t;
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 85bc987..87e9a89 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -428,6 +428,68 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1);
 }
 
+
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+
+#include 
+static inline u64 read_amr(void)
+{
+   return mfspr(SPRN_AMR);
+}
+static inline void write_amr(u64 value)
+{
+   mtspr(SPRN_AMR, value);
+}
+static inline u64 read_iamr(void)
+{
+   return mfspr(SPRN_IAMR);
+}
+static inline void write_iamr(u64 value)
+{
+   mtspr(SPRN_IAMR, value);
+}
+static inline u64 read_uamor(void)
+{
+   return mfspr(SPRN_UAMOR);
+}
+static inline void write_uamor(u64 value)
+{
+   mtspr(SPRN_UAMOR, value);
+}
+
+#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+static inline u64 read_amr(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_amr(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+static inline u64 read_uamor(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_uamor(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+static inline u64 read_iamr(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_iamr(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
new file mode 100644
index 000..7bc8746
--- /dev/null
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -0,0 +1,124 @@
+#ifndef _ASM_PPC64_PKEYS_H
+#define _ASM_PPC64_PKEYS_H
+
+
+#define arch_max_pkey()  32
+
+#define AMR_AD_BIT 0x1UL
+#define 

[RFC v2 02/12] powerpc: Free up four 64K PTE bits in 64K backed hpte pages.

2017-06-16 Thread Ram Pai
Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
in the 64K backed hpte pages. This along with the earlier
patch will entirely free up the four bits from 64K PTE.

This patch does the following change to 64K PTE that is
backed by 64K hpte.

H_PAGE_F_SECOND which occupied bit 4 moves to the second part
of the pte.
H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
second part of the pte.

since bit 7 is now freed up, we move H_PAGE_BUSY from bit 9
to bit 7. Trying to minimize gaps so that contiguous bits
can be allocated if needed in the future.

The second part of the PTE will hold
(H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 26 --
 arch/powerpc/mm/hash64_64k.c  | 16 +++-
 arch/powerpc/mm/hugetlbpage-hash64.c  | 16 ++--
 3 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 0eb3c89..2fa5c60 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -12,12 +12,8 @@
  */
 #define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
 #define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
-#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
-#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
-#define H_PAGE_F_GIX_SHIFT 56
 
-
-#define H_PAGE_BUSY_RPAGE_RPN42 /* software: PTE & hash are busy */
+#define H_PAGE_BUSY_RPAGE_RPN44 /* software: PTE & hash are busy */
 #define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
 
 /*
@@ -56,24 +52,18 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
unsigned long *hidxp;
 
rpte.pte = pte;
-   rpte.hidx = 0;
-   if (pte_val(pte) & H_PAGE_COMBO) {
-   /*
-* Make sure we order the hidx load against the H_PAGE_COMBO
-* check. The store side ordering is done in __hash_page_4K
-*/
-   smp_rmb();
-   hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
-   rpte.hidx = *hidxp;
-   }
+   /*
+* The store side ordering is done in __hash_page_4K
+*/
+   smp_rmb();
+   hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+   rpte.hidx = *hidxp;
return rpte;
 }
 
 static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long 
index)
 {
-   if ((pte_val(rpte.pte) & H_PAGE_COMBO))
-   return (rpte.hidx >> (index<<2)) & 0xf;
-   return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
+   return ((rpte.hidx >> (index<<2)) & 0xfUL);
 }
 
 static inline unsigned long set_hidx_slot(pte_t *ptep, real_pte_t rpte,
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 3702a3c..1c25ec2 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -211,6 +211,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
unsigned long vsid, pte_t *ptep, unsigned long trap,
unsigned long flags, int ssize)
 {
+   real_pte_t rpte;
unsigned long hpte_group;
unsigned long rflags, pa;
unsigned long old_pte, new_pte;
@@ -247,6 +248,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
rflags = htab_convert_pte_flags(new_pte);
+   rpte = __real_pte(__pte(old_pte), ptep);
 
if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -254,16 +256,13 @@ int __hash_page_64K(unsigned long ea, unsigned long 
access,
 
vpn  = hpt_vpn(ea, vsid, ssize);
if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+   unsigned long gslot;
+
/*
 * There MIGHT be an HPTE for this pte
 */
-   hash = hpt_hash(vpn, shift, ssize);
-   if (old_pte & H_PAGE_F_SECOND)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
-
-   if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
+   gslot = get_hidx_gslot(vpn, shift, ssize, rpte, 0);
+   if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
   MMU_PAGE_64K, ssize,
   flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
@@ -313,8 +312,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
return -1;
}
 
-   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &

[RFC v2 02/12] powerpc: Free up four 64K PTE bits in 64K backed hpte pages.

2017-06-16 Thread Ram Pai
Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
in the 64K backed hpte pages. This along with the earlier
patch will entirely free up the four bits from 64K PTE.

This patch does the following change to 64K PTE that is
backed by 64K hpte.

H_PAGE_F_SECOND which occupied bit 4 moves to the second part
of the pte.
H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
second part of the pte.

Since bit 7 is now freed up, we move H_PAGE_BUSY from bit 9
to bit 7. Trying to minimize gaps so that contiguous bits
can be allocated if needed in the future.

The second part of the PTE will hold
(H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 26 --
 arch/powerpc/mm/hash64_64k.c  | 16 +++-
 arch/powerpc/mm/hugetlbpage-hash64.c  | 16 ++--
 3 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 0eb3c89..2fa5c60 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -12,12 +12,8 @@
  */
 #define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
 #define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
-#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
-#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
-#define H_PAGE_F_GIX_SHIFT 56
 
-
-#define H_PAGE_BUSY_RPAGE_RPN42 /* software: PTE & hash are busy */
+#define H_PAGE_BUSY_RPAGE_RPN44 /* software: PTE & hash are busy */
 #define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
 
 /*
@@ -56,24 +52,18 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
unsigned long *hidxp;
 
rpte.pte = pte;
-   rpte.hidx = 0;
-   if (pte_val(pte) & H_PAGE_COMBO) {
-   /*
-* Make sure we order the hidx load against the H_PAGE_COMBO
-* check. The store side ordering is done in __hash_page_4K
-*/
-   smp_rmb();
-   hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
-   rpte.hidx = *hidxp;
-   }
+   /*
+* The store side ordering is done in __hash_page_4K
+*/
+   smp_rmb();
+   hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+   rpte.hidx = *hidxp;
return rpte;
 }
 
 static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long 
index)
 {
-   if ((pte_val(rpte.pte) & H_PAGE_COMBO))
-   return (rpte.hidx >> (index<<2)) & 0xf;
-   return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
+   return ((rpte.hidx >> (index<<2)) & 0xfUL);
 }
 
 static inline unsigned long set_hidx_slot(pte_t *ptep, real_pte_t rpte,
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 3702a3c..1c25ec2 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -211,6 +211,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
unsigned long vsid, pte_t *ptep, unsigned long trap,
unsigned long flags, int ssize)
 {
+   real_pte_t rpte;
unsigned long hpte_group;
unsigned long rflags, pa;
unsigned long old_pte, new_pte;
@@ -247,6 +248,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
rflags = htab_convert_pte_flags(new_pte);
+   rpte = __real_pte(__pte(old_pte), ptep);
 
if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -254,16 +256,13 @@ int __hash_page_64K(unsigned long ea, unsigned long 
access,
 
vpn  = hpt_vpn(ea, vsid, ssize);
if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+   unsigned long gslot;
+
/*
 * There MIGHT be an HPTE for this pte
 */
-   hash = hpt_hash(vpn, shift, ssize);
-   if (old_pte & H_PAGE_F_SECOND)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
-
-   if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
+   gslot = get_hidx_gslot(vpn, shift, ssize, rpte, 0);
+   if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
   MMU_PAGE_64K, ssize,
   flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
@@ -313,8 +312,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
return -1;
}
 
-   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
-

[RFC v2 06/12] powerpc: Program HPTE key protection bits.

2017-06-16 Thread Ram Pai
Map the PTE protection key bits to the HPTE key protection bits,
while creating HPTE entries.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 5 +
 arch/powerpc/include/asm/pkeys.h  | 7 +++
 arch/powerpc/mm/hash_utils_64.c   | 5 +
 3 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index cfb8169..3d7872c 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -90,6 +90,8 @@
 #define HPTE_R_PP0 ASM_CONST(0x8000)
 #define HPTE_R_TS  ASM_CONST(0x4000)
 #define HPTE_R_KEY_HI  ASM_CONST(0x3000)
+#define HPTE_R_KEY_BIT0ASM_CONST(0x2000)
+#define HPTE_R_KEY_BIT1ASM_CONST(0x1000)
 #define HPTE_R_RPN_SHIFT   12
 #define HPTE_R_RPN ASM_CONST(0x0000)
 #define HPTE_R_RPN_3_0 ASM_CONST(0x01fff000)
@@ -104,6 +106,9 @@
 #define HPTE_R_C   ASM_CONST(0x0080)
 #define HPTE_R_R   ASM_CONST(0x0100)
 #define HPTE_R_KEY_LO  ASM_CONST(0x0e00)
+#define HPTE_R_KEY_BIT2ASM_CONST(0x0800)
+#define HPTE_R_KEY_BIT3ASM_CONST(0x0400)
+#define HPTE_R_KEY_BIT4ASM_CONST(0x0200)
 
 #define HPTE_V_1TB_SEG ASM_CONST(0x4000)
 #define HPTE_V_VRMA_MASK   ASM_CONST(0x4001ff00)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 0f3dca8..9b6820d 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -27,6 +27,13 @@
((vm_flags & VM_PKEY_BIT3) ? H_PAGE_PKEY_BIT1 : 0x0UL) | \
((vm_flags & VM_PKEY_BIT4) ? H_PAGE_PKEY_BIT0 : 0x0UL))
 
+#define calc_pte_to_hpte_pkey_bits(pteflags)   \
+   (((pteflags & H_PAGE_PKEY_BIT0) ? HPTE_R_KEY_BIT0 : 0x0UL) |\
+   ((pteflags & H_PAGE_PKEY_BIT1) ? HPTE_R_KEY_BIT1 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT2) ? HPTE_R_KEY_BIT2 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT3) ? HPTE_R_KEY_BIT3 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL))
+
 /*
  * Bits are in BE format.
  * NOTE: key 31, 1, 0 are not used.
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c0f4b46..7d974cd 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -230,6 +231,10 @@ unsigned long htab_convert_pte_flags(unsigned long 
pteflags)
 */
rflags |= HPTE_R_M;
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   rflags |= calc_pte_to_hpte_pkey_bits(pteflags);
+#endif
+
return rflags;
 }
 
-- 
1.8.3.1



[RFC v2 06/12] powerpc: Program HPTE key protection bits.

2017-06-16 Thread Ram Pai
Map the PTE protection key bits to the HPTE key protection bits,
while creating HPTE entries.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 5 +
 arch/powerpc/include/asm/pkeys.h  | 7 +++
 arch/powerpc/mm/hash_utils_64.c   | 5 +
 3 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index cfb8169..3d7872c 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -90,6 +90,8 @@
 #define HPTE_R_PP0 ASM_CONST(0x8000)
 #define HPTE_R_TS  ASM_CONST(0x4000)
 #define HPTE_R_KEY_HI  ASM_CONST(0x3000)
+#define HPTE_R_KEY_BIT0ASM_CONST(0x2000)
+#define HPTE_R_KEY_BIT1ASM_CONST(0x1000)
 #define HPTE_R_RPN_SHIFT   12
 #define HPTE_R_RPN ASM_CONST(0x0000)
 #define HPTE_R_RPN_3_0 ASM_CONST(0x01fff000)
@@ -104,6 +106,9 @@
 #define HPTE_R_C   ASM_CONST(0x0080)
 #define HPTE_R_R   ASM_CONST(0x0100)
 #define HPTE_R_KEY_LO  ASM_CONST(0x0e00)
+#define HPTE_R_KEY_BIT2ASM_CONST(0x0800)
+#define HPTE_R_KEY_BIT3ASM_CONST(0x0400)
+#define HPTE_R_KEY_BIT4ASM_CONST(0x0200)
 
 #define HPTE_V_1TB_SEG ASM_CONST(0x4000)
 #define HPTE_V_VRMA_MASK   ASM_CONST(0x4001ff00)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 0f3dca8..9b6820d 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -27,6 +27,13 @@
((vm_flags & VM_PKEY_BIT3) ? H_PAGE_PKEY_BIT1 : 0x0UL) | \
((vm_flags & VM_PKEY_BIT4) ? H_PAGE_PKEY_BIT0 : 0x0UL))
 
+#define calc_pte_to_hpte_pkey_bits(pteflags)   \
+   (((pteflags & H_PAGE_PKEY_BIT0) ? HPTE_R_KEY_BIT0 : 0x0UL) |\
+   ((pteflags & H_PAGE_PKEY_BIT1) ? HPTE_R_KEY_BIT1 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT2) ? HPTE_R_KEY_BIT2 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT3) ? HPTE_R_KEY_BIT3 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL))
+
 /*
  * Bits are in BE format.
  * NOTE: key 31, 1, 0 are not used.
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c0f4b46..7d974cd 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -230,6 +231,10 @@ unsigned long htab_convert_pte_flags(unsigned long 
pteflags)
 */
rflags |= HPTE_R_M;
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   rflags |= calc_pte_to_hpte_pkey_bits(pteflags);
+#endif
+
return rflags;
 }
 
-- 
1.8.3.1



[RFC v2 04/12] powerpc: store and restore the pkey state across context switches.

2017-06-16 Thread Ram Pai
Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/processor.h |  5 +
 arch/powerpc/kernel/process.c| 18 ++
 2 files changed, 23 insertions(+)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index a2123f2..1f714df 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -310,6 +310,11 @@ struct thread_struct {
struct thread_vr_state ckvr_state; /* Checkpointed VR state */
unsigned long   ckvrsave; /* Checkpointed VRSAVE */
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   unsigned long   amr;
+   unsigned long   iamr;
+   unsigned long   uamor;
+#endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
void*   kvm_shadow_vcpu; /* KVM internal data */
 #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index baae104..37d001a 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1096,6 +1096,11 @@ static inline void save_sprs(struct thread_struct *t)
t->tar = mfspr(SPRN_TAR);
}
 #endif
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   t->amr = mfspr(SPRN_AMR);
+   t->iamr = mfspr(SPRN_IAMR);
+   t->uamor = mfspr(SPRN_UAMOR);
+#endif
 }
 
 static inline void restore_sprs(struct thread_struct *old_thread,
@@ -1131,6 +1136,14 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
mtspr(SPRN_TAR, new_thread->tar);
}
 #endif
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (old_thread->amr != new_thread->amr)
+   mtspr(SPRN_AMR, new_thread->amr);
+   if (old_thread->iamr != new_thread->iamr)
+   mtspr(SPRN_IAMR, new_thread->iamr);
+   if (old_thread->uamor != new_thread->uamor)
+   mtspr(SPRN_UAMOR, new_thread->uamor);
+#endif
 }
 
 struct task_struct *__switch_to(struct task_struct *prev,
@@ -1686,6 +1699,11 @@ void start_thread(struct pt_regs *regs, unsigned long 
start, unsigned long sp)
current->thread.tm_texasr = 0;
current->thread.tm_tfiar = 0;
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   current->thread.amr   = 0x0ul;
+   current->thread.iamr  = 0x0ul;
+   current->thread.uamor = 0x0ul;
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
 }
 EXPORT_SYMBOL(start_thread);
 
-- 
1.8.3.1



[RFC v2 04/12] powerpc: store and restore the pkey state across context switches.

2017-06-16 Thread Ram Pai
Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/processor.h |  5 +
 arch/powerpc/kernel/process.c| 18 ++
 2 files changed, 23 insertions(+)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index a2123f2..1f714df 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -310,6 +310,11 @@ struct thread_struct {
struct thread_vr_state ckvr_state; /* Checkpointed VR state */
unsigned long   ckvrsave; /* Checkpointed VRSAVE */
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   unsigned long   amr;
+   unsigned long   iamr;
+   unsigned long   uamor;
+#endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
void*   kvm_shadow_vcpu; /* KVM internal data */
 #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index baae104..37d001a 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1096,6 +1096,11 @@ static inline void save_sprs(struct thread_struct *t)
t->tar = mfspr(SPRN_TAR);
}
 #endif
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   t->amr = mfspr(SPRN_AMR);
+   t->iamr = mfspr(SPRN_IAMR);
+   t->uamor = mfspr(SPRN_UAMOR);
+#endif
 }
 
 static inline void restore_sprs(struct thread_struct *old_thread,
@@ -1131,6 +1136,14 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
mtspr(SPRN_TAR, new_thread->tar);
}
 #endif
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (old_thread->amr != new_thread->amr)
+   mtspr(SPRN_AMR, new_thread->amr);
+   if (old_thread->iamr != new_thread->iamr)
+   mtspr(SPRN_IAMR, new_thread->iamr);
+   if (old_thread->uamor != new_thread->uamor)
+   mtspr(SPRN_UAMOR, new_thread->uamor);
+#endif
 }
 
 struct task_struct *__switch_to(struct task_struct *prev,
@@ -1686,6 +1699,11 @@ void start_thread(struct pt_regs *regs, unsigned long 
start, unsigned long sp)
current->thread.tm_texasr = 0;
current->thread.tm_tfiar = 0;
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   current->thread.amr   = 0x0ul;
+   current->thread.iamr  = 0x0ul;
+   current->thread.uamor = 0x0ul;
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
 }
 EXPORT_SYMBOL(start_thread);
 
-- 
1.8.3.1



[RFC v2 09/12] powerpc: Deliver SEGV signal on pkey violation.

2017-06-16 Thread Ram Pai
The value of the AMR register at the time of exception
is made available in gp_regs[PT_AMR] of the siginfo.

This field can be used to reprogram the permission bits of
any valid pkey.

Similarly the value of the pkey, whose protection got violated,
is made available at si_pkey field of the siginfo structure.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/paca.h|  1 +
 arch/powerpc/include/uapi/asm/ptrace.h |  3 ++-
 arch/powerpc/kernel/asm-offsets.c  |  5 
 arch/powerpc/kernel/exceptions-64s.S   |  8 ++
 arch/powerpc/kernel/signal_32.c| 14 ++
 arch/powerpc/kernel/signal_64.c| 14 ++
 arch/powerpc/kernel/traps.c| 49 ++
 arch/powerpc/mm/fault.c|  4 +++
 8 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 1c09f8f..a41afd3 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -92,6 +92,7 @@ struct paca_struct {
struct dtl_entry *dispatch_log_end;
 #endif /* CONFIG_PPC_STD_MMU_64 */
u64 dscr_default;   /* per-CPU default DSCR */
+   u64 paca_amr;   /* value of amr at exception */
 
 #ifdef CONFIG_PPC_STD_MMU_64
/*
diff --git a/arch/powerpc/include/uapi/asm/ptrace.h 
b/arch/powerpc/include/uapi/asm/ptrace.h
index 8036b38..7ec2428 100644
--- a/arch/powerpc/include/uapi/asm/ptrace.h
+++ b/arch/powerpc/include/uapi/asm/ptrace.h
@@ -108,8 +108,9 @@ struct pt_regs {
 #define PT_DAR 41
 #define PT_DSISR 42
 #define PT_RESULT 43
-#define PT_DSCR 44
 #define PT_REGS_COUNT 44
+#define PT_DSCR 44
+#define PT_AMR 45
 
 #define PT_FPR048  /* each FP reg occupies 2 slots in this space */
 
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 709e234..17f5d8a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -241,6 +241,11 @@ int main(void)
OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
OFFSET(PACA_DSCR_DEFAULT, paca_struct, dscr_default);
+
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   OFFSET(PACA_AMR, paca_struct, paca_amr);
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
OFFSET(ACCOUNT_STARTTIME, paca_struct, accounting.starttime);
OFFSET(ACCOUNT_STARTTIME_USER, paca_struct, accounting.starttime_user);
OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 3fd0528..8db9ef8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -493,6 +493,10 @@ EXC_COMMON_BEGIN(data_access_common)
ld  r12,_MSR(r1)
ld  r3,PACA_EXGEN+EX_DAR(r13)
lwz r4,PACA_EXGEN+EX_DSISR(r13)
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   mfspr   r5,SPRN_AMR
+   std r5,PACA_AMR(r13)
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
li  r5,0x300
std r3,_DAR(r1)
std r4,_DSISR(r1)
@@ -561,6 +565,10 @@ EXC_COMMON_BEGIN(instruction_access_common)
ld  r12,_MSR(r1)
ld  r3,_NIP(r1)
andis.  r4,r12,0x5820
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   mfspr   r5,SPRN_AMR
+   std r5,PACA_AMR(r13)
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
li  r5,0x400
std r3,_DAR(r1)
std r4,_DSISR(r1)
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 97bb138..059766a 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -500,6 +500,11 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
   (unsigned long) >tramp[2]);
}
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (__put_user(get_paca()->paca_amr, >mc_gregs[PT_AMR]))
+   return 1;
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
return 0;
 }
 
@@ -661,6 +666,9 @@ static long restore_user_regs(struct pt_regs *regs,
long err;
unsigned int save_r2 = 0;
unsigned long msr;
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   unsigned long amr;
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
 #ifdef CONFIG_VSX
int i;
 #endif
@@ -750,6 +758,12 @@ static long restore_user_regs(struct pt_regs *regs,
return 1;
 #endif /* CONFIG_SPE */
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   err |= __get_user(amr, >mc_gregs[PT_AMR]);
+   if (!err && amr != get_paca()->paca_amr)
+   write_amr(amr);
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
return 0;
 }
 
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index c83c115..35df2e4 100644
--- 

[RFC v2 09/12] powerpc: Deliver SEGV signal on pkey violation.

2017-06-16 Thread Ram Pai
The value of the AMR register at the time of exception
is made available in gp_regs[PT_AMR] of the siginfo.

This field can be used to reprogram the permission bits of
any valid pkey.

Similarly the value of the pkey, whose protection got violated,
is made available at si_pkey field of the siginfo structure.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/paca.h|  1 +
 arch/powerpc/include/uapi/asm/ptrace.h |  3 ++-
 arch/powerpc/kernel/asm-offsets.c  |  5 
 arch/powerpc/kernel/exceptions-64s.S   |  8 ++
 arch/powerpc/kernel/signal_32.c| 14 ++
 arch/powerpc/kernel/signal_64.c| 14 ++
 arch/powerpc/kernel/traps.c| 49 ++
 arch/powerpc/mm/fault.c|  4 +++
 8 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 1c09f8f..a41afd3 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -92,6 +92,7 @@ struct paca_struct {
struct dtl_entry *dispatch_log_end;
 #endif /* CONFIG_PPC_STD_MMU_64 */
u64 dscr_default;   /* per-CPU default DSCR */
+   u64 paca_amr;   /* value of amr at exception */
 
 #ifdef CONFIG_PPC_STD_MMU_64
/*
diff --git a/arch/powerpc/include/uapi/asm/ptrace.h 
b/arch/powerpc/include/uapi/asm/ptrace.h
index 8036b38..7ec2428 100644
--- a/arch/powerpc/include/uapi/asm/ptrace.h
+++ b/arch/powerpc/include/uapi/asm/ptrace.h
@@ -108,8 +108,9 @@ struct pt_regs {
 #define PT_DAR 41
 #define PT_DSISR 42
 #define PT_RESULT 43
-#define PT_DSCR 44
 #define PT_REGS_COUNT 44
+#define PT_DSCR 44
+#define PT_AMR 45
 
 #define PT_FPR048  /* each FP reg occupies 2 slots in this space */
 
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 709e234..17f5d8a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -241,6 +241,11 @@ int main(void)
OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
OFFSET(PACA_DSCR_DEFAULT, paca_struct, dscr_default);
+
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   OFFSET(PACA_AMR, paca_struct, paca_amr);
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
OFFSET(ACCOUNT_STARTTIME, paca_struct, accounting.starttime);
OFFSET(ACCOUNT_STARTTIME_USER, paca_struct, accounting.starttime_user);
OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 3fd0528..8db9ef8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -493,6 +493,10 @@ EXC_COMMON_BEGIN(data_access_common)
ld  r12,_MSR(r1)
ld  r3,PACA_EXGEN+EX_DAR(r13)
lwz r4,PACA_EXGEN+EX_DSISR(r13)
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   mfspr   r5,SPRN_AMR
+   std r5,PACA_AMR(r13)
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
li  r5,0x300
std r3,_DAR(r1)
std r4,_DSISR(r1)
@@ -561,6 +565,10 @@ EXC_COMMON_BEGIN(instruction_access_common)
ld  r12,_MSR(r1)
ld  r3,_NIP(r1)
andis.  r4,r12,0x5820
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   mfspr   r5,SPRN_AMR
+   std r5,PACA_AMR(r13)
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
li  r5,0x400
std r3,_DAR(r1)
std r4,_DSISR(r1)
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 97bb138..059766a 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -500,6 +500,11 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
   (unsigned long) >tramp[2]);
}
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (__put_user(get_paca()->paca_amr, >mc_gregs[PT_AMR]))
+   return 1;
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
return 0;
 }
 
@@ -661,6 +666,9 @@ static long restore_user_regs(struct pt_regs *regs,
long err;
unsigned int save_r2 = 0;
unsigned long msr;
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   unsigned long amr;
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
 #ifdef CONFIG_VSX
int i;
 #endif
@@ -750,6 +758,12 @@ static long restore_user_regs(struct pt_regs *regs,
return 1;
 #endif /* CONFIG_SPE */
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   err |= __get_user(amr, >mc_gregs[PT_AMR]);
+   if (!err && amr != get_paca()->paca_amr)
+   write_amr(amr);
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
return 0;
 }
 
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index c83c115..35df2e4 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ 

[RFC v2 07/12] powerpc: Macro the mask used for checking DSI exception

2017-06-16 Thread Ram Pai
Replace the magic number used to check for DSI exception
with a meaningful value.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/reg.h   | 9 -
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 7e50e47..2dcb8a1 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -272,16 +272,23 @@
 #define SPRN_DAR   0x013   /* Data Address Register */
 #define SPRN_DBCR  0x136   /* e300 Data Breakpoint Control Reg */
 #define SPRN_DSISR 0x012   /* Data Storage Interrupt Status Register */
+#define   DSISR_BIT32  0x8000  /* not defined */
 #define   DSISR_NOHPTE 0x4000  /* no translation found */
+#define   DSISR_PAGEATTR_CONFLT0x2000  /* page attribute 
conflict */
+#define   DSISR_BIT35  0x1000  /* not defined */
 #define   DSISR_PROTFAULT  0x0800  /* protection fault */
 #define   DSISR_BADACCESS  0x0400  /* bad access to CI or G */
 #define   DSISR_ISSTORE0x0200  /* access was a store */
 #define   DSISR_DABRMATCH  0x0040  /* hit data breakpoint */
-#define   DSISR_NOSEGMENT  0x0020  /* SLB miss */
 #define   DSISR_KEYFAULT   0x0020  /* Key fault */
+#define   DSISR_BIT43  0x0010  /* not defined */
 #define   DSISR_UNSUPP_MMU 0x0008  /* Unsupported MMU config */
 #define   DSISR_SET_RC 0x0004  /* Failed setting of R/C bits */
 #define   DSISR_PGDIRFAULT  0x0002  /* Fault on page directory */
+#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
+   DSISR_PAGEATTR_CONFLT | \
+   DSISR_BADACCESS |   \
+   DSISR_BIT43)
 #define SPRN_TBRL  0x10C   /* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU  0x10D   /* Time Base Read Upper Register (user, R/O) */
 #define SPRN_CIR   0x11B   /* Chip Information Register (hyper, R/0) */
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index ae418b8..3fd0528 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1411,7 +1411,7 @@ USE_TEXT_SECTION()
.balign IFETCH_ALIGN_BYTES
 do_hash_page:
 #ifdef CONFIG_PPC_STD_MMU_64
-   andis.  r0,r4,0xa410/* weird error? */
+   andis.  r0,r4,DSISR_PAGE_FAULT_MASK@h
bne-handle_page_fault   /* if not, try to insert a HPTE */
andis.  r0,r4,DSISR_DABRMATCH@h
bne-handle_dabr_fault
-- 
1.8.3.1



[RFC v2 07/12] powerpc: Macro the mask used for checking DSI exception

2017-06-16 Thread Ram Pai
Replace the magic number used to check for DSI exception
with a meaningful value.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/reg.h   | 9 -
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 7e50e47..2dcb8a1 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -272,16 +272,23 @@
 #define SPRN_DAR   0x013   /* Data Address Register */
 #define SPRN_DBCR  0x136   /* e300 Data Breakpoint Control Reg */
 #define SPRN_DSISR 0x012   /* Data Storage Interrupt Status Register */
+#define   DSISR_BIT32  0x8000  /* not defined */
 #define   DSISR_NOHPTE 0x4000  /* no translation found */
+#define   DSISR_PAGEATTR_CONFLT0x2000  /* page attribute 
conflict */
+#define   DSISR_BIT35  0x1000  /* not defined */
 #define   DSISR_PROTFAULT  0x0800  /* protection fault */
 #define   DSISR_BADACCESS  0x0400  /* bad access to CI or G */
 #define   DSISR_ISSTORE0x0200  /* access was a store */
 #define   DSISR_DABRMATCH  0x0040  /* hit data breakpoint */
-#define   DSISR_NOSEGMENT  0x0020  /* SLB miss */
 #define   DSISR_KEYFAULT   0x0020  /* Key fault */
+#define   DSISR_BIT43  0x0010  /* not defined */
 #define   DSISR_UNSUPP_MMU 0x0008  /* Unsupported MMU config */
 #define   DSISR_SET_RC 0x0004  /* Failed setting of R/C bits */
 #define   DSISR_PGDIRFAULT  0x0002  /* Fault on page directory */
+#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
+   DSISR_PAGEATTR_CONFLT | \
+   DSISR_BADACCESS |   \
+   DSISR_BIT43)
 #define SPRN_TBRL  0x10C   /* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU  0x10D   /* Time Base Read Upper Register (user, R/O) */
 #define SPRN_CIR   0x11B   /* Chip Information Register (hyper, R/0) */
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index ae418b8..3fd0528 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1411,7 +1411,7 @@ USE_TEXT_SECTION()
.balign IFETCH_ALIGN_BYTES
 do_hash_page:
 #ifdef CONFIG_PPC_STD_MMU_64
-   andis.  r0,r4,0xa410/* weird error? */
+   andis.  r0,r4,DSISR_PAGE_FAULT_MASK@h
bne-handle_page_fault   /* if not, try to insert a HPTE */
andis.  r0,r4,DSISR_DABRMATCH@h
bne-handle_dabr_fault
-- 
1.8.3.1



[RFC v2 12/12] selftest: Updated protection key selftest

2017-06-16 Thread Ram Pai
Added test support for the PowerPC implementation of protection keys.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/Makefile   |1 +
 tools/testing/selftests/vm/pkey-helpers.h |  365 +++
 tools/testing/selftests/vm/protection_keys.c  | 1451 +
 tools/testing/selftests/x86/Makefile  |2 +-
 tools/testing/selftests/x86/pkey-helpers.h|  219 
 tools/testing/selftests/x86/protection_keys.c | 1395 
 6 files changed, 1818 insertions(+), 1615 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 tools/testing/selftests/vm/protection_keys.c
 delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h
 delete mode 100644 tools/testing/selftests/x86/protection_keys.c

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index cbb29e4..1d32f78 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -17,6 +17,7 @@ TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
 TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += protection_keys
 
 TEST_PROGS := run_vmtests
 
diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
new file mode 100644
index 000..5fec0a2
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -0,0 +1,365 @@
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* Define some kernel-like types */
+#define  u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#ifdef __i386__ /* arch */
+
+#define SYS_mprotect_key 380
+#define SYS_pkey_alloc  381
+#define SYS_pkey_free   382
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x14
+
+#define NR_PKEYS   16
+#define NR_RESERVED_PKEYS  1
+#define PKRU_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x1
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<21)
+
+#define INIT_PRKU 0x0UL
+
+#elif __powerpc64__ /* arch */
+
+#define SYS_mprotect_key 386
+#define SYS_pkey_alloc  384
+#define SYS_pkey_free   385
+#define si_pkey_offset 0x20
+#define REG_IP_IDX PT_NIP
+#define REG_TRAPNO PT_TRAP
+#define REG_AMR45
+#define gregs gp_regs
+#define fpregs fp_regs
+
+#define NR_PKEYS   32
+#define NR_RESERVED_PKEYS  3
+#define PKRU_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x3  /* disable read and write */
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<24)
+
+#define INIT_PRKU 0x3UL
+#else /* arch */
+
+   NOT SUPPORTED
+
+#endif /* arch */
+
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+
+
+static inline u32 pkey_to_shift(int pkey)
+{
+#ifdef __i386__
+   return pkey * PKRU_BITS_PER_PKEY;
+#elif __powerpc64__
+   return (NR_PKEYS - pkey - 1) * PKRU_BITS_PER_PKEY;
+#endif
+}
+
+
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+static inline void sigsafe_printf(const char *format, ...)
+{
+   va_list ap;
+
+   va_start(ap, format);
+   if (!dprint_in_signal) {
+   vprintf(format, ap);
+   } else {
+   int len = vsnprintf(dprint_in_signal_buffer,
+   DPRINT_IN_SIGNAL_BUF_SIZE,
+   format, ap);
+   /*
+* len is amount that would have been printed,
+* but actual write is truncated at BUF_SIZE.
+*/
+   if (len > DPRINT_IN_SIGNAL_BUF_SIZE)
+   len = DPRINT_IN_SIGNAL_BUF_SIZE;
+   write(1, dprint_in_signal_buffer, len);
+   }
+   va_end(ap);
+}
+#define dprintf_level(level, args...) do { \
+   if (level <= DEBUG_LEVEL)   \
+   sigsafe_printf(args);   \
+   fflush(NULL);   \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern u64 shadow_pkey_reg;
+
+static inline u64 __rdpkey_reg(void)
+{
+#ifdef __i386__
+   unsigned int eax, edx;
+   unsigned int ecx = 0;
+   unsigned int pkey_reg;
+
+   asm volatile(".byte 0x0f,0x01,0xee\n\t"
+: "=a" (eax), "=d" (edx)
+: "c" (ecx));
+#elif __powerpc64__
+   u64 eax;
+   u64 pkey_reg;
+
+   asm volatile("mfspr %0, 0xd" : "=r" ((u64)(eax)));
+#endif
+   pkey_reg = (u64)eax;
+   return pkey_reg;
+}
+
+static inline u64 _rdpkey_reg(int line)
+{
+   u64 pkey_reg 

[RFC v2 12/12]selftest: Updated protection key selftest

2017-06-16 Thread Ram Pai
Added test support for the PowerPC implementation of protection keys.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/Makefile   |1 +
 tools/testing/selftests/vm/pkey-helpers.h |  365 +++
 tools/testing/selftests/vm/protection_keys.c  | 1451 +
 tools/testing/selftests/x86/Makefile  |2 +-
 tools/testing/selftests/x86/pkey-helpers.h|  219 
 tools/testing/selftests/x86/protection_keys.c | 1395 
 6 files changed, 1818 insertions(+), 1615 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 tools/testing/selftests/vm/protection_keys.c
 delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h
 delete mode 100644 tools/testing/selftests/x86/protection_keys.c

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index cbb29e4..1d32f78 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -17,6 +17,7 @@ TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
 TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += protection_keys
 
 TEST_PROGS := run_vmtests
 
diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
new file mode 100644
index 000..5fec0a2
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -0,0 +1,365 @@
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* Define some kernel-like types */
+#define  u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#ifdef __i386__ /* arch */
+
+#define SYS_mprotect_key 380
+#define SYS_pkey_alloc  381
+#define SYS_pkey_free   382
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x14
+
+#define NR_PKEYS   16
+#define NR_RESERVED_PKEYS  1
+#define PKRU_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x1
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<21)
+
+#define INIT_PRKU 0x0UL
+
+#elif __powerpc64__ /* arch */
+
+#define SYS_mprotect_key 386
+#define SYS_pkey_alloc  384
+#define SYS_pkey_free   385
+#define si_pkey_offset 0x20
+#define REG_IP_IDX PT_NIP
+#define REG_TRAPNO PT_TRAP
+#define REG_AMR45
+#define gregs gp_regs
+#define fpregs fp_regs
+
+#define NR_PKEYS   32
+#define NR_RESERVED_PKEYS  3
+#define PKRU_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x3  /* disable read and write */
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<24)
+
+#define INIT_PRKU 0x3UL
+#else /* arch */
+
+   NOT SUPPORTED
+
+#endif /* arch */
+
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+
+
+static inline u32 pkey_to_shift(int pkey)
+{
+#ifdef __i386__
+   return pkey * PKRU_BITS_PER_PKEY;
+#elif __powerpc64__
+   return (NR_PKEYS - pkey - 1) * PKRU_BITS_PER_PKEY;
+#endif
+}
+
+
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+static inline void sigsafe_printf(const char *format, ...)
+{
+   va_list ap;
+
+   va_start(ap, format);
+   if (!dprint_in_signal) {
+   vprintf(format, ap);
+   } else {
+   int len = vsnprintf(dprint_in_signal_buffer,
+   DPRINT_IN_SIGNAL_BUF_SIZE,
+   format, ap);
+   /*
+* len is amount that would have been printed,
+* but actual write is truncated at BUF_SIZE.
+*/
+   if (len > DPRINT_IN_SIGNAL_BUF_SIZE)
+   len = DPRINT_IN_SIGNAL_BUF_SIZE;
+   write(1, dprint_in_signal_buffer, len);
+   }
+   va_end(ap);
+}
+#define dprintf_level(level, args...) do { \
+   if (level <= DEBUG_LEVEL)   \
+   sigsafe_printf(args);   \
+   fflush(NULL);   \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern u64 shadow_pkey_reg;
+
+static inline u64 __rdpkey_reg(void)
+{
+#ifdef __i386__
+   unsigned int eax, edx;
+   unsigned int ecx = 0;
+   unsigned int pkey_reg;
+
+   asm volatile(".byte 0x0f,0x01,0xee\n\t"
+: "=a" (eax), "=d" (edx)
+: "c" (ecx));
+#elif __powerpc64__
+   u64 eax;
+   u64 pkey_reg;
+
+   asm volatile("mfspr %0, 0xd" : "=r" ((u64)(eax)));
+#endif
+   pkey_reg = (u64)eax;
+   return pkey_reg;
+}
+
+static inline u64 _rdpkey_reg(int line)
+{
+   u64 pkey_reg = __rdpkey_reg();
+

[RFC v2 11/12]Documentation: Documentation updates.

2017-06-16 Thread Ram Pai
The Documentation file is moved from x86 into the generic area,
since this feature is now supported by more than one architecture.

Signed-off-by: Ram Pai 
---
 Documentation/vm/protection-keys.txt  | 110 ++
 Documentation/x86/protection-keys.txt |  85 --
 2 files changed, 110 insertions(+), 85 deletions(-)
 create mode 100644 Documentation/vm/protection-keys.txt
 delete mode 100644 Documentation/x86/protection-keys.txt

diff --git a/Documentation/vm/protection-keys.txt 
b/Documentation/vm/protection-keys.txt
new file mode 100644
index 000..b49e6bb
--- /dev/null
+++ b/Documentation/vm/protection-keys.txt
@@ -0,0 +1,110 @@
+Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
+found in newer generations of Intel CPUs and on PowerPC CPUs.
+
+Memory Protection Keys provides a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables
+when an application changes protection domains.
+
+
+On Intel:
+
+It works by dedicating 4 previously ignored bits in each page table
+entry to a "protection key", giving 16 possible keys.
+
+There is also a new user-accessible register (PKRU) with two separate
+bits (Access Disable and Write Disable) for each key.  Being a CPU
+register, PKRU is inherently thread-local, potentially giving each
+thread a different set of protections from every other thread.
+
+There are two new instructions (RDPKRU/WRPKRU) for reading and writing
+to the new register.  The feature is only available in 64-bit mode,
+even though there is theoretically space in the PAE PTEs.  These
+permissions are enforced on data access only and have no effect on
+instruction fetches.
+
+
+On PowerPC:
+
+It works by dedicating 5 bits in each page table entry to a
+"protection key", giving 32 possible keys.
+
+There is a user-accessible register (AMR) with two separate bits
+(Access Disable and Write Disable) for each key.  Being a CPU
+register, AMR is inherently thread-local, potentially giving each
+thread a different set of protections from every other thread.
+NOTE: Disabling read permission does not disable
+write and vice-versa.
+
+The feature is available on 64-bit HPTE mode only.
+
+'mtspr 0xd, mem' writes into the AMR register.
+'mfspr mem, 0xd' reads the AMR register.
+
+Permissions are enforced on data access only and have no effect on
+instruction fetches.
+
+=== Syscalls ===
+
+There are 3 system calls which directly interact with pkeys:
+
+   int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
+   int pkey_free(int pkey);
+   int pkey_mprotect(unsigned long start, size_t len,
+ unsigned long prot, int pkey);
+
+Before a pkey can be used, it must first be allocated with
+pkey_alloc().  An application calls the WRPKRU instruction
+directly in order to change access permissions to memory covered
+with a key.  In this example WRPKRU is wrapped by a C function
+called pkey_set().
+
+   int real_prot = PROT_READ|PROT_WRITE;
+   pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+   ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 
0);
+   ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
+   ... application runs here
+
+Now, if the application needs to update the data at 'ptr', it can
+gain access, do the update, then remove its write access:
+
+   pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+   *ptr = foo; // assign something
+   pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+
+Now when it frees the memory, it will also free the pkey since it
+is no longer in use:
+
+   munmap(ptr, PAGE_SIZE);
+   pkey_free(pkey);
+
+(Note: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
+ An example implementation can be found in
+ tools/testing/selftests/x86/protection_keys.c)
+
+=== Behavior ===
+
+The kernel attempts to make protection keys consistent with the
+behavior of a plain mprotect().  For instance if you do this:
+
+   mprotect(ptr, size, PROT_NONE);
+   something(ptr);
+
+you can expect the same effects with protection keys when doing this:
+
+   pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | PKEY_DISABLE_READ);
+   pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey);
+   something(ptr);
+
+That should be true whether something() is a direct access to 'ptr'
+like:
+
+   *ptr = foo;
+
+or when the kernel does the access on the application's behalf like
+with a read():
+
+   read(fd, ptr, 1);
+
+The kernel will send a SIGSEGV in both cases, but si_code will be set
+to SEGV_PKUERR when violating protection keys versus SEGV_ACCERR when
+the plain mprotect() permissions are violated.
diff --git a/Documentation/x86/protection-keys.txt 
b/Documentation/x86/protection-keys.txt
deleted file mode 100644
index b643045..000
--- 

[RFC v2 08/12] powerpc: Handle exceptions caused by violation of pkey protection.

2017-06-16 Thread Ram Pai
Handle Data and Instruction exceptions caused by memory
protection-key.

Signed-off-by: Ram Pai 
(cherry picked from commit a5e5217619a0c475fe0cacc3b0cf1d3d33c79a09)

Conflicts:
arch/powerpc/include/asm/reg.h
arch/powerpc/kernel/exceptions-64s.S
---
 arch/powerpc/include/asm/mmu_context.h | 12 +
 arch/powerpc/include/asm/pkeys.h   |  9 
 arch/powerpc/include/asm/reg.h |  7 +--
 arch/powerpc/mm/fault.c| 21 +++-
 arch/powerpc/mm/pkeys.c| 90 ++
 5 files changed, 134 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index da7e943..71fffe0 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -175,11 +175,23 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 {
 }
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+bool arch_pte_access_permitted(pte_t pte, bool write);
+bool arch_vma_access_permitted(struct vm_area_struct *vma,
+   bool write, bool execute, bool foreign);
+#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+   /* by default, allow everything */
+   return true;
+}
 static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
 {
/* by default, allow everything */
return true;
 }
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 9b6820d..405e7db 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -14,6 +14,15 @@
VM_PKEY_BIT3 | \
VM_PKEY_BIT4)
 
+static inline u16 pte_flags_to_pkey(unsigned long pte_flags)
+{
+   return ((pte_flags & H_PAGE_PKEY_BIT4) ? 0x1 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT3) ? 0x2 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT2) ? 0x4 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT1) ? 0x8 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT0) ? 0x10 : 0x0);
+}
+
 #define pkey_to_vmflag_bits(key) (((key & 0x1UL) ? VM_PKEY_BIT0 : 0x0UL) | \
((key & 0x2UL) ? VM_PKEY_BIT1 : 0x0UL) |\
((key & 0x4UL) ? VM_PKEY_BIT2 : 0x0UL) |\
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 2dcb8a1..a11977f 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -285,9 +285,10 @@
 #define   DSISR_UNSUPP_MMU 0x0008  /* Unsupported MMU config */
 #define   DSISR_SET_RC 0x0004  /* Failed setting of R/C bits */
 #define   DSISR_PGDIRFAULT  0x0002  /* Fault on page directory */
-#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
-   DSISR_PAGEATTR_CONFLT | \
-   DSISR_BADACCESS |   \
+#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
+   DSISR_PAGEATTR_CONFLT | \
+   DSISR_BADACCESS |   \
+   DSISR_KEYFAULT |\
DSISR_BIT43)
 #define SPRN_TBRL  0x10C   /* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU  0x10D   /* Time Base Read Upper Register (user, R/O) */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 3a7d580..c31624f 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -216,9 +216,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
 * bits we are interested in.  But there are some bits which
 * indicate errors in DSISR but can validly be set in SRR1.
 */
-   if (trap == 0x400)
+   if (trap == 0x400) {
error_code &= 0x4820;
-   else
+   flags |= FAULT_FLAG_INSTRUCTION;
+   } else
is_write = error_code & DSISR_ISSTORE;
 #else
is_write = error_code & ESR_DST;
@@ -261,6 +262,13 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
}
 #endif
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (error_code & DSISR_KEYFAULT) {
+   code = SEGV_PKUERR;
+   goto bad_area_nosemaphore;
+   }
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
/* We restore the interrupt state now */
if (!arch_irq_disabled_regs(regs))
local_irq_enable();
@@ -441,6 +449,15 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
 #endif /* CONFIG_PPC_STD_MMU */
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (!arch_vma_access_permitted(vma, flags & 

[RFC v2 11/12]Documentation: Documentation updates.

2017-06-16 Thread Ram Pai
The Documentation file is moved from x86 into the generic area,
since this feature is now supported by more than one architecture.

Signed-off-by: Ram Pai 
---
 Documentation/vm/protection-keys.txt  | 110 ++
 Documentation/x86/protection-keys.txt |  85 --
 2 files changed, 110 insertions(+), 85 deletions(-)
 create mode 100644 Documentation/vm/protection-keys.txt
 delete mode 100644 Documentation/x86/protection-keys.txt

diff --git a/Documentation/vm/protection-keys.txt 
b/Documentation/vm/protection-keys.txt
new file mode 100644
index 000..b49e6bb
--- /dev/null
+++ b/Documentation/vm/protection-keys.txt
@@ -0,0 +1,110 @@
+Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
+found in newer generations of Intel CPUs and on PowerPC CPUs.
+
+Memory Protection Keys provides a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables
+when an application changes protection domains.
+
+
+On Intel:
+
+It works by dedicating 4 previously ignored bits in each page table
+entry to a "protection key", giving 16 possible keys.
+
+There is also a new user-accessible register (PKRU) with two separate
+bits (Access Disable and Write Disable) for each key.  Being a CPU
+register, PKRU is inherently thread-local, potentially giving each
+thread a different set of protections from every other thread.
+
+There are two new instructions (RDPKRU/WRPKRU) for reading and writing
+to the new register.  The feature is only available in 64-bit mode,
+even though there is theoretically space in the PAE PTEs.  These
+permissions are enforced on data access only and have no effect on
+instruction fetches.
+
+
+On PowerPC:
+
+It works by dedicating 5 bits in each page table entry to a
+"protection key", giving 32 possible keys.
+
+There is a user-accessible register (AMR) with two separate bits
+(Access Disable and Write Disable) for each key.  Being a CPU
+register, AMR is inherently thread-local, potentially giving each
+thread a different set of protections from every other thread.
+NOTE: Disabling read permission does not disable
+write and vice-versa.
+
+The feature is available on 64-bit HPTE mode only.
+
+'mtspr 0xd, mem' writes into the AMR register.
+'mfspr mem, 0xd' reads the AMR register.
+
+Permissions are enforced on data access only and have no effect on
+instruction fetches.
+
+=== Syscalls ===
+
+There are 3 system calls which directly interact with pkeys:
+
+   int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
+   int pkey_free(int pkey);
+   int pkey_mprotect(unsigned long start, size_t len,
+ unsigned long prot, int pkey);
+
+Before a pkey can be used, it must first be allocated with
+pkey_alloc().  An application calls the WRPKRU instruction
+directly in order to change access permissions to memory covered
+with a key.  In this example WRPKRU is wrapped by a C function
+called pkey_set().
+
+   int real_prot = PROT_READ|PROT_WRITE;
+   pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+   ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 
0);
+   ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
+   ... application runs here
+
+Now, if the application needs to update the data at 'ptr', it can
+gain access, do the update, then remove its write access:
+
+   pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+   *ptr = foo; // assign something
+   pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+
+Now when it frees the memory, it will also free the pkey since it
+is no longer in use:
+
+   munmap(ptr, PAGE_SIZE);
+   pkey_free(pkey);
+
+(Note: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
+ An example implementation can be found in
+ tools/testing/selftests/x86/protection_keys.c)
+
+=== Behavior ===
+
+The kernel attempts to make protection keys consistent with the
+behavior of a plain mprotect().  For instance if you do this:
+
+   mprotect(ptr, size, PROT_NONE);
+   something(ptr);
+
+you can expect the same effects with protection keys when doing this:
+
+   pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | PKEY_DISABLE_READ);
+   pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey);
+   something(ptr);
+
+That should be true whether something() is a direct access to 'ptr'
+like:
+
+   *ptr = foo;
+
+or when the kernel does the access on the application's behalf like
+with a read():
+
+   read(fd, ptr, 1);
+
+The kernel will send a SIGSEGV in both cases, but si_code will be set
+to SEGV_PKUERR when violating protection keys versus SEGV_ACCERR when
+the plain mprotect() permissions are violated.
diff --git a/Documentation/x86/protection-keys.txt 
b/Documentation/x86/protection-keys.txt
deleted file mode 100644
index b643045..000
--- a/Documentation/x86/protection-keys.txt
+++ 

[RFC v2 08/12] powerpc: Handle exceptions caused by violation of pkey protection.

2017-06-16 Thread Ram Pai
Handle Data and Instruction exceptions caused by memory
protection-key.

Signed-off-by: Ram Pai 
(cherry picked from commit a5e5217619a0c475fe0cacc3b0cf1d3d33c79a09)

Conflicts:
arch/powerpc/include/asm/reg.h
arch/powerpc/kernel/exceptions-64s.S
---
 arch/powerpc/include/asm/mmu_context.h | 12 +
 arch/powerpc/include/asm/pkeys.h   |  9 
 arch/powerpc/include/asm/reg.h |  7 +--
 arch/powerpc/mm/fault.c| 21 +++-
 arch/powerpc/mm/pkeys.c| 90 ++
 5 files changed, 134 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index da7e943..71fffe0 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -175,11 +175,23 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 {
 }
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+bool arch_pte_access_permitted(pte_t pte, bool write);
+bool arch_vma_access_permitted(struct vm_area_struct *vma,
+   bool write, bool execute, bool foreign);
+#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+   /* by default, allow everything */
+   return true;
+}
 static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
 {
/* by default, allow everything */
return true;
 }
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 9b6820d..405e7db 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -14,6 +14,15 @@
VM_PKEY_BIT3 | \
VM_PKEY_BIT4)
 
+static inline u16 pte_flags_to_pkey(unsigned long pte_flags)
+{
+   return ((pte_flags & H_PAGE_PKEY_BIT4) ? 0x1 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT3) ? 0x2 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT2) ? 0x4 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT1) ? 0x8 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT0) ? 0x10 : 0x0);
+}
+
 #define pkey_to_vmflag_bits(key) (((key & 0x1UL) ? VM_PKEY_BIT0 : 0x0UL) | \
((key & 0x2UL) ? VM_PKEY_BIT1 : 0x0UL) |\
((key & 0x4UL) ? VM_PKEY_BIT2 : 0x0UL) |\
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 2dcb8a1..a11977f 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -285,9 +285,10 @@
 #define   DSISR_UNSUPP_MMU 0x0008  /* Unsupported MMU config */
 #define   DSISR_SET_RC 0x0004  /* Failed setting of R/C bits */
 #define   DSISR_PGDIRFAULT  0x0002  /* Fault on page directory */
-#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
-   DSISR_PAGEATTR_CONFLT | \
-   DSISR_BADACCESS |   \
+#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
+   DSISR_PAGEATTR_CONFLT | \
+   DSISR_BADACCESS |   \
+   DSISR_KEYFAULT |\
DSISR_BIT43)
 #define SPRN_TBRL  0x10C   /* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU  0x10D   /* Time Base Read Upper Register (user, R/O) */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 3a7d580..c31624f 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -216,9 +216,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
 * bits we are interested in.  But there are some bits which
 * indicate errors in DSISR but can validly be set in SRR1.
 */
-   if (trap == 0x400)
+   if (trap == 0x400) {
error_code &= 0x4820;
-   else
+   flags |= FAULT_FLAG_INSTRUCTION;
+   } else
is_write = error_code & DSISR_ISSTORE;
 #else
is_write = error_code & ESR_DST;
@@ -261,6 +262,13 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
}
 #endif
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (error_code & DSISR_KEYFAULT) {
+   code = SEGV_PKUERR;
+   goto bad_area_nosemaphore;
+   }
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
/* We restore the interrupt state now */
if (!arch_irq_disabled_regs(regs))
local_irq_enable();
@@ -441,6 +449,15 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
 #endif /* CONFIG_PPC_STD_MMU */
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+  

[RFC v2 05/12] powerpc: Implementation for sys_mprotect_pkey() system call.

2017-06-16 Thread Ram Pai
This system call, associates the pkey with PTE of all
pages corresponding to the given address range.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 22 ++-
 arch/powerpc/include/asm/mman.h  | 29 +
 arch/powerpc/include/asm/pkeys.h | 21 ++-
 arch/powerpc/include/asm/systbl.h|  1 +
 arch/powerpc/include/asm/unistd.h|  4 +-
 arch/powerpc/include/uapi/asm/unistd.h   |  1 +
 arch/powerpc/mm/pkeys.c  | 93 +++-
 include/linux/mm.h   |  1 +
 8 files changed, 154 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 87e9a89..bc845cd 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -37,6 +37,7 @@
 #define _RPAGE_RSV20x0800UL
 #define _RPAGE_RSV30x0400UL
 #define _RPAGE_RSV40x0200UL
+#define _RPAGE_RSV50x00040UL
 
 #define _PAGE_PTE  0x4000UL/* distinguishes PTEs 
from pointers */
 #define _PAGE_PRESENT  0x8000UL/* pte contains a 
translation */
@@ -56,6 +57,20 @@
 /* Max physical address bit as per radix table */
 #define _RPAGE_PA_MAX  57
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+#define H_PAGE_PKEY_BIT0   _RPAGE_RSV1
+#define H_PAGE_PKEY_BIT1   _RPAGE_RSV2
+#define H_PAGE_PKEY_BIT2   _RPAGE_RSV3
+#define H_PAGE_PKEY_BIT3   _RPAGE_RSV4
+#define H_PAGE_PKEY_BIT4   _RPAGE_RSV5
+#else /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+#define H_PAGE_PKEY_BIT0   0
+#define H_PAGE_PKEY_BIT1   0
+#define H_PAGE_PKEY_BIT2   0
+#define H_PAGE_PKEY_BIT3   0
+#define H_PAGE_PKEY_BIT4   0
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
 /*
  * Max physical address bit we will use for now.
  *
@@ -122,7 +137,12 @@
 #define PAGE_PROT_BITS  (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \
 H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \
 _PAGE_READ | _PAGE_WRITE |  _PAGE_DIRTY | _PAGE_EXEC | 
\
-_PAGE_SOFT_DIRTY)
+_PAGE_SOFT_DIRTY | \
+H_PAGE_PKEY_BIT0 | \
+H_PAGE_PKEY_BIT1 | \
+H_PAGE_PKEY_BIT2 | \
+H_PAGE_PKEY_BIT3 | \
+H_PAGE_PKEY_BIT4)
 /*
  * We define 2 sets of base prot bits, one for basic pages (ie,
  * cacheable kernel and user pages) and one for non cacheable
diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
index 30922f6..14cc1aa 100644
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -13,24 +13,31 @@
 
 #include 
 #include 
+#include 
 #include 
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+
 /*
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
-   unsigned long pkey)
-{
-   return (prot & PROT_SAO) ? VM_SAO : 0;
-}
-#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+#define arch_calc_vm_prot_bits(prot, key) ( \
+   ((prot) & PROT_SAO ? VM_SAO : 0) |  \
+   pkey_to_vmflag_bits(key))
+#define arch_vm_get_page_prot(vm_flags) __pgprot(   \
+   ((vm_flags) & VM_SAO ? _PAGE_SAO : 0) | \
+   vmflag_to_page_pkey_bits(vm_flags))
+
+#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+#define arch_calc_vm_prot_bits(prot, key) (\
+   ((prot) & PROT_SAO ? VM_SAO : 0))
+#define arch_vm_get_page_prot(vm_flags) __pgprot(  \
+   ((vm_flags) & VM_SAO ? _PAGE_SAO : 0))
+
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
 
-static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
-{
-   return (vm_flags & VM_SAO) ? __pgprot(_PAGE_SAO) : __pgprot(0);
-}
-#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
 
 static inline bool arch_validate_prot(unsigned long prot)
 {
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 7bc8746..0f3dca8 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -14,6 +14,19 @@
VM_PKEY_BIT3 | \
VM_PKEY_BIT4)
 
+#define pkey_to_vmflag_bits(key) (((key & 0x1UL) ? VM_PKEY_BIT0 : 0x0UL) | \
+   ((key & 0x2UL) ? VM_PKEY_BIT1 : 0x0UL) |\
+   ((key & 0x4UL) ? VM_PKEY_BIT2 : 0x0UL) |\
+   ((key & 0x8UL) ? VM_PKEY_BIT3 : 0x0UL) |\
+   ((key & 0x10UL) ? VM_PKEY_BIT4 : 

[RFC v2 10/12] powerpc: Read AMR only if pkey-violation caused the exception.

2017-06-16 Thread Ram Pai
Signed-off-by: Ram Pai 
---
 arch/powerpc/kernel/exceptions-64s.S | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 8db9ef8..a4de1b4 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -493,13 +493,15 @@ EXC_COMMON_BEGIN(data_access_common)
ld  r12,_MSR(r1)
ld  r3,PACA_EXGEN+EX_DAR(r13)
lwz r4,PACA_EXGEN+EX_DSISR(r13)
+   std r3,_DAR(r1)
+   std r4,_DSISR(r1)
 #ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   andis.  r0,r4,DSISR_KEYFAULT@h /* save AMR only if its a key fault */
+   beq+1f
mfspr   r5,SPRN_AMR
std r5,PACA_AMR(r13)
 #endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
-   li  r5,0x300
-   std r3,_DAR(r1)
-   std r4,_DSISR(r1)
+1: li  r5,0x300
 BEGIN_MMU_FTR_SECTION
b   do_hash_page/* Try to handle as hpte fault */
 MMU_FTR_SECTION_ELSE
@@ -565,13 +567,15 @@ EXC_COMMON_BEGIN(instruction_access_common)
ld  r12,_MSR(r1)
ld  r3,_NIP(r1)
andis.  r4,r12,0x5820
+   std r3,_DAR(r1)
+   std r4,_DSISR(r1)
 #ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   andis.  r0,r4,DSISR_KEYFAULT@h /* save AMR only if its a key fault */
+   beq+1f
mfspr   r5,SPRN_AMR
std r5,PACA_AMR(r13)
 #endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
-   li  r5,0x400
-   std r3,_DAR(r1)
-   std r4,_DSISR(r1)
+1: li  r5,0x400
 BEGIN_MMU_FTR_SECTION
b   do_hash_page/* Try to handle as hpte fault */
 MMU_FTR_SECTION_ELSE
-- 
1.8.3.1



[RFC v2 01/12] powerpc: Free up four 64K PTE bits in 4K backed hpte pages.

2017-06-16 Thread Ram Pai
Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
in the 4K backed hpte pages. These bits continue to be used
for 64K backed hpte pages in this patch, but will be freed
up in the next patch.

The patch does the following change to the 64K PTE format

H_PAGE_BUSY moves from bit 3 to bit 9
H_PAGE_F_SECOND which occupied bit 4 moves to the second part
of the pte.
H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
second part of the pte.

The four bits (H_PAGE_F_SECOND|H_PAGE_F_GIX) that represent a slot
are initialized to 0xF, indicating an invalid slot. If a hpte
gets cached in a 0xF slot (i.e. the 7th slot of the secondary), it is
released immediately. In other words, even though 0xF is a
valid slot we discard it and consider it as an invalid
slot; i.e. hpte_soft_invalid(). This gives us an opportunity to not
depend on a bit in the primary PTE in order to determine the
validity of a slot.

When we release a hpte in the 0xF slot we also release a
legitimate primary slot and unmap that entry. This is to
ensure that we do get a legitimate non-0xF slot the next time we
retry for a slot.

Though treating the 0xF slot as invalid reduces the number of available
slots and may have an effect on the performance, the probability
of hitting a 0xF slot is extremely low.

Compared  to the current scheme, the above described scheme reduces
the number of false hash table updates  significantly  and  has the
added  advantage  of  releasing  four  valuable  PTE bits for other
purpose.

This idea was jointly developed by Paul Mackerras, Aneesh, Michael
Ellermen and myself.

4K PTE format remain unchanged currently.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 20 +++
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 32 +++
 arch/powerpc/include/asm/book3s/64/hash.h | 15 +++--
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  5 ++
 arch/powerpc/mm/dump_linuxpagetables.c|  3 +-
 arch/powerpc/mm/hash64_4k.c   | 14 ++---
 arch/powerpc/mm/hash64_64k.c  | 81 ---
 arch/powerpc/mm/hash_utils_64.c   | 30 +++---
 8 files changed, 122 insertions(+), 78 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index b4b5e6b..5ef1d81 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -16,6 +16,18 @@
 #define H_PUD_TABLE_SIZE   (sizeof(pud_t) << H_PUD_INDEX_SIZE)
 #define H_PGD_TABLE_SIZE   (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
 
+
+/*
+ * Only supported by 4k linux page size
+ */
+#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_F_GIX_SHIFT 56
+
+#define H_PAGE_BUSY_RPAGE_RSV1 /* software: PTE & hash are busy */
+#define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
+
+
 /* PTE flags to conserve for HPTE identification */
 #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
 H_PAGE_F_SECOND | H_PAGE_F_GIX)
@@ -48,6 +60,14 @@ static inline int hash__hugepd_ok(hugepd_t hpd)
 }
 #endif
 
+static inline unsigned long set_hidx_slot(pte_t *ptep, real_pte_t rpte,
+   unsigned int subpg_index, unsigned long slot)
+{
+   return (slot << H_PAGE_F_GIX_SHIFT) &
+   (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+}
+
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 static inline char *get_hpte_slot_array(pmd_t *pmdp)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 9732837..0eb3c89 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -10,23 +10,25 @@
  * 64k aligned address free up few of the lower bits of RPN for us
  * We steal that here. For more deatils look at pte_pfn/pfn_pte()
  */
-#define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
-#define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
+#define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
+#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_F_GIX_SHIFT 56
+
+
+#define H_PAGE_BUSY_RPAGE_RPN42 /* software: PTE & hash are busy */
+#define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
+
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
  */
 #define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
 
-/*
- * Used to track subpage group valid if H_PAGE_COMBO is set
- * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND
- */
-#define 

[RFC v2 05/12] powerpc: Implementation for sys_mprotect_pkey() system call.

2017-06-16 Thread Ram Pai
This system call, associates the pkey with PTE of all
pages corresponding to the given address range.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 22 ++-
 arch/powerpc/include/asm/mman.h  | 29 +
 arch/powerpc/include/asm/pkeys.h | 21 ++-
 arch/powerpc/include/asm/systbl.h|  1 +
 arch/powerpc/include/asm/unistd.h|  4 +-
 arch/powerpc/include/uapi/asm/unistd.h   |  1 +
 arch/powerpc/mm/pkeys.c  | 93 +++-
 include/linux/mm.h   |  1 +
 8 files changed, 154 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 87e9a89..bc845cd 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -37,6 +37,7 @@
 #define _RPAGE_RSV20x0800UL
 #define _RPAGE_RSV30x0400UL
 #define _RPAGE_RSV40x0200UL
+#define _RPAGE_RSV50x00040UL
 
 #define _PAGE_PTE  0x4000UL/* distinguishes PTEs 
from pointers */
 #define _PAGE_PRESENT  0x8000UL/* pte contains a 
translation */
@@ -56,6 +57,20 @@
 /* Max physical address bit as per radix table */
 #define _RPAGE_PA_MAX  57
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+#define H_PAGE_PKEY_BIT0   _RPAGE_RSV1
+#define H_PAGE_PKEY_BIT1   _RPAGE_RSV2
+#define H_PAGE_PKEY_BIT2   _RPAGE_RSV3
+#define H_PAGE_PKEY_BIT3   _RPAGE_RSV4
+#define H_PAGE_PKEY_BIT4   _RPAGE_RSV5
+#else /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+#define H_PAGE_PKEY_BIT0   0
+#define H_PAGE_PKEY_BIT1   0
+#define H_PAGE_PKEY_BIT2   0
+#define H_PAGE_PKEY_BIT3   0
+#define H_PAGE_PKEY_BIT4   0
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
 /*
  * Max physical address bit we will use for now.
  *
@@ -122,7 +137,12 @@
 #define PAGE_PROT_BITS  (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \
 H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \
 _PAGE_READ | _PAGE_WRITE |  _PAGE_DIRTY | _PAGE_EXEC | 
\
-_PAGE_SOFT_DIRTY)
+_PAGE_SOFT_DIRTY | \
+H_PAGE_PKEY_BIT0 | \
+H_PAGE_PKEY_BIT1 | \
+H_PAGE_PKEY_BIT2 | \
+H_PAGE_PKEY_BIT3 | \
+H_PAGE_PKEY_BIT4)
 /*
  * We define 2 sets of base prot bits, one for basic pages (ie,
  * cacheable kernel and user pages) and one for non cacheable
diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
index 30922f6..14cc1aa 100644
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -13,24 +13,31 @@
 
 #include 
 #include 
+#include 
 #include 
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+
 /*
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
-   unsigned long pkey)
-{
-   return (prot & PROT_SAO) ? VM_SAO : 0;
-}
-#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+#define arch_calc_vm_prot_bits(prot, key) ( \
+   ((prot) & PROT_SAO ? VM_SAO : 0) |  \
+   pkey_to_vmflag_bits(key))
+#define arch_vm_get_page_prot(vm_flags) __pgprot(   \
+   ((vm_flags) & VM_SAO ? _PAGE_SAO : 0) | \
+   vmflag_to_page_pkey_bits(vm_flags))
+
+#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+#define arch_calc_vm_prot_bits(prot, key) (\
+   ((prot) & PROT_SAO ? VM_SAO : 0))
+#define arch_vm_get_page_prot(vm_flags) __pgprot(  \
+   ((vm_flags) & VM_SAO ? _PAGE_SAO : 0))
+
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
 
-static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
-{
-   return (vm_flags & VM_SAO) ? __pgprot(_PAGE_SAO) : __pgprot(0);
-}
-#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
 
 static inline bool arch_validate_prot(unsigned long prot)
 {
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 7bc8746..0f3dca8 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -14,6 +14,19 @@
VM_PKEY_BIT3 | \
VM_PKEY_BIT4)
 
+#define pkey_to_vmflag_bits(key) (((key & 0x1UL) ? VM_PKEY_BIT0 : 0x0UL) | \
+   ((key & 0x2UL) ? VM_PKEY_BIT1 : 0x0UL) |\
+   ((key & 0x4UL) ? VM_PKEY_BIT2 : 0x0UL) |\
+   ((key & 0x8UL) ? VM_PKEY_BIT3 : 0x0UL) |\
+   ((key & 0x10UL) ? VM_PKEY_BIT4 : 0x0UL))
+
+#define 

[RFC v2 10/12] powerpc: Read AMR only if pkey-violation caused the exception.

2017-06-16 Thread Ram Pai
Signed-off-by: Ram Pai 
---
 arch/powerpc/kernel/exceptions-64s.S | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 8db9ef8..a4de1b4 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -493,13 +493,15 @@ EXC_COMMON_BEGIN(data_access_common)
ld  r12,_MSR(r1)
ld  r3,PACA_EXGEN+EX_DAR(r13)
lwz r4,PACA_EXGEN+EX_DSISR(r13)
+   std r3,_DAR(r1)
+   std r4,_DSISR(r1)
 #ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   andis.  r0,r4,DSISR_KEYFAULT@h /* save AMR only if its a key fault */
+   beq+1f
mfspr   r5,SPRN_AMR
std r5,PACA_AMR(r13)
 #endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
-   li  r5,0x300
-   std r3,_DAR(r1)
-   std r4,_DSISR(r1)
+1: li  r5,0x300
 BEGIN_MMU_FTR_SECTION
b   do_hash_page/* Try to handle as hpte fault */
 MMU_FTR_SECTION_ELSE
@@ -565,13 +567,15 @@ EXC_COMMON_BEGIN(instruction_access_common)
ld  r12,_MSR(r1)
ld  r3,_NIP(r1)
andis.  r4,r12,0x5820
+   std r3,_DAR(r1)
+   std r4,_DSISR(r1)
 #ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   andis.  r0,r4,DSISR_KEYFAULT@h /* save AMR only if its a key fault */
+   beq+1f
mfspr   r5,SPRN_AMR
std r5,PACA_AMR(r13)
 #endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
-   li  r5,0x400
-   std r3,_DAR(r1)
-   std r4,_DSISR(r1)
+1: li  r5,0x400
 BEGIN_MMU_FTR_SECTION
b   do_hash_page/* Try to handle as hpte fault */
 MMU_FTR_SECTION_ELSE
-- 
1.8.3.1



[RFC v2 01/12] powerpc: Free up four 64K PTE bits in 4K backed hpte pages.

2017-06-16 Thread Ram Pai
Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
in the 4K backed hpte pages. These bits continue to be used
for 64K backed hpte pages in this patch, but will be freed
up in the next patch.

The patch does the following change to the 64K PTE format

H_PAGE_BUSY moves from bit 3 to bit 9
H_PAGE_F_SECOND which occupied bit 4 moves to the second part
of the pte.
H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
second part of the pte.

The four bits (H_PAGE_F_SECOND|H_PAGE_F_GIX) that represent a slot
are initialized to 0xF, indicating an invalid slot. If a hpte
gets cached in a 0xF slot (i.e. the 7th slot of the secondary), it is
released immediately. In other words, even though 0xF is a
valid slot we discard it and consider it as an invalid
slot; i.e. hpte_soft_invalid(). This gives us an opportunity to not
depend on a bit in the primary PTE in order to determine the
validity of a slot.

When we release a hpte in the 0xF slot we also release a
legitimate primary slot and unmap that entry. This is to
ensure that we do get a legitimate non-0xF slot the next time we
retry for a slot.

Though treating the 0xF slot as invalid reduces the number of available
slots and may have an effect on the performance, the probability
of hitting a 0xF slot is extremely low.

Compared  to the current scheme, the above described scheme reduces
the number of false hash table updates  significantly  and  has the
added  advantage  of  releasing  four  valuable  PTE bits for other
purpose.

This idea was jointly developed by Paul Mackerras, Aneesh, Michael
Ellermen and myself.

4K PTE format remain unchanged currently.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 20 +++
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 32 +++
 arch/powerpc/include/asm/book3s/64/hash.h | 15 +++--
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  5 ++
 arch/powerpc/mm/dump_linuxpagetables.c|  3 +-
 arch/powerpc/mm/hash64_4k.c   | 14 ++---
 arch/powerpc/mm/hash64_64k.c  | 81 ---
 arch/powerpc/mm/hash_utils_64.c   | 30 +++---
 8 files changed, 122 insertions(+), 78 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index b4b5e6b..5ef1d81 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -16,6 +16,18 @@
 #define H_PUD_TABLE_SIZE   (sizeof(pud_t) << H_PUD_INDEX_SIZE)
 #define H_PGD_TABLE_SIZE   (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
 
+
+/*
+ * Only supported by 4k linux page size
+ */
+#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_F_GIX_SHIFT 56
+
+#define H_PAGE_BUSY_RPAGE_RSV1 /* software: PTE & hash are busy */
+#define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
+
+
 /* PTE flags to conserve for HPTE identification */
 #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
 H_PAGE_F_SECOND | H_PAGE_F_GIX)
@@ -48,6 +60,14 @@ static inline int hash__hugepd_ok(hugepd_t hpd)
 }
 #endif
 
+static inline unsigned long set_hidx_slot(pte_t *ptep, real_pte_t rpte,
+   unsigned int subpg_index, unsigned long slot)
+{
+   return (slot << H_PAGE_F_GIX_SHIFT) &
+   (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+}
+
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 static inline char *get_hpte_slot_array(pmd_t *pmdp)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 9732837..0eb3c89 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -10,23 +10,25 @@
  * 64k aligned address free up few of the lower bits of RPN for us
  * We steal that here. For more deatils look at pte_pfn/pfn_pte()
  */
-#define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
-#define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
+#define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
+#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_F_GIX_SHIFT 56
+
+
+#define H_PAGE_BUSY_RPAGE_RPN42 /* software: PTE & hash are busy */
+#define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
+
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
  */
 #define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
 
-/*
- * Used to track subpage group valid if H_PAGE_COMBO is set
- * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND
- */
-#define H_PAGE_COMBO_VALID 

[RFC v2 00/12] powerpc: Memory Protection Keys

2017-06-16 Thread Ram Pai
Memory protection keys enable an application to protect its
address space from inadvertent access or corruption by
itself.

The overall idea:

 A process allocates a key and associates it with
 an address range within its address space.
 The process then can dynamically set read/write
 permissions on the key without involving the
 kernel. Any code that violates the permissions
 of the address space, as defined by its associated
 key, will receive a segmentation fault.

This patch series enables the feature on PPC64.
It is enabled on HPTE 64K-page platform.

ISA3.0 section 5.7.13 describes the detailed specifications.


Testing:
This patch series has passed all the protection key
tests available in  the selftests directory.
The tests are updated to work on both x86 and powerpc.


version v2:
(1) documentation and selftest added
(2) fixed a bug in 4k hpte backed 64k pte where page
invalidation was not done correctly, and 
initialization of second-part-of-the-pte was not
done correctly if the pte was not yet Hashed
with a hpte.  Reported by Aneesh.
(3) Fixed ABI breakage caused in siginfo structure.
Reported by Anshuman.

Outstanding known issue:
  Calls to sys_swapcontext with a made-up context will end 
  up with a crap AMR if done by code who didn't know about
  that register. -- Reported by Ben.

version v1: Initial version

Thanks-to: Dave Hansen, Aneesh, Paul Mackerras,
   Michael Ellermen


Ram Pai (12):
  Free up four 64K PTE bits in 4K backed hpte pages.
  Free up four 64K PTE bits in 64K backed hpte pages.
  Implement sys_pkey_alloc and sys_pkey_free system call.
  store and restore the pkey state across context switches.
  Implementation for sys_mprotect_pkey() system call.
  Program HPTE key protection bits.
  Macro the mask used for checking DSI exception
  Handle exceptions caused by violation of pkey protection.
  Deliver SEGV signal on pkey violation.
  Read AMR only if pkey-violation caused the exception.
  Documentation updates.
  Updated protection key selftest

 Documentation/vm/protection-keys.txt  |  110 ++
 Documentation/x86/protection-keys.txt |   85 --
 arch/powerpc/Kconfig  |   15 +
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |   20 +
 arch/powerpc/include/asm/book3s/64/hash-64k.h |   48 +-
 arch/powerpc/include/asm/book3s/64/hash.h |   15 +-
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |   10 +
 arch/powerpc/include/asm/book3s/64/mmu.h  |   10 +
 arch/powerpc/include/asm/book3s/64/pgtable.h  |   84 +-
 arch/powerpc/include/asm/mman.h   |   29 +-
 arch/powerpc/include/asm/mmu_context.h|   12 +
 arch/powerpc/include/asm/paca.h   |1 +
 arch/powerpc/include/asm/pkeys.h  |  159 +++
 arch/powerpc/include/asm/processor.h  |5 +
 arch/powerpc/include/asm/reg.h|   10 +-
 arch/powerpc/include/asm/systbl.h |3 +
 arch/powerpc/include/asm/unistd.h |6 +-
 arch/powerpc/include/uapi/asm/ptrace.h|3 +-
 arch/powerpc/include/uapi/asm/unistd.h|3 +
 arch/powerpc/kernel/asm-offsets.c |5 +
 arch/powerpc/kernel/exceptions-64s.S  |   18 +-
 arch/powerpc/kernel/process.c |   18 +
 arch/powerpc/kernel/signal_32.c   |   14 +
 arch/powerpc/kernel/signal_64.c   |   14 +
 arch/powerpc/kernel/traps.c   |   49 +
 arch/powerpc/mm/Makefile  |1 +
 arch/powerpc/mm/dump_linuxpagetables.c|3 +-
 arch/powerpc/mm/fault.c   |   25 +-
 arch/powerpc/mm/hash64_4k.c   |   14 +-
 arch/powerpc/mm/hash64_64k.c  |   93 +-
 arch/powerpc/mm/hash_utils_64.c   |   35 +-
 arch/powerpc/mm/hugetlbpage-hash64.c  |   16 +-
 arch/powerpc/mm/mmu_context_book3s64.c|5 +
 arch/powerpc/mm/pkeys.c   |  267 +
 include/linux/mm.h|   32 +-
 include/uapi/asm-generic/mman-common.h|2 +-
 tools/testing/selftests/vm/Makefile   |1 +
 tools/testing/selftests/vm/pkey-helpers.h |  365 +++
 tools/testing/selftests/vm/protection_keys.c  | 1451 +
 tools/testing/selftests/x86/Makefile  |2 +-
 tools/testing/selftests/x86/pkey-helpers.h|  219 
 tools/testing/selftests/x86/protection_keys.c | 1395 
 42 files changed, 2828 insertions(+), 1844 deletions(-)
 create mode 100644 Documentation/vm/protection-keys.txt
 delete mode 100644 Documentation/x86/protection-keys.txt
 create mode 100644 arch/powerpc/include/asm/pkeys.h
 create mode 100644 arch/powerpc/mm/pkeys.c
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 

[RFC v2 00/12] powerpc: Memory Protection Keys

2017-06-16 Thread Ram Pai
Memory protection keys enable an application to protect its
address space from inadvertent access or corruption by
itself.

The overall idea:

 A process allocates a key and associates it with
 an address range within its address space.
 The process then can dynamically set read/write
 permissions on the key without involving the
 kernel. Any code that violates the permissions
 of the address space, as defined by its associated
 key, will receive a segmentation fault.

This patch series enables the feature on PPC64.
It is enabled on HPTE 64K-page platform.

ISA3.0 section 5.7.13 describes the detailed specifications.


Testing:
This patch series has passed all the protection key
tests available in  the selftests directory.
The tests are updated to work on both x86 and powerpc.


version v2:
(1) documentation and selftest added
(2) fixed a bug in 4k hpte backed 64k pte where page
invalidation was not done correctly, and 
initialization of second-part-of-the-pte was not
done correctly if the pte was not yet Hashed
with a hpte.  Reported by Aneesh.
(3) Fixed ABI breakage caused in siginfo structure.
Reported by Anshuman.

Outstanding known issue:
  Calls to sys_swapcontext with a made-up context will end 
  up with a crap AMR if done by code who didn't know about
  that register. -- Reported by Ben.

version v1: Initial version

Thanks-to: Dave Hansen, Aneesh, Paul Mackerras,
   Michael Ellermen


Ram Pai (12):
  Free up four 64K PTE bits in 4K backed hpte pages.
  Free up four 64K PTE bits in 64K backed hpte pages.
  Implement sys_pkey_alloc and sys_pkey_free system call.
  store and restore the pkey state across context switches.
  Implementation for sys_mprotect_pkey() system call.
  Program HPTE key protection bits.
  Macro the mask used for checking DSI exception
  Handle exceptions caused by violation of pkey protection.
  Deliver SEGV signal on pkey violation.
  Read AMR only if pkey-violation caused the exception.
  Documentation updates.
  Updated protection key selftest

 Documentation/vm/protection-keys.txt  |  110 ++
 Documentation/x86/protection-keys.txt |   85 --
 arch/powerpc/Kconfig  |   15 +
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |   20 +
 arch/powerpc/include/asm/book3s/64/hash-64k.h |   48 +-
 arch/powerpc/include/asm/book3s/64/hash.h |   15 +-
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |   10 +
 arch/powerpc/include/asm/book3s/64/mmu.h  |   10 +
 arch/powerpc/include/asm/book3s/64/pgtable.h  |   84 +-
 arch/powerpc/include/asm/mman.h   |   29 +-
 arch/powerpc/include/asm/mmu_context.h|   12 +
 arch/powerpc/include/asm/paca.h   |1 +
 arch/powerpc/include/asm/pkeys.h  |  159 +++
 arch/powerpc/include/asm/processor.h  |5 +
 arch/powerpc/include/asm/reg.h|   10 +-
 arch/powerpc/include/asm/systbl.h |3 +
 arch/powerpc/include/asm/unistd.h |6 +-
 arch/powerpc/include/uapi/asm/ptrace.h|3 +-
 arch/powerpc/include/uapi/asm/unistd.h|3 +
 arch/powerpc/kernel/asm-offsets.c |5 +
 arch/powerpc/kernel/exceptions-64s.S  |   18 +-
 arch/powerpc/kernel/process.c |   18 +
 arch/powerpc/kernel/signal_32.c   |   14 +
 arch/powerpc/kernel/signal_64.c   |   14 +
 arch/powerpc/kernel/traps.c   |   49 +
 arch/powerpc/mm/Makefile  |1 +
 arch/powerpc/mm/dump_linuxpagetables.c|3 +-
 arch/powerpc/mm/fault.c   |   25 +-
 arch/powerpc/mm/hash64_4k.c   |   14 +-
 arch/powerpc/mm/hash64_64k.c  |   93 +-
 arch/powerpc/mm/hash_utils_64.c   |   35 +-
 arch/powerpc/mm/hugetlbpage-hash64.c  |   16 +-
 arch/powerpc/mm/mmu_context_book3s64.c|5 +
 arch/powerpc/mm/pkeys.c   |  267 +
 include/linux/mm.h|   32 +-
 include/uapi/asm-generic/mman-common.h|2 +-
 tools/testing/selftests/vm/Makefile   |1 +
 tools/testing/selftests/vm/pkey-helpers.h |  365 +++
 tools/testing/selftests/vm/protection_keys.c  | 1451 +
 tools/testing/selftests/x86/Makefile  |2 +-
 tools/testing/selftests/x86/pkey-helpers.h|  219 
 tools/testing/selftests/x86/protection_keys.c | 1395 
 42 files changed, 2828 insertions(+), 1844 deletions(-)
 create mode 100644 Documentation/vm/protection-keys.txt
 delete mode 100644 Documentation/x86/protection-keys.txt
 create mode 100644 arch/powerpc/include/asm/pkeys.h
 create mode 100644 arch/powerpc/mm/pkeys.c
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 

[PATCH v4 0/5] perf config: Bugfixes & Refactoring

2017-06-16 Thread Taeung Song
Hi all,

This is simple patchset for perf-config
to fix small bugs and refactor code.

I'd appreciate some feedback on this patchset.

The code is also available at 'config/refactoring-v4' branch on

  git://github.com/taeung/linux-perf.git


Thanks,
Taeung

v4:
- rebase on current acme/perf/core
- simplify commit log messages
- remove needless two patches

v3:
- fix a bug of no checked 'ret' in the loop in cmd_config() (Arnaldo)
- modify commit log messages to be more clear (Aranaldo)
- return -1 if show_spec_config() cannot show the config
- initialize 'ret' with -1 instead of 0 for more compact code in cmd_config()
- Add an error message when perf_config_set__new() fails in cmd_config()

v2:
- there is no need to consider empty config file (Arnaldo)

Taeung Song (5):
  perf config: Check error cases of {show_spec, set}_config()
  perf config: Refactor the code using 'ret' variable in cmd_config()
  perf config: Finally write changed configs on config file at a time
  perf config: Check not only section->from_system_config but also
item's
  perf config: Autogenerate a config file if it does not exist on
setting feature.

 tools/perf/builtin-config.c | 57 ++---
 tools/perf/util/config.c|  5 +---
 2 files changed, 39 insertions(+), 23 deletions(-)

-- 
2.7.4



[PATCH v4 0/5] perf config: Bugfixes & Refactoring

2017-06-16 Thread Taeung Song
Hi all,

This is simple patchset for perf-config
to fix small bugs and refactor code.

I'd appreciate some feedback on this patchset.

The code is also available at 'config/refactoring-v4' branch on

  git://github.com/taeung/linux-perf.git


Thanks,
Taeung

v4:
- rebase on current acme/perf/core
- simplify commit log messages
- remove needless two patches

v3:
- fix a bug of no checked 'ret' in the loop in cmd_config() (Arnaldo)
- modify commit log messages to be more clear (Aranaldo)
- return -1 if show_spec_config() cannot show the config
- initialize 'ret' with -1 instead of 0 for more compact code in cmd_config()
- Add an error message when perf_config_set__new() fails in cmd_config()

v2:
- there is no need to consider empty config file (Arnaldo)

Taeung Song (5):
  perf config: Check error cases of {show_spec, set}_config()
  perf config: Refactor the code using 'ret' variable in cmd_config()
  perf config: Finally write changed configs on config file at a time
  perf config: Check not only section->from_system_config but also
item's
  perf config: Autogenerate a config file if it does not exist on
setting feature.

 tools/perf/builtin-config.c | 57 ++---
 tools/perf/util/config.c|  5 +---
 2 files changed, 39 insertions(+), 23 deletions(-)

-- 
2.7.4



[PATCH v4 4/5] perf config: Check not only section->from_system_config but also item's

2017-06-16 Thread Taeung Song
Currently only section->from_system_config is being checked multiple times.
item->from_system_config should also be checked, so fix it.

Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/builtin-config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index a29d96e..cf8e183 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -56,7 +56,7 @@ static int set_config(struct perf_config_set *set, const char 
*file_name)
fprintf(fp, "[%s]\n", section->name);
 
perf_config_items__for_each_entry(>items, item) {
-   if (!use_system_config && section->from_system_config)
+   if (!use_system_config && item->from_system_config)
continue;
if (item->value)
fprintf(fp, "\t%s = %s\n",
-- 
2.7.4



[PATCH v4 3/5] perf config: Finally write changed configs on config file at a time

2017-06-16 Thread Taeung Song
Currently set_config() can be repeatedly called for each
input config on the below case:

  $ perf config kmem.default=slab report.children=false ...

But it's a waste, so finally write changed configs at a time.

Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/builtin-config.c | 22 --
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index ece4558..a29d96e 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -33,8 +33,7 @@ static struct option config_options[] = {
OPT_END()
 };
 
-static int set_config(struct perf_config_set *set, const char *file_name,
- const char *var, const char *value)
+static int set_config(struct perf_config_set *set, const char *file_name)
 {
struct perf_config_section *section = NULL;
struct perf_config_item *item = NULL;
@@ -48,7 +47,6 @@ static int set_config(struct perf_config_set *set, const char 
*file_name,
if (!fp)
return -1;
 
-   perf_config_set__collect(set, file_name, var, value);
fprintf(fp, "%s\n", first_line);
 
/* overwrite configvariables */
@@ -160,6 +158,7 @@ int cmd_config(int argc, const char **argv)
struct perf_config_set *set;
char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
const char *config_filename;
+   bool changed = false;
 
argc = parse_options(argc, argv, config_options, config_usage,
 PARSE_OPT_STOP_AT_NON_OPTION);
@@ -230,15 +229,26 @@ int cmd_config(int argc, const char **argv)
goto out_err;
}
} else {
-   if (set_config(set, config_filename, var, 
value) < 0) {
-   pr_err("Failed to set '%s=%s' on %s\n",
-  var, value, config_filename);
+   if (perf_config_set__collect(set, 
config_filename,
+var, value) < 0) {
+   pr_err("Failed to add '%s=%s'\n",
+  var, value);
free(arg);
goto out_err;
}
+   changed = true;
}
free(arg);
}
+
+   if (!changed)
+   break;
+
+   if (set_config(set, config_filename) < 0) {
+   pr_err("Failed to set the configs on %s\n",
+  config_filename);
+   goto out_err;
+   }
}
 
ret = 0;
-- 
2.7.4



[PATCH v4 2/5] perf config: Refactor the code using 'ret' variable in cmd_config()

2017-06-16 Thread Taeung Song
To simplify the code related to 'ret' variable in cmd_config(),
initialize 'ret' with -1 instead of 0.

Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/builtin-config.c | 30 +-
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index bb1be79..ece4558 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -156,7 +156,7 @@ static int parse_config_arg(char *arg, char **var, char 
**value)
 
 int cmd_config(int argc, const char **argv)
 {
-   int i, ret = 0;
+   int i, ret = -1;
struct perf_config_set *set;
char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
const char *config_filename;
@@ -186,10 +186,8 @@ int cmd_config(int argc, const char **argv)
 * because of reinitializing with options config file location.
 */
set = perf_config_set__new();
-   if (!set) {
-   ret = -1;
+   if (!set)
goto out_err;
-   }
 
switch (actions) {
case ACTION_LIST:
@@ -197,10 +195,11 @@ int cmd_config(int argc, const char **argv)
pr_err("Error: takes no arguments\n");
parse_options_usage(config_usage, config_options, "l", 
1);
} else {
-   ret = show_config(set);
-   if (ret < 0)
+   if (show_config(set) < 0) {
pr_err("Nothing configured, "
   "please check your %s \n", 
config_filename);
+   goto out_err;
+   }
}
break;
default:
@@ -215,38 +214,35 @@ int cmd_config(int argc, const char **argv)
 
if (!arg) {
pr_err("%s: strdup failed\n", __func__);
-   ret = -1;
-   break;
+   goto out_err;
}
 
if (parse_config_arg(arg, , ) < 0) {
free(arg);
-   ret = -1;
-   break;
+   goto out_err;
}
 
if (value == NULL) {
-   ret = show_spec_config(set, var);
-   if (ret < 0) {
+   if (show_spec_config(set, var) < 0) {
pr_err("%s is not configured: %s\n",
   var, config_filename);
free(arg);
-   break;
+   goto out_err;
}
} else {
-   ret = set_config(set, config_filename, var, 
value);
-   if (ret < 0) {
+   if (set_config(set, config_filename, var, 
value) < 0) {
pr_err("Failed to set '%s=%s' on %s\n",
   var, value, config_filename);
free(arg);
-   break;
+   goto out_err;
}
}
free(arg);
}
}
 
-   perf_config_set__delete(set);
+   ret = 0;
 out_err:
+   perf_config_set__delete(set);
return ret;
 }
-- 
2.7.4



[PATCH v4 1/5] perf config: Check error cases of {show_spec, set}_config()

2017-06-16 Thread Taeung Song
show_spec_config() and set_config() can be called multiple times
in the loop in cmd_config().
However, their error cases weren't checked, so fix it.

Reported-by: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/builtin-config.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index 7545966..bb1be79 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -225,10 +225,23 @@ int cmd_config(int argc, const char **argv)
break;
}
 
-   if (value == NULL)
+   if (value == NULL) {
ret = show_spec_config(set, var);
-   else
+   if (ret < 0) {
+   pr_err("%s is not configured: %s\n",
+  var, config_filename);
+   free(arg);
+   break;
+   }
+   } else {
ret = set_config(set, config_filename, var, 
value);
+   if (ret < 0) {
+   pr_err("Failed to set '%s=%s' on %s\n",
+  var, value, config_filename);
+   free(arg);
+   break;
+   }
+   }
free(arg);
}
}
-- 
2.7.4



[PATCH v4 4/5] perf config: Check not only section->from_system_config but also item's

2017-06-16 Thread Taeung Song
Currently only section->from_system_config is being checked multiple times.
items->from_system_config should also be checked, so fix it.

Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/builtin-config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index a29d96e..cf8e183 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -56,7 +56,7 @@ static int set_config(struct perf_config_set *set, const char 
*file_name)
fprintf(fp, "[%s]\n", section->name);
 
perf_config_items__for_each_entry(>items, item) {
-   if (!use_system_config && section->from_system_config)
+   if (!use_system_config && item->from_system_config)
continue;
if (item->value)
fprintf(fp, "\t%s = %s\n",
-- 
2.7.4



[PATCH v4 3/5] perf config: Finally write changed configs on config file at a time

2017-06-16 Thread Taeung Song
Currently set_config() can be repeatedly called for each
input config on the below case:

  $ perf config kmem.default=slab report.children=false ...

But it's a waste, so finally write changed configs at a time.

Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/builtin-config.c | 22 --
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index ece4558..a29d96e 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -33,8 +33,7 @@ static struct option config_options[] = {
OPT_END()
 };
 
-static int set_config(struct perf_config_set *set, const char *file_name,
- const char *var, const char *value)
+static int set_config(struct perf_config_set *set, const char *file_name)
 {
struct perf_config_section *section = NULL;
struct perf_config_item *item = NULL;
@@ -48,7 +47,6 @@ static int set_config(struct perf_config_set *set, const char 
*file_name,
if (!fp)
return -1;
 
-   perf_config_set__collect(set, file_name, var, value);
fprintf(fp, "%s\n", first_line);
 
/* overwrite configvariables */
@@ -160,6 +158,7 @@ int cmd_config(int argc, const char **argv)
struct perf_config_set *set;
char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
const char *config_filename;
+   bool changed = false;
 
argc = parse_options(argc, argv, config_options, config_usage,
 PARSE_OPT_STOP_AT_NON_OPTION);
@@ -230,15 +229,26 @@ int cmd_config(int argc, const char **argv)
goto out_err;
}
} else {
-   if (set_config(set, config_filename, var, 
value) < 0) {
-   pr_err("Failed to set '%s=%s' on %s\n",
-  var, value, config_filename);
+   if (perf_config_set__collect(set, 
config_filename,
+var, value) < 0) {
+   pr_err("Failed to add '%s=%s'\n",
+  var, value);
free(arg);
goto out_err;
}
+   changed = true;
}
free(arg);
}
+
+   if (!changed)
+   break;
+
+   if (set_config(set, config_filename) < 0) {
+   pr_err("Failed to set the configs on %s\n",
+  config_filename);
+   goto out_err;
+   }
}
 
ret = 0;
-- 
2.7.4



[PATCH v4 2/5] perf config: Refactor the code using 'ret' variable in cmd_config()

2017-06-16 Thread Taeung Song
To simplify the code related to 'ret' variable in cmd_config(),
initialize 'ret' with -1 instead of 0.

Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/builtin-config.c | 30 +-
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index bb1be79..ece4558 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -156,7 +156,7 @@ static int parse_config_arg(char *arg, char **var, char 
**value)
 
 int cmd_config(int argc, const char **argv)
 {
-   int i, ret = 0;
+   int i, ret = -1;
struct perf_config_set *set;
char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
const char *config_filename;
@@ -186,10 +186,8 @@ int cmd_config(int argc, const char **argv)
 * because of reinitializing with options config file location.
 */
set = perf_config_set__new();
-   if (!set) {
-   ret = -1;
+   if (!set)
goto out_err;
-   }
 
switch (actions) {
case ACTION_LIST:
@@ -197,10 +195,11 @@ int cmd_config(int argc, const char **argv)
pr_err("Error: takes no arguments\n");
parse_options_usage(config_usage, config_options, "l", 
1);
} else {
-   ret = show_config(set);
-   if (ret < 0)
+   if (show_config(set) < 0) {
pr_err("Nothing configured, "
   "please check your %s \n", 
config_filename);
+   goto out_err;
+   }
}
break;
default:
@@ -215,38 +214,35 @@ int cmd_config(int argc, const char **argv)
 
if (!arg) {
pr_err("%s: strdup failed\n", __func__);
-   ret = -1;
-   break;
+   goto out_err;
}
 
if (parse_config_arg(arg, , ) < 0) {
free(arg);
-   ret = -1;
-   break;
+   goto out_err;
}
 
if (value == NULL) {
-   ret = show_spec_config(set, var);
-   if (ret < 0) {
+   if (show_spec_config(set, var) < 0) {
pr_err("%s is not configured: %s\n",
   var, config_filename);
free(arg);
-   break;
+   goto out_err;
}
} else {
-   ret = set_config(set, config_filename, var, 
value);
-   if (ret < 0) {
+   if (set_config(set, config_filename, var, 
value) < 0) {
pr_err("Failed to set '%s=%s' on %s\n",
   var, value, config_filename);
free(arg);
-   break;
+   goto out_err;
}
}
free(arg);
}
}
 
-   perf_config_set__delete(set);
+   ret = 0;
 out_err:
+   perf_config_set__delete(set);
return ret;
 }
-- 
2.7.4



[PATCH v4 1/5] perf config: Check error cases of {show_spec, set}_config()

2017-06-16 Thread Taeung Song
show_spec_config() and set_config() can be called multiple times
in the loop in cmd_config().
However, their error cases weren't checked, so fix it.

Reported-by: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/builtin-config.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index 7545966..bb1be79 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -225,10 +225,23 @@ int cmd_config(int argc, const char **argv)
break;
}
 
-   if (value == NULL)
+   if (value == NULL) {
ret = show_spec_config(set, var);
-   else
+   if (ret < 0) {
+   pr_err("%s is not configured: %s\n",
+  var, config_filename);
+   free(arg);
+   break;
+   }
+   } else {
ret = set_config(set, config_filename, var, 
value);
+   if (ret < 0) {
+   pr_err("Failed to set '%s=%s' on %s\n",
+  var, value, config_filename);
+   free(arg);
+   break;
+   }
+   }
free(arg);
}
}
-- 
2.7.4



[PATCH v4 5/5] perf config: Autogenerate a config file if it does not exist on setting feature.

2017-06-16 Thread Taeung Song
Currently the users can not create a config file
in the below case.

Before:

  $ rm -f ~/.perfconfig
  $ perf config --user report.children=false

  $ cat ~/.perfconfig
  cat: /root/.perfconfig: No such file or directory

But I think it should work no matter whether a config file exists or not.

After:

  $ rm -f ~/.perfconfig
  $ perf config --user report.children=false

  $ cat ~/.perfconfig
  # this file is auto-generated.
  [report]
  children = false

NOTE:
If not free config_set after perf_config_set__init() failed,
we can do that. The config set will be freed at the tail end.
(i.e. by perf_config_set__delete() at the end of cmd_config())

Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/util/config.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 8d724f0..1cc5d80 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -691,10 +691,7 @@ struct perf_config_set *perf_config_set__new(void)
 
if (set) {
INIT_LIST_HEAD(>sections);
-   if (perf_config_set__init(set) < 0) {
-   perf_config_set__delete(set);
-   set = NULL;
-   }
+   perf_config_set__init(set);
}
 
return set;
-- 
2.7.4



[PATCH v4 5/5] perf config: Autogenerate a config file if it does not exist on setting feature.

2017-06-16 Thread Taeung Song
Currently the users can not create a config file
in the below case.

Before:

  $ rm -f ~/.perfconfig
  $ perf config --user report.children=false

  $ cat ~/.perfconfig
  cat: /root/.perfconfig: No such file or directory

But I think it should work no matter whether a config file exists or not.

After:

  $ rm -f ~/.perfconfig
  $ perf config --user report.children=false

  $ cat ~/.perfconfig
  # this file is auto-generated.
  [report]
  children = false

NOTE:
If not free config_set after perf_config_set__init() failed,
we can do that. The config set will be freed at the tail end.
(i.e. by perf_config_set__delete() at the end of cmd_config())

Cc: Jiri Olsa 
Cc: Namhyung Kim 
Signed-off-by: Taeung Song 
---
 tools/perf/util/config.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 8d724f0..1cc5d80 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -691,10 +691,7 @@ struct perf_config_set *perf_config_set__new(void)
 
if (set) {
INIT_LIST_HEAD(>sections);
-   if (perf_config_set__init(set) < 0) {
-   perf_config_set__delete(set);
-   set = NULL;
-   }
+   perf_config_set__init(set);
}
 
return set;
-- 
2.7.4



Re: [kernel-hardening] Re: [PATCH v4 06/13] iscsi: ensure RNG is seeded before use

2017-06-16 Thread Lee Duncan
On 06/16/2017 05:41 PM, Jason A. Donenfeld wrote:
> Hi Lee,
> 
> On Fri, Jun 16, 2017 at 11:58 PM, Lee Duncan  wrote:
>> It seems like what you are doing is basically "good", i.e. if there is
>> not enough random data, don't use it. But what happens in that case? The
>> authentication fails? How does the user know to wait and try again?
> 
> The process just remains in interruptible (kill-able) sleep until
> there is enough entropy, so the process doesn't need to do anything.
> If the waiting is interrupted by a signal, it returns -ESYSRESTART,
> which follows the usual semantics of restartable syscalls.
> 
> Jason
> 

In your testing, how long might a process have to wait? Are we talking
seconds? Longer? What about timeouts?

Sorry, but you're changing something that isn't exactly broken, so I just
want to be sure we're not introducing some regression, like clients
being unable to connect for the first 5 minutes after a reboot.
-- 
Lee Duncan


Re: [kernel-hardening] Re: [PATCH v4 06/13] iscsi: ensure RNG is seeded before use

2017-06-16 Thread Lee Duncan
On 06/16/2017 05:41 PM, Jason A. Donenfeld wrote:
> Hi Lee,
> 
> On Fri, Jun 16, 2017 at 11:58 PM, Lee Duncan  wrote:
>> It seems like what you are doing is basically "good", i.e. if there is
>> not enough random data, don't use it. But what happens in that case? The
>> authentication fails? How does the user know to wait and try again?
> 
> The process just remains in interruptible (kill-able) sleep until
> there is enough entropy, so the process doesn't need to do anything.
> If the waiting is interrupted by a signal, it returns -ESYSRESTART,
> which follows the usual semantics of restartable syscalls.
> 
> Jason
> 

In your testing, how long might a process have to wait? Are we talking
seconds? Longer? What about timeouts?

Sorry, but you're changing something that isn't exactly broken, so I just
want to be sure we're not introducing some regression, like clients
being unable to connect for the first 5 minutes after a reboot.
-- 
Lee Duncan


[PATCH v3] ip6_tunnel: Correct tos value in collect_md mode

2017-06-16 Thread Haishuang Yan
Same as ip_gre, geneve and vxlan, use key->tos as traffic class value.

CC: Peter Dawson 
Fixes: 0e9a709560db ("ip6_tunnel, ip6_gre: fix setting of DSCP on
encapsulated packets”)
Signed-off-by: Haishuang Yan 

---
Changes since v3:
  * Add fixes information
  * Remove obsoleted RT_TOS mask
---
 net/ipv6/ip6_tunnel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index ef99d59..9d65918 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1249,7 +1249,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
fl6.flowi6_proto = IPPROTO_IPIP;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
-   dsfield = ip6_tclass(key->label);
+   dsfield =  key->tos;
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
@@ -1320,7 +1320,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
fl6.flowi6_proto = IPPROTO_IPV6;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
-   dsfield = ip6_tclass(key->label);
+   dsfield = key->tos;
} else {
offset = ip6_tnl_parse_tlv_enc_lim(skb, 
skb_network_header(skb));
/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head 
*/
-- 
1.8.3.1





[PATCH v3] ip6_tunnel: Correct tos value in collect_md mode

2017-06-16 Thread Haishuang Yan
Same as ip_gre, geneve and vxlan, use key->tos as traffic class value.

CC: Peter Dawson 
Fixes: 0e9a709560db ("ip6_tunnel, ip6_gre: fix setting of DSCP on
encapsulated packets”)
Signed-off-by: Haishuang Yan 

---
Changes since v3:
  * Add fixes information
  * Remove obsoleted RT_TOS mask
---
 net/ipv6/ip6_tunnel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index ef99d59..9d65918 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1249,7 +1249,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
fl6.flowi6_proto = IPPROTO_IPIP;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
-   dsfield = ip6_tclass(key->label);
+   dsfield =  key->tos;
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
@@ -1320,7 +1320,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
fl6.flowi6_proto = IPPROTO_IPV6;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
-   dsfield = ip6_tclass(key->label);
+   dsfield = key->tos;
} else {
offset = ip6_tnl_parse_tlv_enc_lim(skb, 
skb_network_header(skb));
/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head 
*/
-- 
1.8.3.1





[PATCH v2 1/2] ip_tunnel: fix ip tunnel lookup in collect_md mode

2017-06-16 Thread Haishuang Yan
In collect_md mode, if the tun dev is down, it can still call
ip_tunnel_rcv to receive packets, and the rx statistics increase
improperly.

Fixes: 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata.")
Cc: Pravin B Shelar 
Signed-off-by: Haishuang Yan 

---
Change since v2:
  * Fix wrong recipient addresss
---
 net/ipv4/ip_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 0f1d876..a3caba1 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -176,7 +176,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net 
*itn,
return cand;
 
t = rcu_dereference(itn->collect_md_tun);
-   if (t)
+   if (t && (t->dev->flags & IFF_UP))
return t;
 
if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
-- 
1.8.3.1





[PATCH v2 2/2] ip6_tunnel: fix ip6 tunnel lookup in collect_md mode

2017-06-16 Thread Haishuang Yan
In collect_md mode, if the tun dev is down, it can still call
__ip6_tnl_rcv to receive packets, and the rx statistics increase
improperly.

Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
Cc: Alexei Starovoitov 
Signed-off-by: Haishuang Yan 

---
Change since v2:
  * Fix wrong recipient address
---
 net/ipv6/ip6_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 6400726..25961c7 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -171,7 +171,7 @@ static struct net_device_stats *ip6_get_stats(struct 
net_device *dev)
}
 
t = rcu_dereference(ip6n->collect_md_tun);
-   if (t)
+   if (t && (t->dev->flags & IFF_UP))
return t;
 
t = rcu_dereference(ip6n->tnls_wc[0]);
-- 
1.8.3.1





[PATCH v2 2/2] ip6_tunnel: fix ip6 tunnel lookup in collect_md mode

2017-06-16 Thread Haishuang Yan
In collect_md mode, if the tun dev is down, it can still call
__ip6_tnl_rcv to receive packets, and the rx statistics increase
improperly.

Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
Cc: Alexei Starovoitov 
Signed-off-by: Haishuang Yan 

---
Change since v2:
  * Fix wrong recipient address
---
 net/ipv6/ip6_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 6400726..25961c7 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -171,7 +171,7 @@ static struct net_device_stats *ip6_get_stats(struct 
net_device *dev)
}
 
t = rcu_dereference(ip6n->collect_md_tun);
-   if (t)
+   if (t && (t->dev->flags & IFF_UP))
return t;
 
t = rcu_dereference(ip6n->tnls_wc[0]);
-- 
1.8.3.1





[PATCH v2 1/2] ip_tunnel: fix ip tunnel lookup in collect_md mode

2017-06-16 Thread Haishuang Yan
In collect_md mode, if the tun dev is down, it can still call
ip_tunnel_rcv to receive packets, and the rx statistics increase
improperly.

Fixes: 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata.")
Cc: Pravin B Shelar 
Signed-off-by: Haishuang Yan 

---
Change since v2:
  * Fix wrong recipient addresss
---
 net/ipv4/ip_tunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 0f1d876..a3caba1 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -176,7 +176,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net 
*itn,
return cand;
 
t = rcu_dereference(itn->collect_md_tun);
-   if (t)
+   if (t && (t->dev->flags & IFF_UP))
return t;
 
if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
-- 
1.8.3.1





[PATCH V2] staging: rtl8192u: style fix

2017-06-16 Thread Derek Robson
Fixed checkpatch.pl warnings of "function definition argument FOO should
also have an identifier name"
Found using checkpatch

Signed-off-by: Derek Robson 

V1 had a vague subject
---
 drivers/staging/rtl8192u/ieee80211/ieee80211.h   | 2 +-
 drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h | 4 ++--
 drivers/staging/rtl8192u/r8192U.h| 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211.h 
b/drivers/staging/rtl8192u/ieee80211/ieee80211.h
index 899c77ed2a43..b062cad052b9 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211.h
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211.h
@@ -2187,7 +2187,7 @@ int ieee80211_encrypt_fragment(struct ieee80211_device 
*ieee,
   struct sk_buff *frag, int hdr_len);
 
 int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev);
-void ieee80211_txb_free(struct ieee80211_txb *);
+void ieee80211_txb_free(struct ieee80211_txb *txb);
 
 
 /* ieee80211_rx.c */
diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h 
b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
index 005bf89aae65..a0aa0f5be63a 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
@@ -82,8 +82,8 @@ struct ieee80211_crypt_data {
 int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops);
 int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops);
 struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name);
-void ieee80211_crypt_deinit_entries(struct ieee80211_device *, int);
-void ieee80211_crypt_deinit_handler(unsigned long);
+void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force);
+void ieee80211_crypt_deinit_handler(unsigned long data);
 void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee,
struct ieee80211_crypt_data **crypt);
 
diff --git a/drivers/staging/rtl8192u/r8192U.h 
b/drivers/staging/rtl8192u/r8192U.h
index 4c7a5e3d3e5e..51c150a39fc2 100644
--- a/drivers/staging/rtl8192u/r8192U.h
+++ b/drivers/staging/rtl8192u/r8192U.h
@@ -1147,9 +1147,9 @@ int write_nic_word(struct net_device *dev, int x, u16 y);
 int write_nic_dword(struct net_device *dev, int x, u32 y);
 void force_pci_posting(struct net_device *dev);
 
-void rtl8192_rtx_disable(struct net_device *);
-void rtl8192_rx_enable(struct net_device *);
-void rtl8192_tx_enable(struct net_device *);
+void rtl8192_rtx_disable(struct net_device *dev);
+void rtl8192_rx_enable(struct net_device *dev);
+void rtl8192_tx_enable(struct net_device *dev);
 
 void rtl8192_disassociate(struct net_device *dev);
 void rtl8185_set_rf_pins_enable(struct net_device *dev, u32 a);
-- 
2.13.0



[PATCH V2] staging: rtl8192u: style fix

2017-06-16 Thread Derek Robson
Fixed checkpatch.pl warnings of "function definition argument FOO should
also have an identifier name"
Found using checkpatch

Signed-off-by: Derek Robson 

V1 had a vague subject
---
 drivers/staging/rtl8192u/ieee80211/ieee80211.h   | 2 +-
 drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h | 4 ++--
 drivers/staging/rtl8192u/r8192U.h| 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211.h 
b/drivers/staging/rtl8192u/ieee80211/ieee80211.h
index 899c77ed2a43..b062cad052b9 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211.h
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211.h
@@ -2187,7 +2187,7 @@ int ieee80211_encrypt_fragment(struct ieee80211_device 
*ieee,
   struct sk_buff *frag, int hdr_len);
 
 int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev);
-void ieee80211_txb_free(struct ieee80211_txb *);
+void ieee80211_txb_free(struct ieee80211_txb *txb);
 
 
 /* ieee80211_rx.c */
diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h 
b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
index 005bf89aae65..a0aa0f5be63a 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
@@ -82,8 +82,8 @@ struct ieee80211_crypt_data {
 int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops);
 int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops);
 struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name);
-void ieee80211_crypt_deinit_entries(struct ieee80211_device *, int);
-void ieee80211_crypt_deinit_handler(unsigned long);
+void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force);
+void ieee80211_crypt_deinit_handler(unsigned long data);
 void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee,
struct ieee80211_crypt_data **crypt);
 
diff --git a/drivers/staging/rtl8192u/r8192U.h 
b/drivers/staging/rtl8192u/r8192U.h
index 4c7a5e3d3e5e..51c150a39fc2 100644
--- a/drivers/staging/rtl8192u/r8192U.h
+++ b/drivers/staging/rtl8192u/r8192U.h
@@ -1147,9 +1147,9 @@ int write_nic_word(struct net_device *dev, int x, u16 y);
 int write_nic_dword(struct net_device *dev, int x, u32 y);
 void force_pci_posting(struct net_device *dev);
 
-void rtl8192_rtx_disable(struct net_device *);
-void rtl8192_rx_enable(struct net_device *);
-void rtl8192_tx_enable(struct net_device *);
+void rtl8192_rtx_disable(struct net_device *dev);
+void rtl8192_rx_enable(struct net_device *dev);
+void rtl8192_tx_enable(struct net_device *dev);
 
 void rtl8192_disassociate(struct net_device *dev);
 void rtl8185_set_rf_pins_enable(struct net_device *dev, u32 a);
-- 
2.13.0



[PATCH V2] staging: rtl8723bs - remove asm includes

2017-06-16 Thread Derek Robson
Fixed checkpatch warnings "Use #include  instead of "
Found using checkpatch

Signed-off-by: Derek Robson 

V1 had a vague subject.
---
 drivers/staging/rtl8723bs/include/osdep_service_linux.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/rtl8723bs/include/osdep_service_linux.h 
b/drivers/staging/rtl8723bs/include/osdep_service_linux.h
index 486e8184b0b2..0c9b4f622fee 100644
--- a/drivers/staging/rtl8723bs/include/osdep_service_linux.h
+++ b/drivers/staging/rtl8723bs/include/osdep_service_linux.h
@@ -26,10 +26,10 @@
/* include  */
#include 
#include 
-   #include 
+   #include 
#include 
-   #include 
-   #include 
+   #include 
+   #include 
#include 
#include 
#include 
-- 
2.13.0



[PATCH V2] staging: rtl8723bs - remove asm includes

2017-06-16 Thread Derek Robson
Fixed checkpatch warnings "Use #include  instead of "
Found using checkpatch

Signed-off-by: Derek Robson 

V1 had a vague subject.
---
 drivers/staging/rtl8723bs/include/osdep_service_linux.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/rtl8723bs/include/osdep_service_linux.h 
b/drivers/staging/rtl8723bs/include/osdep_service_linux.h
index 486e8184b0b2..0c9b4f622fee 100644
--- a/drivers/staging/rtl8723bs/include/osdep_service_linux.h
+++ b/drivers/staging/rtl8723bs/include/osdep_service_linux.h
@@ -26,10 +26,10 @@
/* include  */
#include 
#include 
-   #include 
+   #include 
#include 
-   #include 
-   #include 
+   #include 
+   #include 
#include 
#include 
#include 
-- 
2.13.0



[PATCH V2] staging: unisys: visorhba - style fix

2017-06-16 Thread Derek Robson
Fixed style of permissions to octal.
Found using checkpatch

Signed-off-by: Derek Robson 

V1 had a vague subject
---
 drivers/staging/unisys/visorhba/visorhba_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/unisys/visorhba/visorhba_main.c 
b/drivers/staging/unisys/visorhba/visorhba_main.c
index 2fd31c9762c6..a6e7a6bbc428 100644
--- a/drivers/staging/unisys/visorhba/visorhba_main.c
+++ b/drivers/staging/unisys/visorhba/visorhba_main.c
@@ -1090,7 +1090,7 @@ static int visorhba_probe(struct visor_device *dev)
goto err_scsi_remove_host;
}
devdata->debugfs_info =
-   debugfs_create_file("info", S_IRUSR | S_IRGRP,
+   debugfs_create_file("info", 0440,
devdata->debugfs_dir, devdata,
_debugfs_fops);
if (!devdata->debugfs_info) {
-- 
2.13.0



[PATCH V2] staging: unisys: visorhba - style fix

2017-06-16 Thread Derek Robson
Fixed style of permissions to octal.
Found using checkpatch

Signed-off-by: Derek Robson 

V1 had a vague subject
---
 drivers/staging/unisys/visorhba/visorhba_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/unisys/visorhba/visorhba_main.c 
b/drivers/staging/unisys/visorhba/visorhba_main.c
index 2fd31c9762c6..a6e7a6bbc428 100644
--- a/drivers/staging/unisys/visorhba/visorhba_main.c
+++ b/drivers/staging/unisys/visorhba/visorhba_main.c
@@ -1090,7 +1090,7 @@ static int visorhba_probe(struct visor_device *dev)
goto err_scsi_remove_host;
}
devdata->debugfs_info =
-   debugfs_create_file("info", S_IRUSR | S_IRGRP,
+   debugfs_create_file("info", 0440,
devdata->debugfs_dir, devdata,
_debugfs_fops);
if (!devdata->debugfs_info) {
-- 
2.13.0



Re: [PATCH v2] ip6_tunnel: Correct tos value in collect_md mode

2017-06-16 Thread 严海双


> On 16 Jun 2017, at 10:44 PM, Daniel Borkmann  wrote:
> 
> On 06/15/2017 05:54 AM, Peter Dawson wrote:
>> On Thu, 15 Jun 2017 10:30:29 +0800
>> Haishuang Yan  wrote:
>> 
>>> Same as ip_gre, geneve and vxlan, use key->tos as tos value.
>>> 
>>> CC: Peter Dawson 
>>> Fixes: 0e9a709560db ("ip6_tunnel, ip6_gre: fix setting of DSCP on
>>> encapsulated packets")
>>> Suggested-by: Daniel Borkmann 
>>> Signed-off-by: Haishuang Yan 
>>> 
>>> ---
>>> Changes since v2:
>>>   * Add fixes information
>>>   * mask key->tos with RT_TOS() suggested by Daniel
>>> ---
>>>  net/ipv6/ip6_tunnel.c | 4 ++--
>>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>> 
>>> diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
>>> index ef99d59..6400726 100644
>>> --- a/net/ipv6/ip6_tunnel.c
>>> +++ b/net/ipv6/ip6_tunnel.c
>>> @@ -1249,7 +1249,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct 
>>> net_device *dev, __u8 dsfield,
>>> fl6.flowi6_proto = IPPROTO_IPIP;
>>> fl6.daddr = key->u.ipv6.dst;
>>> fl6.flowlabel = key->label;
>>> -   dsfield = ip6_tclass(key->label);
>>> +   dsfield =  RT_TOS(key->tos);
>>> } else {
>>> if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
>>> encap_limit = t->parms.encap_limit;
>>> @@ -1320,7 +1320,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct 
>>> net_device *dev, __u8 dsfield,
>>> fl6.flowi6_proto = IPPROTO_IPV6;
>>> fl6.daddr = key->u.ipv6.dst;
>>> fl6.flowlabel = key->label;
>>> -   dsfield = ip6_tclass(key->label);
>>> +   dsfield = RT_TOS(key->tos);
>>> } else {
>>> offset = ip6_tnl_parse_tlv_enc_lim(skb, 
>>> skb_network_header(skb));
>>> /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head 
>>> */
>> 
>> I don't think it is correct to apply RT_TOS
>> 
>> Here is my understanding based on the RFCs.
>> 
>> IPv4/6 Header:0 |0 1 2 3 |0 1 2 3 |0 1 2 3 |0 1 2 3 |
>> RFC2460(IPv6)   |Version | Traffic Class   ||
>> RFC2474(IPv6)   |Version | DSCP|ECN||
>> RFC2474(IPv4)   |Version |  IHL   |DSCP |ECN|
>> RFC1349(IPv4)   |Version |  IHL   | PREC |  TOS   |X|
>> RFC791 (IPv4)   |Version |  IHL   |  TOS|
>> 
>> u8 key->tos stores the full 8bits of Traffic class from an IPv6 header and;
>> u8 key->tos stores the full 8bits of TOS(RFC791) from an IPv4 header
>> u8 ip6_tclass will return the full 8bits of Traffic Class from an IPv6 
>> flowlabel
>> 
>> RT_TOS will return the RFC1349 4bit TOS field.
>> 
>> Applying RT_TOS to a key->tos will result in lost information and the 
>> inclusion of 1 bit of ECN if the original field was a DSCP+ECN.
>> 
>> Based on this understanding of the RFCs (but not years of experience) and 
>> since RFC1349 has been obsoleted by RFC2474 I think the use of RT_TOS should 
>> be deprecated.
>> 
>> This being said, dsfield = ip6_tclass(key->label) = key->tos isn't fully 
>> correct either because the result will contain the ECN bits as well as the 
>> DSCP.
>> 
>> I agree that code should be consistent, but not where there is a potential 
>> issue.
> 
> Yeah, you're right. Looks like initial dsfield = key->tos diff was
> the better choice then, sorry for my confusing comment.
> 
> For example, bpf_skb_set_tunnel_key() helper that populates the collect
> metadata as one user of this infra masks the key->label so that it really
> only holds the label meaning previous dsfield = ip6_tclass(key->label)
> will always be 0 in that case unlike key->tos that actually gets populated
> and would propagate it.
> 
Okay, I will change the commit back to initial version, thanks everyone.





Re: [PATCH v2] ip6_tunnel: Correct tos value in collect_md mode

2017-06-16 Thread 严海双


> On 16 Jun 2017, at 10:44 PM, Daniel Borkmann  wrote:
> 
> On 06/15/2017 05:54 AM, Peter Dawson wrote:
>> On Thu, 15 Jun 2017 10:30:29 +0800
>> Haishuang Yan  wrote:
>> 
>>> Same as ip_gre, geneve and vxlan, use key->tos as tos value.
>>> 
>>> CC: Peter Dawson 
>>> Fixes: 0e9a709560db ("ip6_tunnel, ip6_gre: fix setting of DSCP on
>>> encapsulated packets")
>>> Suggested-by: Daniel Borkmann 
>>> Signed-off-by: Haishuang Yan 
>>> 
>>> ---
>>> Changes since v2:
>>>   * Add fixes information
>>>   * mask key->tos with RT_TOS() suggested by Daniel
>>> ---
>>>  net/ipv6/ip6_tunnel.c | 4 ++--
>>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>> 
>>> diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
>>> index ef99d59..6400726 100644
>>> --- a/net/ipv6/ip6_tunnel.c
>>> +++ b/net/ipv6/ip6_tunnel.c
>>> @@ -1249,7 +1249,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct 
>>> net_device *dev, __u8 dsfield,
>>> fl6.flowi6_proto = IPPROTO_IPIP;
>>> fl6.daddr = key->u.ipv6.dst;
>>> fl6.flowlabel = key->label;
>>> -   dsfield = ip6_tclass(key->label);
>>> +   dsfield =  RT_TOS(key->tos);
>>> } else {
>>> if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
>>> encap_limit = t->parms.encap_limit;
>>> @@ -1320,7 +1320,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct 
>>> net_device *dev, __u8 dsfield,
>>> fl6.flowi6_proto = IPPROTO_IPV6;
>>> fl6.daddr = key->u.ipv6.dst;
>>> fl6.flowlabel = key->label;
>>> -   dsfield = ip6_tclass(key->label);
>>> +   dsfield = RT_TOS(key->tos);
>>> } else {
>>> offset = ip6_tnl_parse_tlv_enc_lim(skb, 
>>> skb_network_header(skb));
>>> /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head 
>>> */
>> 
>> I don't think it is correct to apply RT_TOS
>> 
>> Here is my understanding based on the RFCs.
>> 
>> IPv4/6 Header:0 |0 1 2 3 |0 1 2 3 |0 1 2 3 |0 1 2 3 |
>> RFC2460(IPv6)   |Version | Traffic Class   ||
>> RFC2474(IPv6)   |Version | DSCP|ECN||
>> RFC2474(IPv4)   |Version |  IHL   |DSCP |ECN|
>> RFC1349(IPv4)   |Version |  IHL   | PREC |  TOS   |X|
>> RFC791 (IPv4)   |Version |  IHL   |  TOS|
>> 
>> u8 key->tos stores the full 8bits of Traffic class from an IPv6 header and;
>> u8 key->tos stores the full 8bits of TOS(RFC791) from an IPv4 header
>> u8 ip6_tclass will return the full 8bits of Traffic Class from an IPv6 
>> flowlabel
>> 
>> RT_TOS will return the RFC1349 4bit TOS field.
>> 
>> Applying RT_TOS to a key->tos will result in lost information and the 
>> inclusion of 1 bit of ECN if the original field was a DSCP+ECN.
>> 
>> Based on this understanding of the RFCs (but not years of experience) and 
>> since RFC1349 has been obsoleted by RFC2474 I think the use of RT_TOS should 
>> be deprecated.
>> 
>> This being said, dsfield = ip6_tclass(key->label) = key->tos isn't fully 
>> correct either because the result will contain the ECN bits as well as the 
>> DSCP.
>> 
>> I agree that code should be consistent, but not where there is a potential 
>> issue.
> 
> Yeah, you're right. Looks like initial dsfield = key->tos diff was
> the better choice then, sorry for my confusing comment.
> 
> For example, bpf_skb_set_tunnel_key() helper that populates the collect
> metadata as one user of this infra masks the key->label so that it really
> only holds the label meaning previous dsfield = ip6_tclass(key->label)
> will always be 0 in that case unlike key->tos that actually gets populated
> and would propagate it.
> 
Okay, I will change the commit back to initial version, thanks everyone.





[PATCH 1/4] x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"

2017-06-16 Thread Len Brown
From: Len Brown 

cpufreq_quick_get() allows cpufreq drivers to over-ride cpu_khz
that is otherwise reported in x86 /proc/cpuinfo "cpu MHz".

There are four problems with this scheme,
any of them is sufficient justification to delete it.

1. Depending on which cpufreq driver is loaded, the behavior
   of this field is different.

2. Distros complain that they have to explain to users
   why and how this field changes.  Distros have requested a constant.

3. The two major providers of this information, acpi_cpufreq
   and intel_pstate, both "get it wrong" in different ways.

   acpi_cpufreq lies to the user by telling them that
   they are running at whatever frequency was last
   requested by software.

   intel_pstate lies to the user by telling them that
   they are running at the average frequency computed
   over an undefined measurement.  But an average computed
   over an undefined interval, is itself, undefined...

4. On modern processors, user space utilities, such as
   turbostat(1), are more accurate and more precise, while
   supporting concurrent measurement over arbitrary intervals.

Users who have been consulting /proc/cpuinfo to
track changing CPU frequency will be disappointed that
it no longer wiggles -- perhaps being unaware of the
limitations of the information they have been consuming.

Yes, they can change their scripts to look in sysfs
cpufreq/scaling_cur_frequency.  Here they will find the same
data of dubious quality here removed from /proc/cpuinfo.
The value in sysfs will be addressed in a subsequent patch
to address issues 1-3, above.

Issue 4 will remain -- users that really care about
accurate frequency information should not be using either
proc or sysfs kernel interfaces.
They should be using turbostat(8), or a similar
purpose-built analysis tool.

Signed-off-by: Len Brown 
---
 arch/x86/kernel/cpu/proc.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 6df621a..218f798 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -2,7 +2,6 @@
 #include 
 #include 
 #include 
-#include 
 
 /*
  * Get CPU information for use by the procfs.
@@ -76,14 +75,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
 
-   if (cpu_has(c, X86_FEATURE_TSC)) {
-   unsigned int freq = cpufreq_quick_get(cpu);
-
-   if (!freq)
-   freq = cpu_khz;
+   if (cpu_has(c, X86_FEATURE_TSC))
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-  freq / 1000, (freq % 1000));
-   }
+  cpu_khz / 1000, (cpu_khz % 1000));
 
/* Cache size */
if (c->x86_cache_size >= 0)
-- 
2.7.4



[PATCH 1/4] x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"

2017-06-16 Thread Len Brown
From: Len Brown 

cpufreq_quick_get() allows cpufreq drivers to over-ride cpu_khz
that is otherwise reported in x86 /proc/cpuinfo "cpu MHz".

There are four problems with this scheme,
any of them is sufficient justification to delete it.

1. Depending on which cpufreq driver is loaded, the behavior
   of this field is different.

2. Distros complain that they have to explain to users
   why and how this field changes.  Distros have requested a constant.

3. The two major providers of this information, acpi_cpufreq
   and intel_pstate, both "get it wrong" in different ways.

   acpi_cpufreq lies to the user by telling them that
   they are running at whatever frequency was last
   requested by software.

   intel_pstate lies to the user by telling them that
   they are running at the average frequency computed
   over an undefined measurement.  But an average computed
   over an undefined interval, is itself, undefined...

4. On modern processors, user space utilities, such as
   turbostat(1), are more accurate and more precise, while
   supporting concurrent measurement over arbitrary intervals.

Users who have been consulting /proc/cpuinfo to
track changing CPU frequency will be disappointed that
it no longer wiggles -- perhaps being unaware of the
limitations of the information they have been consuming.

Yes, they can change their scripts to look in sysfs
cpufreq/scaling_cur_frequency.  Here they will find the same
data of dubious quality here removed from /proc/cpuinfo.
The value in sysfs will be addressed in a subsequent patch
to address issues 1-3, above.

Issue 4 will remain -- users that really care about
accurate frequency information should not be using either
proc or sysfs kernel interfaces.
They should be using turbostat(8), or a similar
purpose-built analysis tool.

Signed-off-by: Len Brown 
---
 arch/x86/kernel/cpu/proc.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 6df621a..218f798 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -2,7 +2,6 @@
 #include 
 #include 
 #include 
-#include 
 
 /*
  * Get CPU information for use by the procfs.
@@ -76,14 +75,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
 
-   if (cpu_has(c, X86_FEATURE_TSC)) {
-   unsigned int freq = cpufreq_quick_get(cpu);
-
-   if (!freq)
-   freq = cpu_khz;
+   if (cpu_has(c, X86_FEATURE_TSC))
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-  freq / 1000, (freq % 1000));
-   }
+  cpu_khz / 1000, (cpu_khz % 1000));
 
/* Cache size */
if (c->x86_cache_size >= 0)
-- 
2.7.4



[GIT PULL] x86,cpufreq: unify APERF/MPERF computation

2017-06-16 Thread Len Brown

In-Reply-To: 

Hi Rafael,

This patch series has 3 goals:

1. Make "cpu MHz" in /proc/cpuinfo supportable.

2. Make /sys/.../cpufreq/scaling_cur_freq meaningful
   and consistent on modern x86 systems.

3. Use 1. and 2. to remove scheduler and cpufreq overhead

There are 3 main changes since this series was proposed
about a year ago:

This update responds to distro feedback to make /proc/cpuinfo
"cpu MHz" constant.  Originally, we had proposed making it return
the same dynamic value as cpufreq sysfs.

Some community members suggested that sysfs MHz values should
be meaningful, even down to 10ms intervals.  So this has been
changed, versus the original proposal to not re-compute
at intervals shorter than 100ms.

(For those who really care about observing frequency, the
 recommendation remains to use turbostat(8) or equivalent utility,
 which can reliably measure concurrent intervals of arbitrary length)

The intel_pstate sampling mechanism has changed.
Originally this series removed an intel_pstate timer in HWP mode.
Now it removes the analogous scheduler call-back.

Most recently, in response to posting this patch on the list
about 10-days ago, the patch to remove frequency calculation
from inside intel_pstate was dropped, in order to maintain compatibility
with tracing scripts.  Also, the order of the last two patches
has been exchanged.

Please let me know if you see any issues with this series.

thanks!
Len Brown, Intel Open Source Technology Center

The following changes since commit 3c2993b8c6143d8a5793746a54eba8f86f95240f:

  Linux 4.12-rc4 (2017-06-04 16:47:43 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git x86

for you to fetch changes up to d020eed98440faa4a529c621f881aa9fda296956:

  intel_pstate: skip scheduler hook when in "performance" mode. (2017-06-16 
19:11:13 -0700)


Len Brown (4):
  x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"
  x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF
  intel_pstate: delete scheduler hook in HWP mode
  intel_pstate: skip scheduler hook when in "performance" mode.

 arch/x86/kernel/cpu/Makefile |  1 +
 arch/x86/kernel/cpu/aperfmperf.c | 82 
 arch/x86/kernel/cpu/proc.c   | 10 +
 drivers/cpufreq/cpufreq.c|  7 +++-
 drivers/cpufreq/intel_pstate.c   | 18 +++--
 include/linux/cpufreq.h  | 13 +++
 6 files changed, 109 insertions(+), 22 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/aperfmperf.c


[PATCH 3/4] intel_pstate: delete scheduler hook in HWP mode

2017-06-16 Thread Len Brown
From: Len Brown 

The cpufreq/scaling_cur_freq sysfs attribute is now provided by
shared x86 cpufreq code on modern x86 systems, including
all systems supported by the intel_pstate driver.

In HWP mode, maintaining that value was the sole purpose of
the scheduler hook, intel_pstate_update_util_hwp(),
so it can now be removed.

Signed-off-by: Len Brown 
---
 drivers/cpufreq/intel_pstate.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index b7de5bd..4ec5668 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1732,16 +1732,6 @@ static void intel_pstate_adjust_pstate(struct cpudata 
*cpu, int target_pstate)
fp_toint(cpu->iowait_boost * 100));
 }
 
-static void intel_pstate_update_util_hwp(struct update_util_data *data,
-u64 time, unsigned int flags)
-{
-   struct cpudata *cpu = container_of(data, struct cpudata, update_util);
-   u64 delta_ns = time - cpu->sample.time;
-
-   if ((s64)delta_ns >= INTEL_PSTATE_HWP_SAMPLING_INTERVAL)
-   intel_pstate_sample(cpu, time);
-}
-
 static void intel_pstate_update_util_pid(struct update_util_data *data,
 u64 time, unsigned int flags)
 {
@@ -1933,6 +1923,9 @@ static void intel_pstate_set_update_util_hook(unsigned 
int cpu_num)
 {
struct cpudata *cpu = all_cpu_data[cpu_num];
 
+   if (hwp_active)
+   return;
+
if (cpu->update_util_set)
return;
 
@@ -2557,7 +2550,6 @@ static int __init intel_pstate_init(void)
} else {
hwp_active++;
intel_pstate.attr = hwp_cpufreq_attrs;
-   pstate_funcs.update_util = intel_pstate_update_util_hwp;
goto hwp_cpu_matched;
}
} else {
-- 
2.7.4



[GIT PULL] x86,cpufreq: unify APERF/MPERF computation

2017-06-16 Thread Len Brown

In-Reply-To: 

Hi Rafael,

This patch series has 3 goals:

1. Make "cpu MHz" in /proc/cpuinfo supportable.

2. Make /sys/.../cpufreq/scaling_cur_freq meaningful
   and consistent on modern x86 systems.

3. Use 1. and 2. to remove scheduler and cpufreq overhead

There are 3 main changes since this series was proposed
about a year ago:

This update responds to distro feedback to make /proc/cpuinfo
"cpu MHz" constant.  Originally, we had proposed making it return
the same dynamic value as cpufreq sysfs.

Some community members suggested that sysfs MHz values should
be meaningful, even down to 10ms intervals.  So this has been
changed, versus the original proposal to not re-compute
at intervals shorter than 100ms.

(For those who really care about observing frequency, the
 recommendation remains to use turbostat(8) or equivalent utility,
 which can reliably measure concurrent intervals of arbitrary length)

The intel_pstate sampling mechanism has changed.
Originally this series removed an intel_pstate timer in HWP mode.
Now it removes the analogous scheduler call-back.

Most recently, in response to posting this patch on the list
about 10-days ago, the patch to remove frequency calculation
from inside intel_pstate was dropped, in order to maintain compatibility
with tracing scripts.  Also, the order of the last two patches
has been exchanged.

Please let me know if you see any issues with this series.

thanks!
Len Brown, Intel Open Source Technology Center

The following changes since commit 3c2993b8c6143d8a5793746a54eba8f86f95240f:

  Linux 4.12-rc4 (2017-06-04 16:47:43 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git x86

for you to fetch changes up to d020eed98440faa4a529c621f881aa9fda296956:

  intel_pstate: skip scheduler hook when in "performance" mode. (2017-06-16 
19:11:13 -0700)


Len Brown (4):
  x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"
  x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF
  intel_pstate: delete scheduler hook in HWP mode
  intel_pstate: skip scheduler hook when in "performance" mode.

 arch/x86/kernel/cpu/Makefile |  1 +
 arch/x86/kernel/cpu/aperfmperf.c | 82 
 arch/x86/kernel/cpu/proc.c   | 10 +
 drivers/cpufreq/cpufreq.c|  7 +++-
 drivers/cpufreq/intel_pstate.c   | 18 +++--
 include/linux/cpufreq.h  | 13 +++
 6 files changed, 109 insertions(+), 22 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/aperfmperf.c


[PATCH 3/4] intel_pstate: delete scheduler hook in HWP mode

2017-06-16 Thread Len Brown
From: Len Brown 

The cpufreq/scaling_cur_freq sysfs attribute is now provided by
shared x86 cpufreq code on modern x86 systems, including
all systems supported by the intel_pstate driver.

In HWP mode, maintaining that value was the sole purpose of
the scheduler hook, intel_pstate_update_util_hwp(),
so it can now be removed.

Signed-off-by: Len Brown 
---
 drivers/cpufreq/intel_pstate.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index b7de5bd..4ec5668 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1732,16 +1732,6 @@ static void intel_pstate_adjust_pstate(struct cpudata 
*cpu, int target_pstate)
fp_toint(cpu->iowait_boost * 100));
 }
 
-static void intel_pstate_update_util_hwp(struct update_util_data *data,
-u64 time, unsigned int flags)
-{
-   struct cpudata *cpu = container_of(data, struct cpudata, update_util);
-   u64 delta_ns = time - cpu->sample.time;
-
-   if ((s64)delta_ns >= INTEL_PSTATE_HWP_SAMPLING_INTERVAL)
-   intel_pstate_sample(cpu, time);
-}
-
 static void intel_pstate_update_util_pid(struct update_util_data *data,
 u64 time, unsigned int flags)
 {
@@ -1933,6 +1923,9 @@ static void intel_pstate_set_update_util_hook(unsigned 
int cpu_num)
 {
struct cpudata *cpu = all_cpu_data[cpu_num];
 
+   if (hwp_active)
+   return;
+
if (cpu->update_util_set)
return;
 
@@ -2557,7 +2550,6 @@ static int __init intel_pstate_init(void)
} else {
hwp_active++;
intel_pstate.attr = hwp_cpufreq_attrs;
-   pstate_funcs.update_util = intel_pstate_update_util_hwp;
goto hwp_cpu_matched;
}
} else {
-- 
2.7.4



[PATCH 4/4] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Len Brown
From: Len Brown 

When the governor is set to "performance", intel_pstate does not
need the scheduler hook for doing any calculations.  Under these
conditions, its only purpose is to continue to maintain
cpufreq/scaling_cur_freq.

The cpufreq/scaling_cur_freq sysfs attribute is now provided by
shared x86 cpufreq code on modern x86 systems, including
all systems supported by the intel_pstate driver.

So in "performance" governor mode, the scheduler hook can be skipped.
This applies to both in Software and Hardware P-state control modes.

Suggested-by: Srinivas Pandruvada 
Signed-off-by: Len Brown 
---
 drivers/cpufreq/intel_pstate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4ec5668..4538182 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2031,10 +2031,10 @@ static int intel_pstate_set_policy(struct 
cpufreq_policy *policy)
 */
intel_pstate_clear_update_util_hook(policy->cpu);
intel_pstate_max_within_limits(cpu);
+   } else {
+   intel_pstate_set_update_util_hook(policy->cpu);
}
 
-   intel_pstate_set_update_util_hook(policy->cpu);
-
if (hwp_active)
intel_pstate_hwp_set(policy->cpu);
 
-- 
2.7.4



[PATCH V2] staging: sm750fb - style fix

2017-06-16 Thread Derek Robson
Fixed checkpatch.pl warnings of the form "function definition argument
'foo' should also have an identifier name" in header files.

Signed-off-by: Derek Robson 

V1 had vague subject
---
 drivers/staging/sm750fb/sm750.h | 24 
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/staging/sm750fb/sm750.h b/drivers/staging/sm750fb/sm750.h
index 5b186dafedec..4386122799b2 100644
--- a/drivers/staging/sm750fb/sm750.h
+++ b/drivers/staging/sm750fb/sm750.h
@@ -189,14 +189,22 @@ void hw_sm750_initAccel(struct sm750_dev *sm750_dev);
 int hw_sm750_deWait(void);
 int hw_sm750le_deWait(void);
 
-int hw_sm750_output_setMode(struct lynxfb_output*, struct fb_var_screeninfo*,
-   struct fb_fix_screeninfo*);
-int hw_sm750_crtc_checkMode(struct lynxfb_crtc*, struct fb_var_screeninfo*);
-int hw_sm750_crtc_setMode(struct lynxfb_crtc*, struct fb_var_screeninfo*,
- struct fb_fix_screeninfo*);
-int hw_sm750_setColReg(struct lynxfb_crtc*, ushort, ushort, ushort, ushort);
-int hw_sm750_setBLANK(struct lynxfb_output*, int);
-int hw_sm750le_setBLANK(struct lynxfb_output*, int);
+int hw_sm750_output_setMode(struct lynxfb_output *output,
+   struct fb_var_screeninfo *var,
+   struct fb_fix_screeninfo *fix);
+
+int hw_sm750_crtc_checkMode(struct lynxfb_crtc *crtc,
+   struct fb_var_screeninfo *var);
+
+int hw_sm750_crtc_setMode(struct lynxfb_crtc *crtc,
+ struct fb_var_screeninfo *var,
+ struct fb_fix_screeninfo *fix);
+
+int hw_sm750_setColReg(struct lynxfb_crtc *crtc, ushort index,
+  ushort red, ushort green, ushort blue);
+
+int hw_sm750_setBLANK(struct lynxfb_output *output, int blank);
+int hw_sm750le_setBLANK(struct lynxfb_output *output, int blank);
 int hw_sm750_pan_display(struct lynxfb_crtc *crtc,
 const struct fb_var_screeninfo *var,
 const struct fb_info *info);
-- 
2.13.0



[PATCH 4/4] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Len Brown
From: Len Brown 

When the governor is set to "performance", intel_pstate does not
need the scheduler hook for doing any calculations.  Under these
conditions, its only purpose is to continue to maintain
cpufreq/scaling_cur_freq.

The cpufreq/scaling_cur_freq sysfs attribute is now provided by
shared x86 cpufreq code on modern x86 systems, including
all systems supported by the intel_pstate driver.

So in "performance" governor mode, the scheduler hook can be skipped.
This applies to both in Software and Hardware P-state control modes.

Suggested-by: Srinivas Pandruvada 
Signed-off-by: Len Brown 
---
 drivers/cpufreq/intel_pstate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4ec5668..4538182 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2031,10 +2031,10 @@ static int intel_pstate_set_policy(struct 
cpufreq_policy *policy)
 */
intel_pstate_clear_update_util_hook(policy->cpu);
intel_pstate_max_within_limits(cpu);
+   } else {
+   intel_pstate_set_update_util_hook(policy->cpu);
}
 
-   intel_pstate_set_update_util_hook(policy->cpu);
-
if (hwp_active)
intel_pstate_hwp_set(policy->cpu);
 
-- 
2.7.4



[PATCH V2] staging: sm750fb - style fix

2017-06-16 Thread Derek Robson
Fixed checkpatch.pl warnings of the form "function definition argument
'foo' should also have an identifier name" in header files.

Signed-off-by: Derek Robson 

V1 had vague subject
---
 drivers/staging/sm750fb/sm750.h | 24 
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/staging/sm750fb/sm750.h b/drivers/staging/sm750fb/sm750.h
index 5b186dafedec..4386122799b2 100644
--- a/drivers/staging/sm750fb/sm750.h
+++ b/drivers/staging/sm750fb/sm750.h
@@ -189,14 +189,22 @@ void hw_sm750_initAccel(struct sm750_dev *sm750_dev);
 int hw_sm750_deWait(void);
 int hw_sm750le_deWait(void);
 
-int hw_sm750_output_setMode(struct lynxfb_output*, struct fb_var_screeninfo*,
-   struct fb_fix_screeninfo*);
-int hw_sm750_crtc_checkMode(struct lynxfb_crtc*, struct fb_var_screeninfo*);
-int hw_sm750_crtc_setMode(struct lynxfb_crtc*, struct fb_var_screeninfo*,
- struct fb_fix_screeninfo*);
-int hw_sm750_setColReg(struct lynxfb_crtc*, ushort, ushort, ushort, ushort);
-int hw_sm750_setBLANK(struct lynxfb_output*, int);
-int hw_sm750le_setBLANK(struct lynxfb_output*, int);
+int hw_sm750_output_setMode(struct lynxfb_output *output,
+   struct fb_var_screeninfo *var,
+   struct fb_fix_screeninfo *fix);
+
+int hw_sm750_crtc_checkMode(struct lynxfb_crtc *crtc,
+   struct fb_var_screeninfo *var);
+
+int hw_sm750_crtc_setMode(struct lynxfb_crtc *crtc,
+ struct fb_var_screeninfo *var,
+ struct fb_fix_screeninfo *fix);
+
+int hw_sm750_setColReg(struct lynxfb_crtc *crtc, ushort index,
+  ushort red, ushort green, ushort blue);
+
+int hw_sm750_setBLANK(struct lynxfb_output *output, int blank);
+int hw_sm750le_setBLANK(struct lynxfb_output *output, int blank);
 int hw_sm750_pan_display(struct lynxfb_crtc *crtc,
 const struct fb_var_screeninfo *var,
 const struct fb_info *info);
-- 
2.13.0



  1   2   3   4   5   6   7   8   9   10   >