Re: [PATCH 0/3] Remove x86-specific code from generic headers

2019-07-12 Thread Thiago Jung Bauermann


I forgot to mark this series as v2 when generating the patches.
Sorry for the confusion.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


[PATCH 2/3] DMA mapping: Move SME handling to x86-specific files

2019-07-12 Thread Thiago Jung Bauermann
Secure Memory Encryption is an x86-specific feature, so it shouldn't appear
in generic kernel code.

In the DMA mapping code, Christoph Hellwig pointed out that "There is no reason
why we should have a special debug printk just for one specific reason why
there is a requirement for a large DMA mask", so dma_check_mask() is simply
removed.

In the SWIOTLB code, there's no need to mention which memory encryption feature
is active, so use a more generic warning. Besides, other architectures will
have different names for similar technologies.

Signed-off-by: Thiago Jung Bauermann 
---
 arch/s390/include/asm/mem_encrypt.h |  4 +---
 arch/x86/include/asm/mem_encrypt.h  | 10 ++
 include/linux/mem_encrypt.h | 14 +-
 kernel/dma/mapping.c|  8 
 kernel/dma/swiotlb.c|  3 +--
 5 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h
index 3eb018508190..ff813a56bc30 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -4,9 +4,7 @@
 
 #ifndef __ASSEMBLY__
 
-#define sme_me_mask  0ULL
-
-static inline bool sme_active(void) { return false; }
+static inline bool mem_encrypt_active(void) { return false; }
 extern bool sev_active(void);
 
 int set_memory_encrypted(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 0c196c47d621..848ce43b9040 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -92,6 +92,16 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
 
 extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
 
+static inline bool mem_encrypt_active(void)
+{
+   return sme_me_mask;
+}
+
+static inline u64 sme_get_me_mask(void)
+{
+   return sme_me_mask;
+}
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 470bd53a89df..0c5b0ff9eb29 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -18,23 +18,11 @@
 
 #else  /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
 
-#define sme_me_mask  0ULL
-
-static inline bool sme_active(void) { return false; }
+static inline bool mem_encrypt_active(void) { return false; }
 static inline bool sev_active(void) { return false; }
 
 #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
 
-static inline bool mem_encrypt_active(void)
-{
-   return sme_me_mask;
-}
-
-static inline u64 sme_get_me_mask(void)
-{
-   return sme_me_mask;
-}
-
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 /*
  * The __sme_set() and __sme_clr() macros are useful for adding or removing
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f7afdadb6770..b53fc7ec4914 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -291,12 +291,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 }
 EXPORT_SYMBOL(dma_free_attrs);
 
-static inline void dma_check_mask(struct device *dev, u64 mask)
-{
-   if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
-   dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
-}
-
 int dma_supported(struct device *dev, u64 mask)
 {
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -321,7 +315,6 @@ int dma_set_mask(struct device *dev, u64 mask)
return -EIO;
 
arch_dma_set_mask(dev, mask);
-   dma_check_mask(dev, mask);
*dev->dma_mask = mask;
return 0;
 }
@@ -333,7 +326,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
if (!dma_supported(dev, mask))
return -EIO;
 
-   dma_check_mask(dev, mask);
dev->coherent_dma_mask = mask;
return 0;
 }
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 62fa5a82a065..e52401f94e91 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -459,8 +459,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
panic("Can not allocate SWIOTLB buffer earlier and can't now 
provide you with the DMA bounce buffer");
 
if (mem_encrypt_active())
-   pr_warn_once("%s is active and system is using DMA bounce 
buffers\n",
-sme_active() ? "SME" : "SEV");
+   pr_warn_once("Memory encryption is active and system is using 
DMA bounce buffers\n");
 
mask = dma_get_seg_boundary(hwdev);
 


[PATCH 3/3] fs/core/vmcore: Move sev_active() reference to x86 arch code

2019-07-12 Thread Thiago Jung Bauermann
Secure Encrypted Virtualization is an x86-specific feature, so it shouldn't
appear in generic kernel code because it forces non-x86 architectures to
define the sev_active() function, which doesn't make a lot of sense.

To solve this problem, add an x86 elfcorehdr_read() function to override
the generic weak implementation. To do that, it's necessary to make
read_from_oldmem() public so that it can be used outside of vmcore.c.

Signed-off-by: Thiago Jung Bauermann 
---
 arch/x86/kernel/crash_dump_64.c |  5 +
 fs/proc/vmcore.c|  8 
 include/linux/crash_dump.h  | 14 ++
 include/linux/mem_encrypt.h |  1 -
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 22369dd5de3b..045e82e8945b 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -70,3 +70,8 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
 {
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
 }
+
+ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+   return read_from_oldmem(buf, count, ppos, 0, sev_active());
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 57957c91c6df..ca1f20bedd8c 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -100,9 +100,9 @@ static int pfn_is_ram(unsigned long pfn)
 }
 
 /* Reads a page from the oldmem device from given offset. */
-static ssize_t read_from_oldmem(char *buf, size_t count,
-   u64 *ppos, int userbuf,
-   bool encrypted)
+ssize_t read_from_oldmem(char *buf, size_t count,
+u64 *ppos, int userbuf,
+bool encrypted)
 {
unsigned long pfn, offset;
size_t nr_bytes;
@@ -166,7 +166,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
  */
 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-   return read_from_oldmem(buf, count, ppos, 0, sev_active());
+   return read_from_oldmem(buf, count, ppos, 0, false);
 }
 
 /*
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index f774c5eb9e3c..4664fc1871de 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -115,4 +115,18 @@ static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
return -EOPNOTSUPP;
 }
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+#ifdef CONFIG_PROC_VMCORE
+ssize_t read_from_oldmem(char *buf, size_t count,
+u64 *ppos, int userbuf,
+bool encrypted);
+#else
+static inline ssize_t read_from_oldmem(char *buf, size_t count,
+  u64 *ppos, int userbuf,
+  bool encrypted)
+{
+   return -EOPNOTSUPP;
+}
+#endif /* CONFIG_PROC_VMCORE */
+
 #endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 0c5b0ff9eb29..5c4a18a91f89 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -19,7 +19,6 @@
 #else  /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
 
 static inline bool mem_encrypt_active(void) { return false; }
-static inline bool sev_active(void) { return false; }
 
 #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
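
(A note on the mechanism used above: elfcorehdr_read() in fs/proc/vmcore.c is
declared __weak, so the x86 definition added in crash_dump_64.c simply wins at
link time. A minimal, self-contained sketch of the pattern -- the function and
return values here are illustrative only, not taken from the kernel:

/* generic.c: default implementation, marked weak so it can be overridden */
ssize_t __attribute__((weak)) read_header(char *buf, size_t count)
{
	return 0;	/* generic fallback: nothing special to do */
}

/* arch.c: strong definition with the same signature; the linker prefers it */
ssize_t read_header(char *buf, size_t count)
{
	return -1;	/* arch-specific behaviour would go here */
}

The kernel spells the attribute as __weak, as seen in the vmcore.c hunk above.)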
 



[PATCH 1/3] x86, s390: Move ARCH_HAS_MEM_ENCRYPT definition to arch/Kconfig

2019-07-12 Thread Thiago Jung Bauermann
powerpc is also going to use this feature, so put it in a generic location.

Signed-off-by: Thiago Jung Bauermann 
Reviewed-by: Thomas Gleixner 
---
 arch/Kconfig  | 3 +++
 arch/s390/Kconfig | 3 ---
 arch/x86/Kconfig  | 4 +---
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index c47b328eada0..4ef3499d4480 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -927,6 +927,9 @@ config LOCK_EVENT_COUNTS
  the chance of application behavior change because of timing
  differences. The counts are reported via debugfs.
 
+config ARCH_HAS_MEM_ENCRYPT
+   bool
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5d8570ed6cab..f820e631bf89 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -1,7 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-config ARCH_HAS_MEM_ENCRYPT
-def_bool y
-
 config MMU
def_bool y
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9f331bb538b..5d3295f2df94 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -68,6 +68,7 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV			if X86_64
+   select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API		if X86_64
select ARCH_HAS_PTE_SPECIAL
@@ -1520,9 +1521,6 @@ config X86_CPA_STATISTICS
  helps to determine the effectiveness of preserving large and huge
  page mappings when mapping protections are changed.
 
-config ARCH_HAS_MEM_ENCRYPT
-   def_bool y
-
 config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD



[PATCH 0/3] Remove x86-specific code from generic headers

2019-07-12 Thread Thiago Jung Bauermann
Hello,

This version mostly changes patch 2/3, removing dma_check_mask() from
kernel/dma/mapping.c as suggested by Christoph Hellwig, and also adapting
s390's <asm/mem_encrypt.h>. There's also a small change in patch 1/3 as
mentioned in the changelog below.

Patch 3/3 may or may not need to change s390 code depending on how Tom
Lendacky's patch is fixed to avoid breaking that architecture, so I haven't
made any changes for now.

These patches are applied on top of today's master which at the time was at
commit 9787aed57dd3 ("coresight: Make the coresight_device_fwnode_match
declaration's fwnode parameter const"), plus a cherry-pick of commit
e67a5ed1f86f ("dma-direct: Force unencrypted DMA under SME for certain DMA
masks"), which is in dma-mapping/for-next and comes from this patch:

https://lore.kernel.org/linux-iommu/10b83d9ff31bca88e94da2ff34e30619eb396078.1562785123.git.thomas.lenda...@amd.com/

I don't have a way to test SME, SEV, nor s390's PEF so the patches have only
been build tested.

Original cover letter below:

Both powerpc¹ and s390² are adding <asm/mem_encrypt.h> headers. Currently,
they have to supply definitions for functions and macros which only have a
meaning on x86: sme_me_mask, sme_active() and sev_active().

Christoph Hellwig made a suggestion to "clean up the Kconfig and generic
headers bits for memory encryption so that we don't need all this
boilerplate code", and this is what this patch does.

After this patch set, this is powerpc's <asm/mem_encrypt.h>:

#ifndef _ASM_POWERPC_MEM_ENCRYPT_H
#define _ASM_POWERPC_MEM_ENCRYPT_H

#include <asm/svm.h>

static inline bool mem_encrypt_active(void)
{
return is_secure_guest();
}

static inline bool force_dma_unencrypted(struct device *dev)
{
return is_secure_guest();
}

int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);

#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */

Changelog

Since v1:

- Patch "x86,s390: Move ARCH_HAS_MEM_ENCRYPT definition to arch/Kconfig"
  - Remove definition of ARCH_HAS_MEM_ENCRYPT from s390/Kconfig as well.
  - Reworded patch title and message a little bit.

- Patch "DMA mapping: Move SME handling to x86-specific files"
  - Adapt s390's <asm/mem_encrypt.h> as well.
  - Remove dma_check_mask() from kernel/dma/mapping.c. Suggested by
Christoph Hellwig.

-- 

¹ https://lore.kernel.org/linuxppc-dev/20190521044912.1375-12-bauer...@linux.ibm.com/
² https://lore.kernel.org/kvm/20190612111236.99538-2-pa...@linux.ibm.com/

Thiago Jung Bauermann (3):
  x86,s390: Move ARCH_HAS_MEM_ENCRYPT definition to arch/Kconfig
  DMA mapping: Move SME handling to x86-specific files
  fs/core/vmcore: Move sev_active() reference to x86 arch code

 arch/Kconfig|  3 +++
 arch/s390/Kconfig   |  3 ---
 arch/s390/include/asm/mem_encrypt.h |  4 +---
 arch/x86/Kconfig|  4 +---
 arch/x86/include/asm/mem_encrypt.h  | 10 ++
 arch/x86/kernel/crash_dump_64.c |  5 +
 fs/proc/vmcore.c|  8 
 include/linux/crash_dump.h  | 14 ++
 include/linux/mem_encrypt.h | 15 +--
 kernel/dma/mapping.c|  8 
 kernel/dma/swiotlb.c|  3 +--
 11 files changed, 40 insertions(+), 37 deletions(-)



[GIT PULL] Please pull powerpc/linux.git powerpc-5.3-1 tag

2019-07-12 Thread Michael Ellerman
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Hi Linus,

Please pull powerpc updates for 5.3.

A bit of a small batch for us, just due to me not getting the time to review
things. Only one conflict that I'm aware of, in our pgtable.h; the resolution is
simply to take both sides.

cheers


The following changes since commit cd6c84d8f0cdc911df435bb075ba22ce3c605b07:

  Linux 5.2-rc2 (2019-05-26 16:49:19 -0700)

are available in the git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
tags/powerpc-5.3-1

for you to fetch changes up to f5a9e488d62360c91c5770bd55a0b40e419a71ce:

  powerpc/powernv/idle: Fix restore of SPRN_LDBAR for POWER9 stop state. 
(2019-07-12 22:25:26 +1000)

- --
powerpc updates for 5.3

Notable changes:

 - Removal of the NPU DMA code, used by the out-of-tree Nvidia driver, as well
   as some other functions only used by drivers that haven't (yet?) made it
   upstream.

 - A fix for a bug in our handling of hardware watchpoints (eg. perf record -e
   mem: ...) which could lead to register corruption and kernel crashes.

 - Enable HAVE_ARCH_HUGE_VMAP, which allows us to use large pages for vmalloc
   when using the Radix MMU.

 - A large but incremental rewrite of our exception handling code to use gas
   macros rather than multiple levels of nested CPP macros.

And the usual small fixes, cleanups and improvements.

Thanks to:
  Alastair D'Silva, Alexey Kardashevskiy, Andreas Schwab, Aneesh Kumar K.V, Anju
  T Sudhakar, Anton Blanchard, Arnd Bergmann, Athira Rajeev, Cédric Le Goater,
  Christian Lamparter, Christophe Leroy, Christophe Lombard, Christoph Hellwig,
  Daniel Axtens, Denis Efremov, Enrico Weigelt, Frederic Barrat, Gautham R.
  Shenoy, Geert Uytterhoeven, Geliang Tang, Gen Zhang, Greg Kroah-Hartman, Greg
  Kurz, Gustavo Romero, Krzysztof Kozlowski, Madhavan Srinivasan, Masahiro
  Yamada, Mathieu Malaterre, Michael Neuling, Nathan Lynch, Naveen N. Rao,
  Nicholas Piggin, Nishad Kamdar, Oliver O'Halloran, Qian Cai, Ravi Bangoria,
  Sachin Sant, Sam Bobroff, Satheesh Rajendran, Segher Boessenkool, Shaokun
  Zhang, Shawn Anastasio, Stewart Smith, Suraj Jitindar Singh, Thiago Jung
  Bauermann, YueHaibing.

- --
Alastair D'Silva (1):
  ocxl: Update for AFU descriptor template version 1.1

Alexey Kardashevskiy (5):
  powerpc/pci/of: Fix OF flags parsing for 64bit BARs
  powerpc/pseries/dma: Allow SWIOTLB
  powerpc/pseries/dma: Enable SWIOTLB
  powerpc/pci/of: Parse unassigned resources
  powerpc/powernv: Fix stale iommu table base after VFIO

Andreas Schwab (1):
  powerpc/mm/32s: fix condition that is always true

Aneesh Kumar K.V (16):
  powerpc/mm: Remove unused variable declaration
  powerpc/mm/hash/4k: Don't use 64K page size for vmemmap with 4K pagesize
  powerpc/mm/radix: Use the right page size for vmemmap mapping
  powerpc/mm/drconf: Use NUMA_NO_NODE on failures instead of node 0
  powerpc/mm: Fix node look up with numa=off boot
  powerpc/mm: Consolidate numa_enable check and min_common_depth check
  powerpc/mm/nvdimm: Add an informative message if we fail to allocate 
altmap block
  powerpc/pseries/scm: Mark the region volatile if cache flush not required
  powerpc/nvdimm: Add support for multibyte read/write for metadata
  powerpc/pseries/scm: Use a specific endian format for storing uuid from 
the device tree
  powerpc/book3s: Use config independent helpers for page table walk
  powerpc/mm: pmd_devmap implies pmd_large().
  powerpc/mm: Remove radix dependency on HugeTLB page
  powerpc/mm: Handle page table allocation failures
  powerpc/mm/hugetlb: Fix kernel crash if we fail to allocate page table 
caches
  powerpc/mm/hugetlb: Don't enable HugeTLB if we don't have a page table 
cache

Anju T Sudhakar (1):
  powerpc/perf: Use cpumask_last() to determine the designated cpu for 
nest/core units.

Anton Blanchard (1):
  powerpc/configs: Disable latencytop

Athira Rajeev (1):
  powerpc/powernv/idle: Fix restore of SPRN_LDBAR for POWER9 stop state.

Christian Lamparter (1):
  powerpc/4xx/uic: clear pending interrupt after irq type/pol change

Christoph Hellwig (5):
  powerpc/powernv: remove the unused pnv_pci_set_p2p function
  powerpc/powernv: remove the unused tunneling exports
  powerpc/powernv: remove unused NPU DMA code
  powerpc/powernv: remove the unused vas_win_paste_addr and vas_win_id 
functions
  powerpc: remove device_to_mask()

Christophe Leroy (25):
  powerpc/lib: fix redundant inclusion of quad.o
  powerpc/lib: only build ldstfp.o when CONFIG_PPC_FPU is set
  powerpc/64: mark start_here_multiplatform as __ref
  powerpc/32s: fix suspend/resume when IBATs 4-7 are used
  powerpc/ftrace: Enable C Version of recordmcount
  powerpc: slightly improve cache helpers
 

Re: [PATCH 1/3] KVM: PPC: Book3S HV: Always save guest pmu for guest capable of nesting

2019-07-12 Thread Michael Ellerman
Suraj Jitindar Singh  writes:
> The performance monitoring unit (PMU) registers are saved on guest exit
> when the guest has set the pmcregs_in_use flag in its lppaca, if it
> exists, or unconditionally if it doesn't. If a nested guest is being
> run then the hypervisor doesn't, and in most cases can't, know if the
> pmu registers are in use since it doesn't know the location of the lppaca
> for the nested guest, although it may have one for its immediate guest.
> This results in the values of these registers being lost across nested
> guest entry and exit in the case where the nested guest was making use
> of the performance monitoring facility while its nested guest hypervisor
> wasn't.
>
> Furthermore, the hypervisor could interrupt a guest hypervisor between
> when it has loaded up the pmu registers and it calling H_ENTER_NESTED or
> between returning from the nested guest to the guest hypervisor and the
> guest hypervisor reading the pmu registers, in kvmhv_p9_guest_entry().
> This means that it isn't sufficient to just save the pmu registers when
> entering or exiting a nested guest, but that it is necessary to always
> save the pmu registers whenever a guest is capable of running nested guests
> to ensure the register values aren't lost in the context switch.
>
> Ensure the pmu register values are preserved by always saving their
> value into the vcpu struct when a guest is capable of running nested
> guests.
>
> This should have minimal performance impact however any impact can be
> avoided by booting a guest with "-machine pseries,cap-nested-hv=false"
> on the qemu commandline.
>
> Fixes: 95a6432ce903 "KVM: PPC: Book3S HV: Streamlined guest entry/exit path 
> on P9 for radix guests"

I'm not clear why this and the next commit are marked as fixing the
above commit. Wasn't it broken prior to that commit as well?

cheers

> Signed-off-by: Suraj Jitindar Singh 
> ---
>  arch/powerpc/kvm/book3s_hv.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index ec1804f822af..b682a429f3ef 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3654,6 +3654,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 
> time_limit,
>   vcpu->arch.vpa.dirty = 1;
>   save_pmu = lp->pmcregs_in_use;
>   }
> + /* Must save pmu if this guest is capable of running nested guests */
> + save_pmu |= nesting_enabled(vcpu->kvm);
>  
>   kvmhv_save_guest_pmu(vcpu, save_pmu);
>  
> -- 
> 2.13.6


[PATCH] powerpc: remove meaningless KBUILD_ARFLAGS addition

2019-07-12 Thread Masahiro Yamada
The KBUILD_ARFLAGS addition in arch/powerpc/Makefile has never worked
in a useful way because it is always overridden by the following code
in the top Makefile:

  # use the deterministic mode of AR if available
  KBUILD_ARFLAGS := $(call ar-option,D)

The code in the top Makefile was added in 2011, by commit 40df759e2b9e
("kbuild: Fix build with binutils <= 2.19").

The KBUILD_ARFLAGS addition for ppc has therefore been dead code from the
very beginning.

Nobody has reported a problem since 43c9127d94d6 ("powerpc: Add option
to use thin archives"), so this code is not needed.

Signed-off-by: Masahiro Yamada 
---

 arch/powerpc/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index c345b79414a9..46ed198a3aa3 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -112,7 +112,6 @@ ifeq ($(HAS_BIARCH),y)
 KBUILD_CFLAGS  += -m$(BITS)
 KBUILD_AFLAGS  += -m$(BITS) -Wl,-a$(BITS)
 KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION)
-KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET)
 endif
 
 cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard=tls
-- 
2.17.1



Re: [PATCH v9 05/10] namei: O_BENEATH-style path resolution flags

2019-07-12 Thread Al Viro
On Fri, Jul 12, 2019 at 04:00:26PM +0100, Al Viro wrote:
> On Fri, Jul 12, 2019 at 02:25:53PM +0100, Al Viro wrote:
> 
> > if (flags & LOOKUP_BENEATH) {
> > nd->root = nd->path;
> > if (!(flags & LOOKUP_RCU))
> > path_get(&nd->root);
> > else
> > nd->root_seq = nd->seq;
> 
> BTW, this assignment is needed for LOOKUP_RCU case.  Without it
> you are pretty much guaranteed that lazy pathwalk will fail,
> when it comes to complete_walk().
> 
> Speaking of which, what would happen if LOOKUP_ROOT/LOOKUP_BENEATH
> combination would someday get passed?

I don't understand what's going on with ->r_seq in there - your
call of path_is_under() is after having (re-)sampled rename_lock,
but if that was the only .. in there, who's going to recheck
the value?  For that matter, what's to guarantee that the thing
won't get moved just as you are returning from handle_dots()?

IOW, what does LOOKUP_IN_ROOT guarantee for caller (openat2())?


Re: [RFC PATCH kernel] powerpc/xive: Drop deregistered irqs

2019-07-12 Thread Benjamin Herrenschmidt
On Fri, 2019-07-12 at 19:37 +1000, Alexey Kardashevskiy wrote:
> 

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/kernel/irq.c#n614
> 
> If so, then in order to do EOI, I'll need the desc which is gone, or
> I am missing the point?

All you need is to drop the local CPU priority.

Cheers,
Ben.




Re: [PATCH 2/3] DMA mapping: Move SME handling to x86-specific files

2019-07-12 Thread Thiago Jung Bauermann


[ Cc'ing Tom Lendacky, which I forgot to do earlier. Sorry about that. ]

Hello Christoph,

Christoph Hellwig  writes:

> Honestly I think this code should go away without any replacement.
> There is no reason why we should have a special debug printk just
> for one specific reason why there is a requirement for a large DMA
> mask.

Makes sense. I'll submit a v2 which just removes this code.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH 1/3] x86/Kconfig: Move ARCH_HAS_MEM_ENCRYPT to arch/Kconfig

2019-07-12 Thread Thiago Jung Bauermann


Hello Thomas,

Thanks for quickly reviewing the patches.

Thomas Gleixner  writes:

> On Fri, 12 Jul 2019, Thiago Jung Bauermann wrote:
>
>> powerpc and s390 are going to use this feature as well, so put it in a
>> generic location.
>> 
>> Signed-off-by: Thiago Jung Bauermann 
>
> Reviewed-by: Thomas Gleixner 

Thanks!

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH 3/3] fs/core/vmcore: Move sev_active() reference to x86 arch code

2019-07-12 Thread Thiago Jung Bauermann


[ Cc'ing Tom Lendacky, which I forgot to do earlier. Sorry about that. ]

Hello Halil,

Thanks for the quick review.

Halil Pasic  writes:

> On Fri, 12 Jul 2019 02:36:31 -0300
> Thiago Jung Bauermann  wrote:
>
>> Secure Encrypted Virtualization is an x86-specific feature, so it shouldn't
>> appear in generic kernel code because it forces non-x86 architectures to
>> define the sev_active() function, which doesn't make a lot of sense.
>
> sev_active() might just be a bad (too specific) name for a general
> concept. s390 code defines it and it drives the right behavior in
> kernel/dma/direct.c (which uses it).

I thought about that but couldn't put my finger on a general concept.
Is it "guest with memory inaccessible to the host"?

Since your proposed definition for force_dma_unencrypted() is simply to
make it equivalent to sev_active(), I thought it was more
straightforward to make each arch define force_dma_unencrypted()
directly.
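
To make that concrete, a per-arch definition could be as small as the
following sketch (illustrative only; it assumes s390 would key this off its
protected-virtualization predicate, with the generic fallback returning
false):

/* hypothetical arch/s390 version */
bool force_dma_unencrypted(void)
{
	return is_prot_virt_guest();
}

/* generic default when the architecture doesn't provide one */
static inline bool force_dma_unencrypted(void)
{
	return false;
}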

Also, does sev_active() drive the right behavior for s390 in
elfcorehdr_read() as well?

>> To solve this problem, add an x86 elfcorehdr_read() function to override
>> the generic weak implementation. To do that, it's necessary to make
>> read_from_oldmem() public so that it can be used outside of vmcore.c.
>>
>> Signed-off-by: Thiago Jung Bauermann 
>> ---
>>  arch/x86/kernel/crash_dump_64.c |  5 +
>>  fs/proc/vmcore.c|  8 
>>  include/linux/crash_dump.h  | 14 ++
>>  include/linux/mem_encrypt.h |  1 -
>>  4 files changed, 23 insertions(+), 5 deletions(-)
>
> Does not seem to apply to today's or yesterday's master.

It assumes the presence of the two patches I mentioned in the cover
letter. Only one of them is in master.

I hadn't realized the s390 virtio patches were on their way to upstream.
I was keeping an eye on the email thread but didn't see they were picked
up in the s390 pull request. I'll add a new patch to this series making
the corresponding changes to s390's <asm/mem_encrypt.h> as well.

--
Thiago Jung Bauermann
IBM Linux Technology Center



Re: [PATCH v4 1/8] KVM: PPC: Ultravisor: Introduce the MSR_S bit

2019-07-12 Thread Claudio Carvalho


On 7/11/19 9:57 PM, Nicholas Piggin wrote:
> Claudio Carvalho's on June 29, 2019 6:08 am:
>> From: Sukadev Bhattiprolu 
>>
>> The ultravisor processor mode is introduced in POWER platforms that
>> support the Protected Execution Facility (PEF). Ultravisor is a more
>> privileged mode than the hypervisor mode.
>>
>> In PEF enabled platforms, the MSR_S bit is used to indicate if the
>> thread is in secure state. With the MSR_S bit, the privilege state of
>> the thread is now determined by MSR_S, MSR_HV and MSR_PR, as follows:
>>
>> S   HV  PR
>> ---
>> 0   x   1   problem
>> 1   0   1   problem
>> x   x   0   privileged
>> x   1   0   hypervisor
>> 1   1   0   ultravisor
>> 1   1   1   reserved
> What does this table mean? I thought 'x' meant either


Yes, it means either. The table was arranged that way to say that:
- hypervisor state is also a privileged state,
- ultravisor state is also a hypervisor state.


> , but in that
> case there are several states that can apply to the same
> combination of bits.
>
> Would it be clearer to rearrange the table so the columns are the HV
> and PR bits we know and love, plus the effect of S=1 on each of them?
>
>   HV  PR  S=0 S=1
>   -
>   0   0   privileged  privileged (secure guest kernel)
>   0   1   problem problem (secure guest userspace)
>   1   0   hypervisor  ultravisor
>   1   1   problem reserved
>
> Is that accurate?

Yes, it is. I also like this format. I will consider it.
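
For what it's worth, a tiny helper that encodes the rearranged table (purely
illustrative, not part of the patch; it only assumes the MSR_S, MSR_HV and
MSR_PR masks from reg.h) would read:

static const char *msr_priv_state(unsigned long msr)
{
	bool s  = msr & MSR_S;
	bool hv = msr & MSR_HV;
	bool pr = msr & MSR_PR;

	if (pr)
		return (s && hv) ? "reserved" : "problem";
	if (hv)
		return s ? "ultravisor" : "hypervisor";
	return "privileged";
}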


>
>
>> The hypervisor doesn't (and can't) run with the MSR_S bit set, but a
>> secure guest and the ultravisor firmware do.
>>
>> Signed-off-by: Sukadev Bhattiprolu 
>> Signed-off-by: Ram Pai 
>> [ Update the commit message ]
>> Signed-off-by: Claudio Carvalho 
>> ---
>>  arch/powerpc/include/asm/reg.h | 3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
>> index 10caa145f98b..39b4c0a519f5 100644
>> --- a/arch/powerpc/include/asm/reg.h
>> +++ b/arch/powerpc/include/asm/reg.h
>> @@ -38,6 +38,7 @@
>>  #define MSR_TM_LG   32  /* Trans Mem Available */
>>  #define MSR_VEC_LG  25  /* Enable AltiVec */
>>  #define MSR_VSX_LG  23  /* Enable VSX */
>> +#define MSR_S_LG    22  /* Secure VM bit */
>>  #define MSR_POW_LG  18  /* Enable Power Management */
>>  #define MSR_WE_LG   18  /* Wait State Enable */
>>  #define MSR_TGPR_LG 17  /* TLB Update registers in use */
>> @@ -71,11 +72,13 @@
>>  #define MSR_SF  __MASK(MSR_SF_LG)   /* Enable 64 bit mode */
>>  #define MSR_ISF __MASK(MSR_ISF_LG)  /* Interrupt 64b mode 
>> valid on 630 */
>>  #define MSR_HV  __MASK(MSR_HV_LG)   /* Hypervisor state */
>> +#define MSR_S   __MASK(MSR_S_LG)    /* Secure state */
> This is a real nitpick, but why two different comments for the bit 
> number and the mask?

Fixed for the next version. Both comments will be /* Secure state */

Thanks
Claudio




Re: [PATCH 01/12] Documentation: move architectures together

2019-07-12 Thread Jonathan Corbet
On Fri, 12 Jul 2019 10:20:07 +0800
Alex Shi  wrote:

> There are many different archs in the Documentation/ dir; it's better to
> move them together into 'Documentation/arch', which follows the kernel source layout.

So this seems certain to collide badly with Mauro's RST-conversion monster
patch set.

More to the point, though...if we are going to thrash up things this
badly, we want to be sure that we're doing it right so we don't end up
renaming everything again.  Grouping stuff into a new arch/ subdirectory
adds a bit of order, but it doesn't do much toward trying to organize our
documentation for its readers, and it doesn't help us to modernize the
docs and get rid of the old, useless stuff.  A quick check shows that many
of these files have seen no changes other than typo fixes since the
beginning of the Git era.

So, in my mind, this needs some thought.  Maybe we want a
Documentation/arch in the end, but I'm not convinced that we should just
create it and fill it with a snow shovel.  This might be a good thing to
discuss at the kernel summit in September.

Thanks,

jon


Re: [PATCH 2/3] DMA mapping: Move SME handling to x86-specific files

2019-07-12 Thread Thomas Gleixner
On Fri, 12 Jul 2019, Thiago Jung Bauermann wrote:
> diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
> index b310a9c18113..f2e399fb626b 100644
> --- a/include/linux/mem_encrypt.h
> +++ b/include/linux/mem_encrypt.h
> @@ -21,23 +21,11 @@
>  
>  #else/* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
>  
> -#define sme_me_mask  0ULL
> -
> -static inline bool sme_active(void) { return false; }
>  static inline bool sev_active(void) { return false; }

You want to move out sev_active as well, the only relevant thing is
mem_encrypt_active(). Everything SME/SEV is an architecture detail.

> +static inline bool mem_encrypt_active(void) { return false; }

Thanks,

tglx


Re: [PATCH 1/3] x86/Kconfig: Move ARCH_HAS_MEM_ENCRYPT to arch/Kconfig

2019-07-12 Thread Thomas Gleixner
On Fri, 12 Jul 2019, Thiago Jung Bauermann wrote:

> powerpc and s390 are going to use this feature as well, so put it in a
> generic location.
> 
> Signed-off-by: Thiago Jung Bauermann 

Reviewed-by: Thomas Gleixner 


[Bug 204125] FTBFS on ppc64 big endian and gcc9 because of -mcall-aixdesc and missing __linux__

2019-07-12 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=204125

--- Comment #8 from Daniel Kolesa (li...@octaforge.org) ---
Using this patch on my machines now:
https://gist.github.com/q66/625cbec5d7317829a302773f89533b51 seems to work well

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

Re: [PATCH] treewide: Rename rcu_dereference_raw_notrace to _check

2019-07-12 Thread Joel Fernandes
On Fri, Jul 12, 2019 at 08:01:07AM -0700, Paul E. McKenney wrote:
> On Thu, Jul 11, 2019 at 04:45:41PM -0400, Joel Fernandes (Google) wrote:
> > The rcu_dereference_raw_notrace() API name is confusing.
> > It is equivalent to rcu_dereference_raw() except that it also does
> > sparse pointer checking.
> > 
> > There are only a few users of rcu_dereference_raw_notrace(). This
> > patches renames all of them to be rcu_dereference_raw_check with the
> > "check" indicating sparse checking.
> > 
> > Signed-off-by: Joel Fernandes (Google) 
> 
> I queued this, but reworked the commit log and fixed a couple of
> irritating checkpatch issues that were in the original code.
> Does this work for you?

Thanks, yes it looks good to me.

thanks,

 - Joel

> 
>   Thanx, Paul
> 
> 
> 
> commit bd5c0fea6016c90cf7a9eb0435cd0c373dfdac2f
> Author: Joel Fernandes (Google) 
> Date:   Thu Jul 11 16:45:41 2019 -0400
> 
> treewide: Rename rcu_dereference_raw_notrace() to _check()
> 
> The rcu_dereference_raw_notrace() API name is confusing.  It is equivalent
> to rcu_dereference_raw() except that it also does sparse pointer checking.
> 
> There are only a few users of rcu_dereference_raw_notrace(). This patch
> renames all of them to be rcu_dereference_raw_check() with the "_check()"
> indicating sparse checking.
> 
> Signed-off-by: Joel Fernandes (Google) 
> [ paulmck: Fix checkpatch warnings about parentheses. ]
> Signed-off-by: Paul E. McKenney 
> 
> diff --git a/Documentation/RCU/Design/Requirements/Requirements.html 
> b/Documentation/RCU/Design/Requirements/Requirements.html
> index f04c467e55c5..467251f7fef6 100644
> --- a/Documentation/RCU/Design/Requirements/Requirements.html
> +++ b/Documentation/RCU/Design/Requirements/Requirements.html
> @@ -2514,7 +2514,7 @@ disabled across the entire RCU read-side critical 
> section.
>  
>  It is possible to use tracing on RCU code, but tracing itself
>  uses RCU.
> -For this reason, rcu_dereference_raw_notrace()
> +For this reason, rcu_dereference_raw_check()
>  is provided for use by tracing, which avoids the destructive
>  recursion that could otherwise ensue.
>  This API is also used by virtualization in some architectures,
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
> b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 21b1ed5df888..53388a311967 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -546,7 +546,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
>   */
>  static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
>  {
> - return rcu_dereference_raw_notrace(kvm->memslots[0]);
> + return rcu_dereference_raw_check(kvm->memslots[0]);
>  }
>  
>  extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
> diff --git a/include/linux/rculist.h b/include/linux/rculist.h
> index e91ec9ddcd30..932296144131 100644
> --- a/include/linux/rculist.h
> +++ b/include/linux/rculist.h
> @@ -622,7 +622,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
> *n,
>   * as long as the traversal is guarded by rcu_read_lock().
>   */
>  #define hlist_for_each_entry_rcu(pos, head, member)  \
> - for (pos = hlist_entry_safe 
> (rcu_dereference_raw(hlist_first_rcu(head)),\
> + for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
>   typeof(*(pos)), member);\
>   pos;\
>   pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
> @@ -642,10 +642,10 @@ static inline void hlist_add_behind_rcu(struct 
> hlist_node *n,
>   * not do any RCU debugging or tracing.
>   */
>  #define hlist_for_each_entry_rcu_notrace(pos, head, member)  
> \
> - for (pos = hlist_entry_safe 
> (rcu_dereference_raw_notrace(hlist_first_rcu(head)),\
> + for (pos = 
> hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
>   typeof(*(pos)), member);\
>   pos;\
> - pos = 
> hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\
> + pos = 
> hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
>   &(pos)->member)), typeof(*(pos)), member))
>  
>  /**
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 0c9b92799abc..e5161e377ad4 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -478,7 +478,7 @@ do {  
>   \
>   * The no-tracing version of rcu_dereference_raw() must not call
>   * rcu_read_lock_held().
>   */
> -#define rcu_dereference_raw_notrace(p) 

Re: [PATCH v9 00/10] namei: openat2(2) path resolution restrictions

2019-07-12 Thread Aleksa Sarai
On 2019-07-12, Al Viro  wrote:
> On Sun, Jul 07, 2019 at 12:57:27AM +1000, Aleksa Sarai wrote:
> > Patch changelog:
> >   v9:
> > * Replace resolveat(2) with openat2(2). [Linus]
> > * Output a warning to dmesg if may_open_magiclink() is violated.
> > * Add an openat2(O_CREAT) testcase.
> 
> One general note for the future, BTW: for such series it's generally
> a good idea to put it into a public git tree somewhere and mention that
> in the announcement...

Sure, I'll mention it next time. For the record the tree is
  

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH





Re: [PATCH v3] powerpc/setup_64: fix -Wempty-body warnings

2019-07-12 Thread Qian Cai
Ping.

On Fri, 2019-06-28 at 10:03 -0400, Qian Cai wrote:
> At the beginning of setup_64.c, it has,
> 
>   #ifdef DEBUG
>   #define DBG(fmt...) udbg_printf(fmt)
>   #else
>   #define DBG(fmt...)
>   #endif
> 
> where DBG() could be compiled away, and generate warnings,
> 
> arch/powerpc/kernel/setup_64.c: In function 'initialize_cache_info':
> arch/powerpc/kernel/setup_64.c:579:49: warning: suggest braces around
> empty body in an 'if' statement [-Wempty-body]
> DBG("Argh, can't find dcache properties !\n");
>  ^
> arch/powerpc/kernel/setup_64.c:582:49: warning: suggest braces around
> empty body in an 'if' statement [-Wempty-body]
> DBG("Argh, can't find icache properties !\n");
> 
> Fix it by using the no_printk() macro, and make sure that format and
> argument are always verified by the compiler.
> 
> Suggested-by: Tyrel Datwyler 
> Suggested-by: Joe Perches 
> Signed-off-by: Qian Cai 
> ---
> 
> v3: Use no_printk() macro, and make sure that format and argument are always
> verified by the compiler using a more generic form ##__VA_ARGS__ per Joe.
> 
> v2: Fix it by using a NOP while loop per Tyrel.
> 
>  arch/powerpc/kernel/setup_64.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 44b4c432a273..cea933a43f0a 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -69,9 +69,9 @@
>  #include "setup.h"
>  
>  #ifdef DEBUG
> -#define DBG(fmt...) udbg_printf(fmt)
> +#define DBG(fmt, ...) udbg_printf(fmt, ##__VA_ARGS__)
>  #else
> -#define DBG(fmt...)
> +#define DBG(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
>  #endif
>  
>  int spinning_secondaries;
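
For reference, no_printk() keeps the compile-time checking because it still
hands the format string to printk() inside a never-taken branch; it is roughly
the following (paraphrased from include/linux/printk.h, the exact form may
differ between kernel versions):

#define no_printk(fmt, ...)				\
({							\
	if (0)						\
		printk(fmt, ##__VA_ARGS__);		\
	0;						\
})

So DBG("found %d caches\n", n) still type-checks its arguments in both
configurations, but generates no code when DEBUG is not defined.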


Re: [PATCH] powerpc/powernv: fix a W=1 compilation warning

2019-07-12 Thread Qian Cai
Ping.

On Wed, 2019-05-22 at 12:09 -0400, Qian Cai wrote:
> The commit b575c731fe58 ("powerpc/powernv/npu: Add set/unset window
> helpers") called pnv_npu_set_window() in a void function
> pnv_npu_dma_set_32(), but the return code from pnv_npu_set_window() has
> no use there as all the error logging happens in pnv_npu_set_window(),
> so just remove the unused variable to avoid a compilation warning,
> 
> arch/powerpc/platforms/powernv/npu-dma.c: In function
> 'pnv_npu_dma_set_32':
> arch/powerpc/platforms/powernv/npu-dma.c:198:10: warning: variable ‘rc’
> set but not used [-Wunused-but-set-variable]
> 
> Signed-off-by: Qian Cai 
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index 495550432f3d..035208ed591f 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -195,7 +195,6 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
>  {
>   struct pci_dev *gpdev;
>   struct pnv_ioda_pe *gpe;
> - int64_t rc;
>  
>   /*
>    * Find the assoicated PCI devices and get the dma window
> @@ -208,8 +207,8 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
>   if (!gpe)
>   return;
>  
> - rc = pnv_npu_set_window(&npe->table_group, 0,
> - gpe->table_group.tables[0]);
> + pnv_npu_set_window(&npe->table_group, 0,
> +    gpe->table_group.tables[0]);
>  
>   /*
>    * NVLink devices use the same TCE table configuration as


Re: [PATCH kernel v4 0/4 repost] powerpc/ioda2: Yet another attempt to allow DMA masks between 32 and 59

2019-07-12 Thread Christoph Hellwig
On Fri, Jul 12, 2019 at 07:45:05PM +1000, Alexey Kardashevskiy wrote:
> This is an attempt to allow DMA masks between 32..59 which are not large
> enough to use either a PHB3 bypass mode or a sketchy bypass. Depending
> on the max order, up to 40 is usually available.

Can you elaborate on what you mean by "supported" in detail?  In the end
a DMA device's DMA capability is only really interesting as a lower
bound.

e.g. if you have a device that supports 40-bit DMA addressing we could
always treat it as if supports 32-bit addressing, and I thought the
powerpc code does that, as the DMA API now relies on that.  Did I miss
something and it explicitly rejected that (in which case I didn't spot
the fix in this series), or is this just an optimization to handle these
devices more optimally, in which case maybe the changelog could be
improved a bit.
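
(For context, the usual driver-side pattern treats the mask as an upper bound
and falls back to a narrower one -- a generic sketch, not code from this
series:

	if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)) &&
	    dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)))
		return -EIO;

so a platform that can only provide a 32-bit window can still serve a
40-bit-capable device.)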


Re: [PATCH kernel v4 4/4] powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU pages

2019-07-12 Thread Christoph Hellwig
> -extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
> - int nid);
> +extern struct iommu_table *iommu_init_table_res(struct iommu_table *tbl,
> + int nid, unsigned long res_start, unsigned long res_end);
> +#define iommu_init_table(tbl, nid) iommu_init_table_res((tbl), (nid), 0, 0)

I'd just pass the two additional paramters to iommu_init_table, there
are only 10 callers of it.
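
Something like this sketch of the suggested interface (not the final patch),
with callers that have nothing to reserve passing 0, 0:

extern struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
		unsigned long res_start, unsigned long res_end);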

> +  * the max order of allocation possible. The TCE tableis likely to

Missing whitespace between "table" and "is".

> +  * end up being multilevel and with on-demand allocation in place,
> +  * the initial use is not going to be huge as the default window aims
> +  * to support cripplied devices (i.e. not fully 64bit DMAble) only.

s/cripplied/crippled/


Re: [PATCH kernel v4 2/4] powerpc/iommu: Allow bypass-only for DMA

2019-07-12 Thread Christoph Hellwig
> This skips the 32bit DMA setup check if the bypass is can be selected.

That sentence does not parse.  I think you need to drop the "can be"
based on the actual patch.


Re: [PATCH v9 00/10] namei: openat2(2) path resolution restrictions

2019-07-12 Thread Al Viro
On Sun, Jul 07, 2019 at 12:57:27AM +1000, Aleksa Sarai wrote:
> Patch changelog:
>   v9:
> * Replace resolveat(2) with openat2(2). [Linus]
> * Output a warning to dmesg if may_open_magiclink() is violated.
> * Add an openat2(O_CREAT) testcase.

One general note for the future, BTW: for such series it's generally
a good idea to put it into a public git tree somewhere and mention that
in the announcement...


Re: [PATCH 3/3] fs/core/vmcore: Move sev_active() reference to x86 arch code

2019-07-12 Thread Christoph Hellwig
On Fri, Jul 12, 2019 at 04:51:53PM +0200, Halil Pasic wrote:
> Thank you very much! I will have another look, but it seems to me,
> without further measures taken, this would break protected virtualization
> support on s390. The effect of the change for s390 is that
> force_dma_unencrypted() will always return false instead of calling into
> the platform code like it did before the patch, right?
> 
> Should I send a Fixes: e67a5ed1f86f "dma-direct: Force unencrypted DMA
> under SME for certain DMA masks" (Tom Lendacky, 2019-07-10) patch that
> rectifies things for s390, or how do we want to handle this?

Yes, please do.  I hadn't noticed the s390 support had landed in
mainline already.


Re: [PATCH] treewide: Rename rcu_dereference_raw_notrace to _check

2019-07-12 Thread Paul E. McKenney
On Thu, Jul 11, 2019 at 04:45:41PM -0400, Joel Fernandes (Google) wrote:
> The rcu_dereference_raw_notrace() API name is confusing.
> It is equivalent to rcu_dereference_raw() except that it also does
> sparse pointer checking.
> 
> There are only a few users of rcu_dereference_raw_notrace(). This
> patches renames all of them to be rcu_dereference_raw_check with the
> "check" indicating sparse checking.
> 
> Signed-off-by: Joel Fernandes (Google) 

I queued this, but reworked the commit log and fixed a couple of
irritating checkpatch issues that were in the original code.
Does this work for you?

Thanx, Paul



commit bd5c0fea6016c90cf7a9eb0435cd0c373dfdac2f
Author: Joel Fernandes (Google) 
Date:   Thu Jul 11 16:45:41 2019 -0400

treewide: Rename rcu_dereference_raw_notrace() to _check()

The rcu_dereference_raw_notrace() API name is confusing.  It is equivalent
to rcu_dereference_raw() except that it also does sparse pointer checking.

There are only a few users of rcu_dereference_raw_notrace(). This patch
renames all of them to be rcu_dereference_raw_check() with the "_check()"
indicating sparse checking.

Signed-off-by: Joel Fernandes (Google) 
[ paulmck: Fix checkpatch warnings about parentheses. ]
Signed-off-by: Paul E. McKenney 

diff --git a/Documentation/RCU/Design/Requirements/Requirements.html 
b/Documentation/RCU/Design/Requirements/Requirements.html
index f04c467e55c5..467251f7fef6 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2514,7 +2514,7 @@ disabled across the entire RCU read-side critical section.
 
 It is possible to use tracing on RCU code, but tracing itself
 uses RCU.
-For this reason, rcu_dereference_raw_notrace()
+For this reason, rcu_dereference_raw_check()
 is provided for use by tracing, which avoids the destructive
 recursion that could otherwise ensue.
 This API is also used by virtualization in some architectures,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 21b1ed5df888..53388a311967 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -546,7 +546,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
  */
 static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 {
-   return rcu_dereference_raw_notrace(kvm->memslots[0]);
+   return rcu_dereference_raw_check(kvm->memslots[0]);
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e91ec9ddcd30..932296144131 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -622,7 +622,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define hlist_for_each_entry_rcu(pos, head, member)\
-   for (pos = hlist_entry_safe 
(rcu_dereference_raw(hlist_first_rcu(head)),\
+   for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
@@ -642,10 +642,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * not do any RCU debugging or tracing.
  */
 #define hlist_for_each_entry_rcu_notrace(pos, head, member)
\
-   for (pos = hlist_entry_safe 
(rcu_dereference_raw_notrace(hlist_first_rcu(head)),\
+   for (pos = 
hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
-   pos = 
hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\
+   pos = 
hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
&(pos)->member)), typeof(*(pos)), member))
 
 /**
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 0c9b92799abc..e5161e377ad4 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -478,7 +478,7 @@ do {
  \
  * The no-tracing version of rcu_dereference_raw() must not call
  * rcu_read_lock_held().
  */
-#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
+#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)
 
 /**
  * rcu_dereference_protected() - fetch RCU pointer when updates prevented
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0515a2096f90..0456e0a3dab1 100644
--- 

Re: [PATCH v9 05/10] namei: O_BENEATH-style path resolution flags

2019-07-12 Thread Al Viro
On Fri, Jul 12, 2019 at 02:25:53PM +0100, Al Viro wrote:

>   if (flags & LOOKUP_BENEATH) {
>   nd->root = nd->path;
>   if (!(flags & LOOKUP_RCU))
>   path_get(&nd->root);
>   else
>   nd->root_seq = nd->seq;

BTW, this assignment is needed for LOOKUP_RCU case.  Without it
you are pretty much guaranteed that lazy pathwalk will fail,
when it comes to complete_walk().

Speaking of which, what would happen if LOOKUP_ROOT/LOOKUP_BENEATH
combination would someday get passed?


Re: [PATCH v2 0/5] mm: Enable CONFIG_NODES_SPAN_OTHER_NODES by default for NUMA

2019-07-12 Thread Michal Hocko
On Fri 12-07-19 15:37:30, Will Deacon wrote:
> Hi all,
> 
> On Fri, Jul 12, 2019 at 02:12:23PM +0200, Michal Hocko wrote:
> > On Fri 12-07-19 10:56:47, Hoan Tran OS wrote:
> > [...]
> > > It would be good if we can enable it by default. Otherwise, let each arch 
> > > enable it by itself. Do you have any suggestions?
> > 
> > I can hardly make any suggestions when it is not really clear _why_ you
> > want to remove this config option in the first place. Please explain
> > what motivated you to make this change.
> 
> Sorry, I think this confusion might actually be my fault and Hoan has just
> been implementing my vague suggestion here:
> 
> https://lore.kernel.org/linux-arm-kernel/20190625101245.s4vxfosoop52gl4e@willie-the-truck/
> 
> If the preference of the mm folks is to leave CONFIG_NODES_SPAN_OTHER_NODES
> as it is, then we can define it for arm64. I just find it a bit weird that
> the majority of NUMA-capable architectures have to add a symbol in the arch
> Kconfig file, for what appears to be a performance optimisation applicable
> only to ia64, mips and sh.
> 
> At the very least we could make the thing selectable.

Hmm, I thought this was selectable. But I am obviously wrong here.
Looking more closely, it seems that this is indeed only about
__early_pfn_to_nid and as such not something that should add a config
symbol. This should have been called out in the changelog though.

Also while at it, does HAVE_MEMBLOCK_NODE_MAP fall into a similar
bucket? Do we have any NUMA architecture that doesn't enable it?

Thanks!
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v2 0/5] mm: Enable CONFIG_NODES_SPAN_OTHER_NODES by default for NUMA

2019-07-12 Thread Will Deacon
Hi all,

On Fri, Jul 12, 2019 at 02:12:23PM +0200, Michal Hocko wrote:
> On Fri 12-07-19 10:56:47, Hoan Tran OS wrote:
> [...]
> > It would be good if we can enable it by default. Otherwise, let each arch 
> > enable it by itself. Do you have any suggestions?
> 
> I can hardly make any suggestions when it is not really clear _why_ you
> want to remove this config option in the first place. Please explain
> what motivated you to make this change.

Sorry, I think this confusion might actually be my fault and Hoan has just
been implementing my vague suggestion here:

https://lore.kernel.org/linux-arm-kernel/20190625101245.s4vxfosoop52gl4e@willie-the-truck/

If the preference of the mm folks is to leave CONFIG_NODES_SPAN_OTHER_NODES
as it is, then we can define it for arm64. I just find it a bit weird that
the majority of NUMA-capable architectures have to add a symbol in the arch
Kconfig file, for what appears to be a performance optimisation applicable
only to ia64, mips and sh.

At the very least we could make the thing selectable.

Will
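
A selectable form could mirror the ARCH_HAS_MEM_ENCRYPT move earlier in this
digest -- a hypothetical sketch, not a posted patch:

config NODES_SPAN_OTHER_NODES
	bool

defined once in a generic Kconfig file, with the architectures that want the
optimisation doing "select NODES_SPAN_OTHER_NODES".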


Re: [PATCH 3/3] fs/core/vmcore: Move sev_active() reference to x86 arch code

2019-07-12 Thread Christoph Hellwig
On Fri, Jul 12, 2019 at 03:09:12PM +0200, Halil Pasic wrote:
> This is the implementation for the guys that don't
> have ARCH_HAS_MEM_ENCRYPT.
> 
> Means sev_active() may not be used in such code after this
> patch. What about 
> 
> static inline bool force_dma_unencrypted(void)
> {
> return sev_active();
> }
> 
> in kernel/dma/direct.c?

FYI, I have this pending in the dma-mapping tree:

http://git.infradead.org/users/hch/dma-mapping.git/commitdiff/e67a5ed1f86f4370991c601f2fcad9ebf9e1eebb


Re: [PATCH v9 05/10] namei: O_BENEATH-style path resolution flags

2019-07-12 Thread Al Viro
On Fri, Jul 12, 2019 at 01:55:52PM +0100, Al Viro wrote:
> On Fri, Jul 12, 2019 at 01:39:24PM +0100, Al Viro wrote:
> > On Fri, Jul 12, 2019 at 08:57:45PM +1000, Aleksa Sarai wrote:
> > 
> > > > > @@ -2350,9 +2400,11 @@ static const char *path_init(struct nameidata 
> > > > > *nd, unsigned flags)
> > > > >   s = ERR_PTR(error);
> > > > >   return s;
> > > > >   }
> > > > > - error = dirfd_path_init(nd);
> > > > > - if (unlikely(error))
> > > > > - return ERR_PTR(error);
> > > > > + if (likely(!nd->path.mnt)) {
> > > > 
> > > > Is that a weird way of saying "if we hadn't already called 
> > > > dirfd_path_init()"?
> > > 
> > > Yes. I did it to be more consistent with the other "have we got the
> > > root" checks elsewhere. Is there another way you'd prefer I do it?
> > 
> > "Have we got the root" checks are inevitable evil; here you are making the
> > control flow in a single function hard to follow.
> > 
> > I *think* what you are doing is
> > absolute pathname, no LOOKUP_BENEATH:
> > set_root
> > error = nd_jump_root(nd)
> > else
> > error = dirfd_path_init(nd)
> > return unlikely(error) ? ERR_PTR(error) : s;
> > which should be a lot easier to follow (not to mention shorter), but I might
> > be missing something in all of that.
> 
> PS: if that's what's going on, I would be tempted to turn the entire
> path_init() part into this:
>   if (flags & LOOKUP_BENEATH)
>   while (*s == '/')
>   s++;
> in the very beginning (plus the handling of nd_jump_root() prototype
> change, but that belongs with nd_jump_root() change itself, obviously).
> Again, I might be missing something here...

Argh... I am, at that - you are setting path->root (and grabbing it)
in LOOKUP_BENEATH cases and you do it after dirfd_path_init().  So
how about
if (flags & LOOKUP_BENEATH)
while (*s == '/')
s++;
before the whole thing and
if (*s == '/') { /* can happen only without LOOKUP_BENEATH */
set_root(nd);
error = nd_jump_root(nd);
if (unlikely(error))
return ERR_PTR(error);
} else if (nd->dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;

do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
}  
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(nd->dfd);
struct dentry *dentry;

if (!f.file)
return ERR_PTR(-EBADF);

dentry = f.file->f_path.dentry;

if (*s && unlikely(!d_can_lookup(dentry))) {
fdput(f);
return ERR_PTR(-ENOTDIR);
}

nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
nd->inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
fdput(f);
}
if (flags & LOOKUP_BENEATH) {
nd->root = nd->path;
if (!(flags & LOOKUP_RCU))
path_get(&nd->root);
else
nd->root_seq = nd->seq;
}
return s;
replacing the part in the end?  Makes for much smaller change; it might
very well still make sense to add dirfd_path_init() as a separate
cleanup (perhaps with the *s == '/' case included), though.


Re: [PATCH v9 01/10] namei: obey trailing magic-link DAC permissions

2019-07-12 Thread Al Viro
On Fri, Jul 12, 2019 at 10:20:17PM +1000, Aleksa Sarai wrote:
> On 2019-07-12, Al Viro  wrote:
> > On Sun, Jul 07, 2019 at 12:57:28AM +1000, Aleksa Sarai wrote:
> > > @@ -514,7 +516,14 @@ static void set_nameidata(struct nameidata *p, int 
> > > dfd, struct filename *name)
> > >   p->stack = p->internal;
> > >   p->dfd = dfd;
> > >   p->name = name;
> > > - p->total_link_count = old ? old->total_link_count : 0;
> > > + p->total_link_count = 0;
> > > + p->acc_mode = 0;
> > > + p->opath_mask = FMODE_PATH_READ | FMODE_PATH_WRITE;
> > > + if (old) {
> > > + p->total_link_count = old->total_link_count;
> > > + p->acc_mode = old->acc_mode;
> > > + p->opath_mask = old->opath_mask;
> > > + }
> > 
> > Huh?  Could somebody explain why traversals of NFS4 referrals should inherit
> > ->acc_mode and ->opath_mask?
> 
> I'll be honest -- I don't understand what set_nameidata() did so I just
> did what I thought would be an obvious change (to just copy the
> contents). I thought it was related to some aspect of the symlink stack
> handling.

No.  It's handling of (very rare) nested pathwalk.  The only case I can think
of is handling of NFS4 referrals - they are triggered by ->d_automount()
and include NFS4 mount.  Which does internal pathwalk of its own, to get
to the root of subtree being automounted.

NFS has its own recursion protection on that path (no deeper nesting than
one level of referral traversals), but there some nesting is inevitable;
we do get another nameidata instance on stack.  And for nd_jump_link() we
need to keep track of the innermost one.

For symlinks nothing of that sort happens - they are dealt with on the same
struct nameidata.  ->total_link_count copying is there for one reason only -
we want the total amount of symlinks traversed during the pathwalk (including
the referral processing, etc.) to count towards MAXSYMLINKS check.  It could've
been moved from nameidata to task_struct, but it's cheaper to handle it that
way.

Again, nesting is *rare*.
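
To make the constructor/destructor pairing concrete, here is a simplified
stand-alone model of the pattern being described (an editor's sketch, not the
fs/namei.c code; the names only mirror the real ones):

/* Simplified model of set_nameidata()/restore_nameidata() nesting. */
struct nd_model {
	struct nd_model *saved;		/* the outer, less-nested walk (if any) */
	int total_link_count;		/* symlink budget shared across nesting */
};

static __thread struct nd_model *current_nd;	/* stand-in for current->nameidata */

static void set_nd_model(struct nd_model *p)
{
	p->saved = current_nd;
	/* carry the count so the MAXSYMLINKS check covers the whole walk,
	 * including rare nested walks such as NFS4 referral traversal */
	p->total_link_count = current_nd ? current_nd->total_link_count : 0;
	current_nd = p;		/* the innermost walk is what nd_jump_link() must see */
}

static void restore_nd_model(void)
{
	struct nd_model *now = current_nd;

	current_nd = now->saved;
	if (now->saved)
		now->saved->total_link_count = now->total_link_count;
}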

> In that case, should they both be set to 0 on set_nameidata()? This will
> mean that fd re-opening (or magic-link opening) through a
> set_nameidata() would always fail.

Huh?  set_nameidata() is done for *all* instances - it's pretty much the
constructor of that object (and restore_nameidata() - a destructor).
Everything goes through it.

And again, I'm not sure we want these fields in nameidata - IMO they belong
in open_flags.  Things like e.g. stat() don't need them at all.

Incidentally, O_PATH opening of symlinks combined with subsequent procfs
symlink traversals is worth testing - that's where the things get subtle
and that's where it's easy to get in trouble on modifications.
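
A hypothetical userspace reproducer for the interaction Al suggests testing
could look like this (an editor's sketch; the symlink name and the lack of
error handling are assumptions):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* O_PATH-open the symlink itself, without following it */
	int fd = open("./some-symlink", O_PATH | O_NOFOLLOW);
	char proc[64];
	int fd2;

	/* then traverse it again through its procfs magic-link */
	snprintf(proc, sizeof(proc), "/proc/self/fd/%d", fd);
	fd2 = open(proc, O_RDONLY);

	printf("O_PATH fd=%d, reopened through procfs fd=%d\n", fd, fd2);
	return 0;
}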


Re: [PATCH] powerpc: mm: Limit rma_size to 1TB when running without HV mode

2019-07-12 Thread Michael Ellerman
Suraj Jitindar Singh  writes:
> The virtual real mode addressing (VRMA) mechanism is used when a
> partition is using HPT (Hash Page Table) translation and performs
> real mode accesses (MSR[IR|DR] = 0) in non-hypervisor mode. In this
> mode effective address bits 0:23 are treated as zero (i.e. the access
> is aliased to 0) and the access is performed using an implicit 1TB SLB
> entry.
>
> The size of the RMA (Real Memory Area) is communicated to the guest as
> the size of the first memory region in the device tree. And because of
> the mechanism described above can be expected to not exceed 1TB. In the
> event that the host erroneously represents the RMA as being larger than
> 1TB, guest accesses in real mode to memory addresses above 1TB will be
> aliased down to below 1TB. This means that a memory access performed in
> real mode may differ to one performed in virtual mode for the same memory
> address, which would likely have unintended consequences.
>
> To avoid this outcome have the guest explicitly limit the size of the
> RMA to the current maximum, which is 1TB. This means that even if the
> first memory block is larger than 1TB, only the first 1TB should be
> accessed in real mode.
>
> Signed-off-by: Suraj Jitindar Singh 

I added:

Fixes: c3ab300ea555 ("powerpc: Add POWER9 cputable entry")
Cc: sta...@vger.kernel.org # v4.6+


Which is not exactly correct, but probably good enough?

cheers

> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
> b/arch/powerpc/mm/book3s64/hash_utils.c
> index 28ced26f2a00..4d0e2cce9cd5 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -1901,11 +1901,19 @@ void hash__setup_initial_memory_limit(phys_addr_t 
> first_memblock_base,
>*
>* For guests on platforms before POWER9, we clamp the it limit to 1G
>* to avoid some funky things such as RTAS bugs etc...
> +  * On POWER9 we limit to 1TB in case the host erroneously told us that
> +  * the RMA was >1TB. Effective address bits 0:23 are treated as zero
> +  * (meaning the access is aliased to zero i.e. addr = addr % 1TB)
> +  * for virtual real mode addressing and so it doesn't make sense to
> +  * have an area larger than 1TB as it can't be addressed.
>*/
>   if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
>   ppc64_rma_size = first_memblock_size;
>   if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
>   ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x4000);
> + else
> + ppc64_rma_size = min_t(u64, ppc64_rma_size,
> +1UL << SID_SHIFT_1T);
>  
>   /* Finally limit subsequent allocations */
>   memblock_set_current_limit(ppc64_rma_size);
> -- 
> 2.13.6


Re: [PATCH v9 05/10] namei: O_BENEATH-style path resolution flags

2019-07-12 Thread Al Viro
On Fri, Jul 12, 2019 at 01:39:24PM +0100, Al Viro wrote:
> On Fri, Jul 12, 2019 at 08:57:45PM +1000, Aleksa Sarai wrote:
> 
> > > > @@ -2350,9 +2400,11 @@ static const char *path_init(struct nameidata 
> > > > *nd, unsigned flags)
> > > > s = ERR_PTR(error);
> > > > return s;
> > > > }
> > > > -   error = dirfd_path_init(nd);
> > > > -   if (unlikely(error))
> > > > -   return ERR_PTR(error);
> > > > +   if (likely(!nd->path.mnt)) {
> > > 
> > > Is that a weird way of saying "if we hadn't already called 
> > > dirfd_path_init()"?
> > 
> > Yes. I did it to be more consistent with the other "have we got the
> > root" checks elsewhere. Is there another way you'd prefer I do it?
> 
> "Have we got the root" checks are inevitable evil; here you are making the
> control flow in a single function hard to follow.
> 
> I *think* what you are doing is
>   absolute pathname, no LOOKUP_BENEATH:
>   set_root
>   error = nd_jump_root(nd)
>   else
>   error = dirfd_path_init(nd)
>   return unlikely(error) ? ERR_PTR(error) : s;
> which should be a lot easier to follow (not to mention shorter), but I might
> be missing something in all of that.

PS: if that's what's going on, I would be tempted to turn the entire
path_init() part into this:
if (flags & LOOKUP_BENEATH)
while (*s == '/')
s++;
in the very beginning (plus the handling of nd_jump_root() prototype
change, but that belongs with nd_jump_root() change itself, obviously).
Again, I might be missing something here...


Re: [PATCH v9 05/10] namei: O_BENEATH-style path resolution flags

2019-07-12 Thread Al Viro
On Fri, Jul 12, 2019 at 08:57:45PM +1000, Aleksa Sarai wrote:

> > > @@ -2350,9 +2400,11 @@ static const char *path_init(struct nameidata *nd, 
> > > unsigned flags)
> > >   s = ERR_PTR(error);
> > >   return s;
> > >   }
> > > - error = dirfd_path_init(nd);
> > > - if (unlikely(error))
> > > - return ERR_PTR(error);
> > > + if (likely(!nd->path.mnt)) {
> > 
> > Is that a weird way of saying "if we hadn't already called 
> > dirfd_path_init()"?
> 
> Yes. I did it to be more consistent with the other "have we got the
> root" checks elsewhere. Is there another way you'd prefer I do it?

"Have we got the root" checks are inevitable evil; here you are making the
control flow in a single function hard to follow.

I *think* what you are doing is
absolute pathname, no LOOKUP_BENEATH:
set_root
error = nd_jump_root(nd)
else
error = dirfd_path_init(nd)
return unlikely(error) ? ERR_PTR(error) : s;
which should be a lot easier to follow (not to mention shorter), but I might
be missing something in all of that.


Re: [PATCH v2] powerpc/book3s/mm: Update Oops message to print the correct translation in use

2019-07-12 Thread Christophe Leroy




Le 12/07/2019 à 14:22, Michael Ellerman a écrit :

Christophe Leroy  writes:

Le 12/07/2019 à 08:25, Michael Ellerman a écrit :

"Aneesh Kumar K.V"  writes:

...

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 11caa0291254..b181d6860f28 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -250,15 +250,22 @@ static void oops_end(unsigned long flags, struct pt_regs 
*regs,
   }
   NOKPROBE_SYMBOL(oops_end);
   
+static char *get_mmu_str(void)

+{
+   if (early_radix_enabled())
+   return " MMU=Radix";
+   if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+   return " MMU=Hash";
+   return "";
+}


We don't change MMU once we're up, so just do this logic once and stash
it into a static string, rather than rechecking on every oops.


Do we really have oops so often that we have to worry about that ?


Sometimes :)

But no I don't mean it's a performance issue, it just seems simpler to
compute the value once and store it. In fact for most platforms it can
just be a static string at compile time, it's only 64-bit Book3S that
needs to do anything at runtime.


Right, but I'm sure GCC will take care of that since the function is 
static and called only once.


Christophe


cheers



Re: [PATCH v2] powerpc/book3s/mm: Update Oops message to print the correct translation in use

2019-07-12 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 12/07/2019 à 08:25, Michael Ellerman a écrit :
>> "Aneesh Kumar K.V"  writes:
...
>>> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
>>> index 11caa0291254..b181d6860f28 100644
>>> --- a/arch/powerpc/kernel/traps.c
>>> +++ b/arch/powerpc/kernel/traps.c
>>> @@ -250,15 +250,22 @@ static void oops_end(unsigned long flags, struct 
>>> pt_regs *regs,
>>>   }
>>>   NOKPROBE_SYMBOL(oops_end);
>>>   
>>> +static char *get_mmu_str(void)
>>> +{
>>> +   if (early_radix_enabled())
>>> +   return " MMU=Radix";
>>> +   if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
>>> +   return " MMU=Hash";
>>> +   return "";
>>> +}
>> 
>> We don't change MMU once we're up, so just do this logic once and stash
>> it into a static string, rather than rechecking on every oops.
>
> Do we really have oops so often that we have to worry about that ?

Sometimes :)

But no I don't mean it's a performance issue, it just seems simpler to
compute the value once and store it. In fact for most platforms it can
just be a static string at compile time, it's only 64-bit Book3S that
needs to do anything at runtime.
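
As a rough illustration of the "compute once and store it" idea (an editor's
sketch, not code from this thread; the initcall hook is an assumption):

static const char *mmu_str = "";	/* stays empty where there is nothing to report */

static int __init init_mmu_str(void)
{
	if (early_radix_enabled())
		mmu_str = " MMU=Radix";
	else if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
		mmu_str = " MMU=Hash";
	return 0;
}
early_initcall(init_mmu_str);

/* the oops path would then print mmu_str directly instead of calling
 * get_mmu_str() on every oops */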

cheers


Re: [PATCH v9 01/10] namei: obey trailing magic-link DAC permissions

2019-07-12 Thread Aleksa Sarai
On 2019-07-12, Al Viro  wrote:
> On Sun, Jul 07, 2019 at 12:57:28AM +1000, Aleksa Sarai wrote:
> > @@ -514,7 +516,14 @@ static void set_nameidata(struct nameidata *p, int 
> > dfd, struct filename *name)
> > p->stack = p->internal;
> > p->dfd = dfd;
> > p->name = name;
> > -   p->total_link_count = old ? old->total_link_count : 0;
> > +   p->total_link_count = 0;
> > +   p->acc_mode = 0;
> > +   p->opath_mask = FMODE_PATH_READ | FMODE_PATH_WRITE;
> > +   if (old) {
> > +   p->total_link_count = old->total_link_count;
> > +   p->acc_mode = old->acc_mode;
> > +   p->opath_mask = old->opath_mask;
> > +   }
> 
> Huh?  Could somebody explain why traversals of NFS4 referrals should inherit
> ->acc_mode and ->opath_mask?

I'll be honest -- I don't understand what set_nameidata() did so I just
did what I thought would be an obvious change (to just copy the
contents). I thought it was related to some aspect of the symlink stack
handling.

In that case, should they both be set to 0 on set_nameidata()? This will
mean that fd re-opening (or magic-link opening) through a
set_nameidata() would always fail.

> >  static __always_inline
> > -const char *get_link(struct nameidata *nd)
> > +const char *get_link(struct nameidata *nd, bool trailing)
> >  {
> > struct saved *last = nd->stack + nd->depth - 1;
> > struct dentry *dentry = last->link.dentry;
> > @@ -1081,6 +1134,44 @@ const char *get_link(struct nameidata *nd)
> > } else {
> > res = get(dentry, inode, &last->done);
> > }
> > +   /* If we just jumped it was because of a magic-link. */
> > +   if (unlikely(nd->flags & LOOKUP_JUMPED)) {
> [...]
> In any case, this "bool trailing" is completely wrong; whether that
> check belongs in trailing_symlink() or (some of) its callers, putting
> it into get_link() is a mistake, forced by kludgy check for procfs-style
> symlinks.

The error path for LOOKUP_JUMPED comes from the old O_BENEATH patchset,
but all of the "bool trailing" logic is definitely my gaffe (I was
quietly hoping you'd have a much better solution than the whole
get_link() thing -- it definitely felt very kludgey to write).

I will work on the suggestion in your follow-up email. Thanks!

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH v9 04/10] namei: split out nd->dfd handling to dirfd_path_init

2019-07-12 Thread Aleksa Sarai
On 2019-07-12, Aleksa Sarai  wrote:
> On 2019-07-12, Al Viro  wrote:
> > On Sun, Jul 07, 2019 at 12:57:31AM +1000, Aleksa Sarai wrote:
> > > Previously, path_init's handling of *at(dfd, ...) was only done once,
> > > but with LOOKUP_BENEATH (and LOOKUP_IN_ROOT) we have to parse the
> > > initial nd->path at different times (before or after absolute path
> > > handling) depending on whether we have been asked to scope resolution
> > > within a root.
> > 
> > >   if (*s == '/') {
> > > - set_root(nd);
> > > - if (likely(!nd_jump_root(nd)))
> > > - return s;
> > > - return ERR_PTR(-ECHILD);
> > 
> > > + if (likely(!nd->root.mnt))
> > > + set_root(nd);
> > 
> > How can we get there with non-NULL nd->root.mnt, when LOOKUP_ROOT case
> > has been already handled by that point?
> 
> Yup, you're completely right. I will remove the
>   if (!nd->root.mnt)
> in the next version.

Ah sorry, there is a reason for it -- later in the series the
LOOKUP_BENEATH case means that you might end up with a non-NULL
nd->root.mnt. If you want, I can move the addition of the conditional to
later in the series (it was easier to split the patch by-hunk back when
you originally asked me to split out dirfd_path_init()).

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH v2 0/5] mm: Enable CONFIG_NODES_SPAN_OTHER_NODES by default for NUMA

2019-07-12 Thread Michal Hocko
On Fri 12-07-19 10:56:47, Hoan Tran OS wrote:
[...]
> It would be good if we could enable it by default. Otherwise, each
> architecture would have to enable it by itself. Do you have any suggestions?

I can hardly make any suggestions when it is not really clear _why_ you
want to remove this config option in the first place. Please explain
what motivated you to make this change.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v9 04/10] namei: split out nd->dfd handling to dirfd_path_init

2019-07-12 Thread Aleksa Sarai
On 2019-07-12, Al Viro  wrote:
> On Sun, Jul 07, 2019 at 12:57:31AM +1000, Aleksa Sarai wrote:
> > Previously, path_init's handling of *at(dfd, ...) was only done once,
> > but with LOOKUP_BENEATH (and LOOKUP_IN_ROOT) we have to parse the
> > initial nd->path at different times (before or after absolute path
> > handling) depending on whether we have been asked to scope resolution
> > within a root.
> 
> > if (*s == '/') {
> > -   set_root(nd);
> > -   if (likely(!nd_jump_root(nd)))
> > -   return s;
> > -   return ERR_PTR(-ECHILD);
> 
> > +   if (likely(!nd->root.mnt))
> > +   set_root(nd);
> 
> How can we get there with non-NULL nd->root.mnt, when LOOKUP_ROOT case
> has been already handled by that point?

Yup, you're completely right. I will remove the
  if (!nd->root.mnt)
in the next version.


-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH v9 05/10] namei: O_BENEATH-style path resolution flags

2019-07-12 Thread Aleksa Sarai
On 2019-07-12, Al Viro  wrote:
> On Sun, Jul 07, 2019 at 12:57:32AM +1000, Aleksa Sarai wrote:
> > @@ -1442,8 +1464,11 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > struct inode *inode = nd->inode;
> >  
> > while (1) {
> > -   if (path_equal(&nd->path, &nd->root))
> > +   if (path_equal(&nd->path, &nd->root)) {
> > +   if (unlikely(nd->flags & LOOKUP_BENEATH))
> > +   return -EXDEV;
> 
> > @@ -1468,6 +1493,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > return -ECHILD;
> > if (&mparent->mnt == nd->path.mnt)
> > break;
> > +   if (unlikely(nd->flags & LOOKUP_XDEV))
> > +   return -EXDEV;
> > /* we know that mountpoint was pinned */
> > nd->path.dentry = mountpoint;
> > nd->path.mnt = &mparent->mnt;
> > @@ -1482,6 +1509,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > return -ECHILD;
> > if (!mounted)
> > break;
> > +   if (unlikely(nd->flags & LOOKUP_XDEV))
> > +   return -EXDEV;
> 
> Are you sure these failure exits in follow_dotdot_rcu() won't give
> suprious hard errors?

I could switch to -ECHILD for the *_rcu() checks if you'd prefer that.
Though, I'd have (probably naively) thought that you'd have already
gotten -ECHILD from the seqlock checks if there was a race during ".."
handling.

> > +   if (unlikely(nd->flags & LOOKUP_BENEATH)) {
> > +   error = dirfd_path_init(nd);
> > +   if (unlikely(error))
> > +   return ERR_PTR(error);
> > +   nd->root = nd->path;
> > +   if (!(nd->flags & LOOKUP_RCU))
> > +   path_get(&nd->root);
> > +   }
> > if (*s == '/') {
> > if (likely(!nd->root.mnt))
> > set_root(nd);
> > @@ -2350,9 +2400,11 @@ static const char *path_init(struct nameidata *nd, 
> > unsigned flags)
> > s = ERR_PTR(error);
> > return s;
> > }
> > -   error = dirfd_path_init(nd);
> > -   if (unlikely(error))
> > -   return ERR_PTR(error);
> > +   if (likely(!nd->path.mnt)) {
> 
> Is that a weird way of saying "if we hadn't already called dirfd_path_init()"?

Yes. I did it to be more consistent with the other "have we got the
root" checks elsewhere. Is there another way you'd prefer I do it?

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH v2 0/5] mm: Enable CONFIG_NODES_SPAN_OTHER_NODES by default for NUMA

2019-07-12 Thread Hoan Tran OS
Hi,

On 7/12/19 2:02 PM, Michal Hocko wrote:
> On Thu 11-07-19 23:25:44, Hoan Tran OS wrote:
>> In a NUMA layout where nodes have memory ranges that span across other nodes,
>> the mm driver can detect the memory node id incorrectly.
>>
>> For example, with layout below
>> Node 0 address:    
>> Node 1 address:    
>>
>> Note:
>>   - Memory from low to high
>>   - 0/1: Node id
>>   - x: Invalid memory of a node
>>
>> When mm probes the memory map, without CONFIG_NODES_SPAN_OTHER_NODES
>> config, mm only checks the memory validity but not the node id.
>> Because of that, Node 1 also detects the memory from node 0 as below
>> when it scans from the start address to the end address of node 1.
>>
>> Node 0 address:    
>> Node 1 address:    
>>
>> This layout could occur on any architecture. This patch enables
>> CONFIG_NODES_SPAN_OTHER_NODES by default for NUMA to fix this issue.
> 
> Yes it can occur on any arch but most sane platforms simply do not
> overlap physical ranges. So I do not really see any reason to
> unconditionally enable the config for everybody. What is an advantage?
> 

As I observed from the arch folder, there are 9 architectures that support the NUMA config.

./arch/ia64/Kconfig:387:config NUMA
./arch/powerpc/Kconfig:582:config NUMA
./arch/sparc/Kconfig:281:config NUMA
./arch/alpha/Kconfig:557:config NUMA
./arch/sh/mm/Kconfig:112:config NUMA
./arch/arm64/Kconfig:841:config NUMA
./arch/x86/Kconfig:1531:config NUMA
./arch/mips/Kconfig:2646:config NUMA
./arch/s390/Kconfig:441:config NUMA

And there are 5 architectures that enable CONFIG_NODES_SPAN_OTHER_NODES with NUMA:

arch/powerpc/Kconfig:637:config NODES_SPAN_OTHER_NODES
arch/sparc/Kconfig:299:config NODES_SPAN_OTHER_NODES
arch/x86/Kconfig:1575:config NODES_SPAN_OTHER_NODES
arch/s390/Kconfig:446:config NODES_SPAN_OTHER_NODES
arch/arm64 (which I intended to enable in the original patch)

It would be good if we could enable it by default. Otherwise, each
architecture would have to enable it by itself. Do you have any suggestions?

Thanks
Hoan




Re: [PATCH V2] mm/ioremap: Probe platform for p4d huge map support

2019-07-12 Thread Anshuman Khandual



On 07/12/2019 12:37 PM, Michael Ellerman wrote:
> Anshuman Khandual  writes:
>> On 07/03/2019 04:36 AM, Andrew Morton wrote:
>>> On Fri, 28 Jun 2019 10:50:31 +0530 Anshuman Khandual 
>>>  wrote:
>>>
 Finishing up what the commit c2febafc67734a ("mm: convert generic code to
 5-level paging") started out while levelling up P4D huge mapping support
 at par with PUD and PMD. A new arch call back arch_ioremap_p4d_supported()
 is being added which just maintains status quo (P4D huge map not supported)
 on x86, arm64 and powerpc.
>>>
>>> Does this have any runtime effects?  If so, what are they and why?  If
>>> not, what's the actual point?
>>
>> It just finishes up what the previous commit c2febafc67734a ("mm: convert
>> generic code to 5-level paging") left off with respect to p4d-based huge page
>> enablement for ioremap. When HAVE_ARCH_HUGE_VMAP is enabled it's just a simple
>> check from the arch about the support, hence runtime effects are minimal.
> 
> The return value of arch_ioremap_p4d_supported() is stored in the
> variable ioremap_p4d_capable which is then returned by
> ioremap_p4d_enabled().
> 
> That is used by ioremap_try_huge_p4d() called from ioremap_p4d_range()
> from ioremap_page_range().

That is right.

> 
> The runtime effect is that it prevents ioremap_page_range() from trying
> to create huge mappings at the p4d level on arches that don't support
> it.

But now it does so only after first checking with an arch callback. Previously
p4d huge mappings were disabled on all platforms, as ioremap_p4d_capable
remained clear throughout, being a static.


[PATCH kernel v4 3/4] powerpc/powernv/ioda2: Allocate TCE table levels on demand for default DMA window

2019-07-12 Thread Alexey Kardashevskiy
We allocate only the first level of multilevel TCE tables for KVM
already (alloc_userspace_copy==true), and the rest is allocated on demand.
This is not enabled though for bare metal.

This removes the KVM limitation (implicit, via the alloc_userspace_copy
parameter) and always allocates just the first level. The on-demand
allocation of missing levels is already implemented.

Since from now on a DMA map might happen with interrupts disabled, this
allocates TCEs with GFP_ATOMIC; otherwise lockdep reports errors [1].
In practice just a single page is allocated there so chances for failure
are quite low.

To save time when creating a new clean table, this skips non-allocated
indirect TCE entries in pnv_tce_free just like we already do in
the VFIO IOMMU TCE driver.

This changes the default level number from 1 to 2 to reduce the amount
of memory required for the default 32bit DMA window at boot time.
The default window size is up to 2GB, which requires 4MB of TCEs; that is
unlikely to be used entirely (or at all) as most devices these days are
64bit capable, so by switching to 2 levels by default we save 4032KB of
RAM per device.

While at it, add __GFP_NOWARN to alloc_pages_node() as userspace
can trigger this path via VFIO, see the failure and try creating a table
again with different parameters, which might succeed.
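
A minimal sketch of the allocation behaviour described above (an editor's
illustration; the real helper is pnv_alloc_tce_level() in pci-ioda-tce.c,
and the order calculation here is an assumption):

static __be64 *alloc_tce_level_sketch(int nid, unsigned int shift)
{
	struct page *tce_mem;
	__be64 *addr;

	/* GFP_ATOMIC: a DMA map may now run with interrupts disabled;
	 * __GFP_NOWARN: userspace can legitimately make this fail via VFIO */
	tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
				   shift - PAGE_SHIFT);
	if (!tce_mem)
		return NULL;

	addr = page_address(tce_mem);
	memset(addr, 0, 1UL << shift);	/* a clean level: all TCEs invalid */

	return addr;
}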

[1]:
===
BUG: sleeping function called from invalid context at mm/page_alloc.c:4596
in_atomic(): 1, irqs_disabled(): 1, pid: 1038, name: scsi_eh_1
2 locks held by scsi_eh_1/1038:
 #0: 5efd659a (>eh_mutex){+.+.}, at: ata_eh_acquire+0x34/0x80
 #1: 06cf56a6 (&(>lock)->rlock){}, at: 
ata_exec_internal_sg+0xb0/0x5c0
irq event stamp: 500
hardirqs last  enabled at (499): [] 
_raw_spin_unlock_irqrestore+0x94/0xd0
hardirqs last disabled at (500): [] 
_raw_spin_lock_irqsave+0x44/0x120
softirqs last  enabled at (0): [] 
copy_process.isra.4.part.5+0x640/0x1a80
softirqs last disabled at (0): [<>] 0x0
CPU: 73 PID: 1038 Comm: scsi_eh_1 Not tainted 5.2.0-rc6-le_nv2_aikATfstn1-p1 
#634
Call Trace:
[c03d064cef50] [c0c8e6c4] dump_stack+0xe8/0x164 (unreliable)
[c03d064cefa0] [c014ed78] ___might_sleep+0x2f8/0x310
[c03d064cf020] [c03ca084] __alloc_pages_nodemask+0x2a4/0x1560
[c03d064cf220] [c00c2530] pnv_alloc_tce_level.isra.0+0x90/0x130
[c03d064cf290] [c00c2888] pnv_tce+0x128/0x3b0
[c03d064cf360] [c00c2c00] pnv_tce_build+0xb0/0xf0
[c03d064cf3c0] [c00bbd9c] pnv_ioda2_tce_build+0x3c/0xb0
[c03d064cf400] [c004cfe0] ppc_iommu_map_sg+0x210/0x550
[c03d064cf510] [c004b7a4] dma_iommu_map_sg+0x74/0xb0
[c03d064cf530] [c0863944] ata_qc_issue+0x134/0x470
[c03d064cf5b0] [c0863ec4] ata_exec_internal_sg+0x244/0x5c0
[c03d064cf700] [c08642d0] ata_exec_internal+0x90/0xe0
[c03d064cf780] [c08650ac] ata_dev_read_id+0x2ec/0x640
[c03d064cf8d0] [c0878e28] ata_eh_recover+0x948/0x16d0
[c03d064cfa10] [c087d760] sata_pmp_error_handler+0x480/0xbf0
[c03d064cfbc0] [c0884624] ahci_error_handler+0x74/0xe0
[c03d064cfbf0] [c0879fa8] ata_scsi_port_error_handler+0x2d8/0x7c0
[c03d064cfca0] [c087a544] ata_scsi_error+0xb4/0x100
[c03d064cfd00] [c0802450] scsi_error_handler+0x120/0x510
[c03d064cfdb0] [c0140c48] kthread+0x1b8/0x1c0
[c03d064cfe20] [c000bd8c] ret_from_kernel_thread+0x5c/0x70
ata1: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
irq event stamp: 2305


hardirqs last  enabled at (2305): [] 
fast_exc_return_irq+0x28/0x34
hardirqs last disabled at (2303): [] __do_softirq+0x4a0/0x654
WARNING: possible irq lock inversion dependency detected
5.2.0-rc6-le_nv2_aikATfstn1-p1 #634 Tainted: GW
softirqs last  enabled at (2304): [] __do_softirq+0x524/0x654
softirqs last disabled at (2297): [] irq_exit+0x128/0x180

swapper/0/0 just changed the state of lock:
06cf56a6 (&(>lock)->rlock){-...}, at: 
ahci_single_level_irq_intr+0xac/0x120
but this lock took another, HARDIRQ-unsafe lock in the past:
 (fs_reclaim){+.+.}


and interrupts could create inverse lock ordering between them.


other info that might help us debug this:
 Possible interrupt unsafe locking scenario:

   CPU0CPU1
   
  lock(fs_reclaim);
   local_irq_disable();
   lock(&(>lock)->rlock);
   lock(fs_reclaim);
  
lock(&(>lock)->rlock);

 *** DEADLOCK ***

no locks held by swapper/0/0.

the shortest dependencies between 2nd lock and 1st lock:
 -> (fs_reclaim){+.+.} ops: 167579 {
HARDIRQ-ON-W at:
  lock_acquire+0xf8/0x2a0
  fs_reclaim_acquire.part.23+0x44/0x60
  

[PATCH kernel v4 2/4] powerpc/iommu: Allow bypass-only for DMA

2019-07-12 Thread Alexey Kardashevskiy
POWER8 and newer support a bypass mode which maps all host memory to
PCI buses so an IOMMU table is not always required. However if we fail to
create such a table, the DMA setup fails and the kernel does not boot.

This skips the 32bit DMA setup check if the bypass can be selected.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---

This minor thing helped me debugging next 2 patches so it can help
somebody else too.
---
 arch/powerpc/kernel/dma-iommu.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index a0879674a9c8..c963d704fa31 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -122,18 +122,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 {
struct iommu_table *tbl = get_iommu_table_base(dev);
 
-   if (!tbl) {
-   dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx"
-   ", table unavailable\n", mask);
-   return 0;
-   }
-
if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
dev->archdata.iommu_bypass = true;
dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
return 1;
}
 
+   if (!tbl) {
+   dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, 
table unavailable\n", mask);
+   return 0;
+   }
+
if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
dev_info(dev, "Warning: IOMMU offset too big for device 
mask\n");
dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
-- 
2.17.1



[PATCH kernel v4 4/4] powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU pages

2019-07-12 Thread Alexey Kardashevskiy
At the moment we create a small window only for 32bit devices, the window
maps 0..2GB of the PCI space only. For other devices we either use
a sketchy bypass or hardware bypass but the former can only work if
the amount of RAM is no bigger than the device's DMA mask and the latter
requires devices to support at least 59bit DMA.

This extends the default DMA window to the maximum size possible to allow
a wider DMA mask than just 32bit. The default window size is now limited
by the iommu_table::it_map allocation bitmap, which is a contiguous
array, 1 bit per an IOMMU page.

This increases the default IOMMU page size from hard coded 4K to
the system page size to allow wider DMA masks.

This increases the level number so as to not exceed the max order allocation
limit per TCE level. At the same time, this keeps the minimum number of levels
at 2 in order to save memory.

As the extended window now overlaps the 32bit MMIO region, this adds
an area reservation to iommu_init_table().

After this change the default window size is 0x80000000000 == 1<<43, so
devices limited to a DMA mask smaller than the amount of system RAM can
still use more than just 2GB of memory for DMA.
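
As a rough consistency check (editor's note; the 16MB single it_map allocation
is an assumption): 16MB of bitmap is 2^27 bits, i.e. 2^27 mappable IOMMU pages,
and 2^27 * 64K = 2^43 bytes = 0x80000000000, matching the window size above.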

With the on-demand allocation of indirect TCE table levels enabled and
2 levels, the first TCE level size is just
1<
---
Changes:
v4:
* fixed take/release ownership handlers
* fixed reserved region for tables with it_offset!=0 (this is not going
to be exploited here but still this is a correct behavior)

v3:
* fixed tce levels calculation

v2:
* adjusted level number to the max order
---
 arch/powerpc/include/asm/iommu.h  |  8 ++-
 arch/powerpc/kernel/iommu.c   | 74 ---
 arch/powerpc/platforms/powernv/pci-ioda.c | 40 ++--
 3 files changed, 92 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 18d342b815e4..a527a5fe1b01 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -111,6 +111,8 @@ struct iommu_table {
struct iommu_table_ops *it_ops;
struct krefit_kref;
int it_nid;
+   unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */
+   unsigned long it_reserved_end;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
@@ -149,8 +151,10 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
 /* Initializes an iommu_table based in values set in the passed-in
  * structure
  */
-extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
-   int nid);
+extern struct iommu_table *iommu_init_table_res(struct iommu_table *tbl,
+   int nid, unsigned long res_start, unsigned long res_end);
+#define iommu_init_table(tbl, nid) iommu_init_table_res((tbl), (nid), 0, 0)
+
 #define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
 struct iommu_table_group;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0a67ce9f827e..2b501dc15352 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -633,11 +633,54 @@ static void iommu_table_clear(struct iommu_table *tbl)
 #endif
 }
 
+static void iommu_table_reserve_pages(struct iommu_table *tbl,
+   unsigned long res_start, unsigned long res_end)
+{
+   int i;
+
+   WARN_ON_ONCE(res_end >= res_start);
+   /*
+* Reserve page 0 so it will not be used for any mappings.
+* This avoids buggy drivers that consider page 0 to be invalid
+* to crash the machine or even lose data.
+*/
+   if (tbl->it_offset == 0)
+   set_bit(0, tbl->it_map);
+
+   tbl->it_reserved_start = res_start;
+   tbl->it_reserved_end = res_end;
+
+   /* Check if res_start..res_end isn't empty and overlaps the table */
+   if (res_start && res_end &&
+   (tbl->it_offset + tbl->it_size < res_start ||
+res_end < tbl->it_offset))
+   return;
+
+   for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+   set_bit(i - tbl->it_offset, tbl->it_map);
+}
+
+static void iommu_table_release_pages(struct iommu_table *tbl)
+{
+   int i;
+
+   /*
+* In case we have reserved the first bit, we should not emit
+* the warning below.
+*/
+   if (tbl->it_offset == 0)
+   clear_bit(0, tbl->it_map);
+
+   for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
+   clear_bit(i - tbl->it_offset, tbl->it_map);
+}
+
 /*
  * Build a iommu_table structure.  This contains a bit map which
  * is used to manage allocation of the tce space.
  */
-struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
+struct iommu_table *iommu_init_table_res(struct iommu_table *tbl, int nid,
+   unsigned long res_start, unsigned long res_end)
 {
unsigned long sz;
static int welcomed = 0;
@@ -656,13 +699,7 @@ struct iommu_table 

[PATCH kernel v4 1/4] powerpc/powernv/ioda: Fix race in TCE level allocation

2019-07-12 Thread Alexey Kardashevskiy
pnv_tce() returns a pointer to a TCE entry and originally a TCE table
would be pre-allocated. For the default case of 2GB window the table
needs only a single level and that is fine. However if more levels are
requested, it is possible to get a race when 2 threads want a pointer
to a TCE entry from the same page of TCEs.

This adds cmpxchg to handle the race. Note that once TCE is non-zero,
it cannot become zero again.

CC: sta...@vger.kernel.org # v4.19+
Fixes: a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on 
demand")
Signed-off-by: Alexey Kardashevskiy 
---

The race occurs about 30 times in the first 3 minutes of copying files
via rsync and that's about it.

This fixes EEH's from
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=110810

---
Changes:
v2:
* replaced spin_lock with cmpxchg+readonce
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..8d6569590161 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -48,6 +48,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int 
shift)
return addr;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+   unsigned long size, unsigned int levels);
+
 static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool 
alloc)
 {
__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -57,9 +60,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, 
long idx, bool alloc)
 
while (level) {
int n = (idx & mask) >> (level * shift);
-   unsigned long tce;
+   unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
 
-   if (tmp[n] == 0) {
+   if (!tce) {
__be64 *tmp2;
 
if (!alloc)
@@ -70,10 +73,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, 
long idx, bool alloc)
if (!tmp2)
return NULL;
 
-   tmp[n] = cpu_to_be64(__pa(tmp2) |
-   TCE_PCI_READ | TCE_PCI_WRITE);
+   tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+   oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+   cpu_to_be64(tce)));
+   if (oldtce) {
+   pnv_pci_ioda2_table_do_free_pages(tmp2,
+   ilog2(tbl->it_level_size) + 3, 1);
+   tce = oldtce;
+   }
}
-   tce = be64_to_cpu(tmp[n]);
 
tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
idx &= ~mask;
-- 
2.17.1



[PATCH kernel v4 0/4 repost] powerpc/ioda2: Yet another attempt to allow DMA masks between 32 and 59

2019-07-12 Thread Alexey Kardashevskiy
This is an attempt to allow DMA masks between 32..59 which are not large
enough to use either a PHB3 bypass mode or a sketchy bypass. Depending
on the max order, up to 40 is usually available.

Changelogs are in the patches.


This is based on sha1
a2b6f26c264e Christophe Leroy "powerpc/module64: Use symbolic instructions 
names.".

Please comment. Thanks.



Alexey Kardashevskiy (4):
  powerpc/powernv/ioda: Fix race in TCE level allocation
  powerpc/iommu: Allow bypass-only for DMA
  powerpc/powernv/ioda2: Allocate TCE table levels on demand for default
DMA window
  powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU
pages

 arch/powerpc/include/asm/iommu.h  |  8 +-
 arch/powerpc/platforms/powernv/pci.h  |  2 +-
 arch/powerpc/kernel/dma-iommu.c   | 11 ++-
 arch/powerpc/kernel/iommu.c   | 74 +--
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 38 ++
 arch/powerpc/platforms/powernv/pci-ioda.c | 40 --
 6 files changed, 121 insertions(+), 52 deletions(-)

-- 
2.17.1



Re: [RFC PATCH kernel] powerpc/xive: Drop deregistered irqs

2019-07-12 Thread Alexey Kardashevskiy




On 12/07/2019 18:29, Benjamin Herrenschmidt wrote:

On Fri, 2019-07-12 at 18:20 +1000, Alexey Kardashevskiy wrote:


diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 082c7e1c20f0..65742e280337 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -148,8 +148,12 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool 
just_peek)
irq = xive_read_eq(&xc->queue[prio], just_peek);
  
  		/* Found something ? That's it */

-   if (irq)
-   break;
+   if (irq) {
+   /* Another CPU may have shut this irq down, check it */
+   if (irq_to_desc(irq))


What if it gets deregistered here  ?


Yeah that is the problem.




+   break;
+   irq = 0;
+   }
  
  		/* Clear pending bits */

xc->pending_prio &= ~(1 << prio);


Wouldn't it be better to check the return value from generic_handle_irq
instead ?


Where exactly, here?
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/kernel/irq.c#n614

If so, then in order to do EOI, I'll need the desc which is gone, or I 
am missing the point?



--
Alexey


Re: [PATCH V2] mm/ioremap: Probe platform for p4d huge map support

2019-07-12 Thread Stephen Rothwell
Hi all,

On Fri, 12 Jul 2019 17:07:48 +1000 Michael Ellerman  wrote:
>
> The return value of arch_ioremap_p4d_supported() is stored in the
> variable ioremap_p4d_capable which is then returned by
> ioremap_p4d_enabled().
> 
> That is used by ioremap_try_huge_p4d() called from ioremap_p4d_range()
> from ioremap_page_range().

When I first saw this, I wondered if we expect
arch_ioremap_p4d_supported() to ever return something that is not
computable at compile time.  If not, why do we have this level of
redirection?  Why not just make it a static inline function defined in
an arch specific include file (or even just a CONFIG_ option)?

In particular, ioremap_p4d_enabled() either returns ioremap_p4d_capable
or 0 and is static to one file and has one call site ...  The same is
true of ioremap_pud_enabled() and ioremap_pmd_enabled().
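
A hypothetical arch-header variant of what is being suggested might look like
this (an editor's sketch, not a posted patch; the header location is an
assumption):

/* e.g. in an arch-specific header such as asm/ioremap.h */
static inline int arch_ioremap_p4d_supported(void)
{
	return 0;	/* no architecture supports p4d huge ioremap today */
}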
-- 
Cheers,
Stephen Rothwell


pgp114b1w_u50.pgp
Description: OpenPGP digital signature


Re: [RFC PATCH kernel] powerpc/xive: Drop deregistered irqs

2019-07-12 Thread Benjamin Herrenschmidt
On Fri, 2019-07-12 at 18:20 +1000, Alexey Kardashevskiy wrote:
> 
> diff --git a/arch/powerpc/sysdev/xive/common.c 
> b/arch/powerpc/sysdev/xive/common.c
> index 082c7e1c20f0..65742e280337 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -148,8 +148,12 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, 
> bool just_peek)
>   irq = xive_read_eq(&xc->queue[prio], just_peek);
>  
>   /* Found something ? That's it */
> - if (irq)
> - break;
> + if (irq) {
> + /* Another CPU may have shut this irq down, check it */
> + if (irq_to_desc(irq))

What if it gets deregistered here  ?

> + break;
> + irq = 0;
> + }
>  
>   /* Clear pending bits */
>   xc->pending_prio &= ~(1 << prio);

Wouldn't it be better to check the return value from generic_handle_irq
instead ?

Cheers,
Ben.




[RFC PATCH kernel] powerpc/xive: Drop deregistered irqs

2019-07-12 Thread Alexey Kardashevskiy
There is a race between releasing an irq on one cpu and fetching it
from XIVE on another cpu as there does not seem to be any locking between
these, probably because xive_irq_chip::irq_shutdown() is supposed to
remove the irq from all queues in the system which it does not do.

As a result, when such a released irq appears in a queue, we take it
from the queue but do not change the current priority on that cpu; and
since there is no handler for the irq, EOI is never called and the cpu's
current priority remains elevated (7 vs. 0xff==unmasked). If another irq
is assigned to the same cpu, then that device stops working until irq
is moved to another cpu or the device is reset.

This checks whether the irq is still registered; if not, it assumes no valid
irq was fetched from the loop and, if there is none left, it continues to
the irq==0 case (not visible in this patch) and sets the priority to 0xff,
which is basically unmasking. This calls irq_to_desc() on a hot path now,
which is a radix tree lookup; hopefully this won't be noticeable as
that tree is quite small.

Signed-off-by: Alexey Kardashevskiy 
---

Found it on P9 system with:
- a host with 8 cpus online
- a boot disk on ahci with its msix on cpu#0
- a guest with 2xGPUs + 6xNVLink + 4 cpus
- GPU#0 from the guest is bound to the same cpu#0.

Killing a guest killed ahci and therefore the host because of the race.
Note that VFIO masks interrupts first and only then resets the device.

Alternatives:

1. Fix xive_irq_chip::irq_shutdown() to walk through all cpu queues and
drop deregistered irqs.

2. Exploit chip->irq_get_irqchip_state function from
62e0468650c30f0298 "genirq: Add optional hardware synchronization for shutdown".

Both require deep XIVE knowledge which I do not have.
---
 arch/powerpc/sysdev/xive/common.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 082c7e1c20f0..65742e280337 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -148,8 +148,12 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool 
just_peek)
irq = xive_read_eq(&xc->queue[prio], just_peek);
 
/* Found something ? That's it */
-   if (irq)
-   break;
+   if (irq) {
+   /* Another CPU may have shut this irq down, check it */
+   if (irq_to_desc(irq))
+   break;
+   irq = 0;
+   }
 
/* Clear pending bits */
xc->pending_prio &= ~(1 << prio);
-- 
2.17.1



Re: [PATCH 2/3] DMA mapping: Move SME handling to x86-specific files

2019-07-12 Thread Christoph Hellwig
Honestly I think this code should go away without any replacement.
There is no reason why we should have a special debug printk just
for one specific reason why there is a requirement for a large DMA
mask.


Re: [PATCH V2] mm/ioremap: Probe platform for p4d huge map support

2019-07-12 Thread Michael Ellerman
Anshuman Khandual  writes:
> On 07/03/2019 04:36 AM, Andrew Morton wrote:
>> On Fri, 28 Jun 2019 10:50:31 +0530 Anshuman Khandual 
>>  wrote:
>> 
>>> Finishing up what the commit c2febafc67734a ("mm: convert generic code to
>>> 5-level paging") started out while levelling up P4D huge mapping support
>>> at par with PUD and PMD. A new arch call back arch_ioremap_p4d_supported()
>>> is being added which just maintains status quo (P4D huge map not supported)
>>> on x86, arm64 and powerpc.
>> 
>> Does this have any runtime effects?  If so, what are they and why?  If
>> not, what's the actual point?
>
> It just finishes up what the previous commit c2febafc67734a ("mm: convert
> generic code to 5-level paging") left off with respect to p4d-based huge page
> enablement for ioremap. When HAVE_ARCH_HUGE_VMAP is enabled it's just a simple
> check from the arch about the support, hence runtime effects are minimal.

The return value of arch_ioremap_p4d_supported() is stored in the
variable ioremap_p4d_capable which is then returned by
ioremap_p4d_enabled().

That is used by ioremap_try_huge_p4d() called from ioremap_p4d_range()
from ioremap_page_range().

The runtime effect is that it prevents ioremap_page_range() from trying
to create huge mappings at the p4d level on arches that don't support
it.
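
For reference, the gating described above boils down to something like the
following (an editor's simplification of the lib/ioremap.c pattern, not an
exact copy):

static int __read_mostly ioremap_p4d_capable;

void __init ioremap_huge_init(void)
{
	if (arch_ioremap_p4d_supported())
		ioremap_p4d_capable = 1;
}

static inline int ioremap_p4d_enabled(void)
{
	return ioremap_p4d_capable;
}

/* ioremap_p4d_range() only attempts a huge p4d mapping when
 * ioremap_p4d_enabled() returns non-zero */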

cheers


[RFC v4 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled

2019-07-12 Thread Abhishek Goel
The disable callback can be used to compute timeout for other states
whenever a state is enabled or disabled. We store the computed timeout
in "timeout" defined in cpuidle state strucure. So, we compute timeout
only when some state is enabled or disabled and not every time in the
fast idle path.
We also use the computed timeout to get timeout for snooze, thus getting
rid of get_snooze_timeout for snooze loop.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 35 +++
 include/linux/cpuidle.h   |  1 +
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 17e20e408ffe..29add322d0c4 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -45,7 +45,6 @@ struct stop_psscr_table {
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
 static u64 default_snooze_timeout __read_mostly;
-static bool snooze_timeout_en __read_mostly;
 
 static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
@@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
return 0;
 }
 
-static u64 get_snooze_timeout(struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index)
+static void pnv_disable_callback(struct cpuidle_device *dev,
+struct cpuidle_driver *drv)
 {
int i;
 
-   if (unlikely(!snooze_timeout_en))
-   return default_snooze_timeout;
-
-   for (i = index + 1; i < drv->state_count; i++) {
-   struct cpuidle_state *s = &drv->states[i];
-   struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-   if (s->disabled || su->disable)
-   continue;
-
-   return s->target_residency * tb_ticks_per_usec;
-   }
-
-   return default_snooze_timeout;
+   for (i = 0; i < drv->state_count; i++)
+   drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i);
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
@@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev,
int index)
 {
u64 snooze_exit_time;
+   u64 snooze_timeout = drv->states[index].timeout;
+
+   if (!snooze_timeout)
+   snooze_timeout = default_snooze_timeout;
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
local_irq_enable();
 
-   snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+   snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+   if (get_tb() > snooze_exit_time) {
/*
 * Task has not woken up but we are exiting the polling
 * loop anyway. Require a barrier after polling is
@@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev,
u64 timeout_tb;
bool forced_wakeup = false;
 
-   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   timeout_tb = drv->states[index].timeout;
 
/* Ensure that the timeout is at least one microsecond
 * greater than current decrement value. Else, we will
@@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void)
 */
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
+   drv->disable_callback = pnv_disable_callback;
 
return 0;
 }
@@ -422,8 +413,6 @@ static int powernv_idle_probe(void)
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
-   if (max_idle_state > 1)
-   snooze_timeout_en = true;
} else
return -ENODEV;
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8a0e54bd0d5d..31662b657b9c 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned inttarget_residency; /* in US */
booldisabled; /* disabled on all CPUs */
+   unsigned long long timeout; /* timeout for exiting out of a state */
 
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
-- 
2.17.1



[RFC v4 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled

2019-07-12 Thread Abhishek Goel
To forcefully wake up a CPU, we need to compute the timeout in the fast idle
path, but a state may be enabled or disabled at any time and there was no
feedback to the driver when that happened.
This patch adds a callback whenever a state_usage records a store to the
disable attribute.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/sysfs.c | 15 ++-
 include/linux/cpuidle.h |  4 
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index eb20adb5de23..141671a53967 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -415,8 +415,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, 
struct attribute *attr,
struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
 
-   if (cattr->store)
+   if (cattr->store) {
ret = cattr->store(state, state_usage, buf, size);
+   if (ret == size &&
+   strncmp(cattr->attr.name, "disable",
+   strlen("disable"))) {
+   struct kobject *cpuidle_kobj = kobj->parent;
+   struct cpuidle_device *dev =
+   to_cpuidle_device(cpuidle_kobj);
+   struct cpuidle_driver *drv =
+   cpuidle_get_cpu_driver(dev);
+
+   if (drv->disable_callback)
+   drv->disable_callback(dev, drv);
+   }
+   }
 
return ret;
 }
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb9a0db89f1a..8a0e54bd0d5d 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -119,6 +119,10 @@ struct cpuidle_driver {
 
/* the driver handles the cpus in cpumask */
struct cpumask  *cpumask;
+
+   void (*disable_callback)(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv);
+
 };
 
 #ifdef CONFIG_CPU_IDLE
-- 
2.17.1



[PATCH v4 1/3] cpuidle-powernv : forced wakeup for stop states

2019-07-12 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up in the shallow state.

This is problematic, when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system. As we might get
stuck into shallow states for hours, in absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, thus firing up a timer that will
forcefully wake up the CPU. A few such iterations will essentially train the
governor to select a deeper state for that CPU, as the timer here
corresponds to the next available cpuidle state's residency. Thus, the CPU
will eventually end up in the deepest possible state.

Signed-off-by: Abhishek Goel 
---

Auto-promotion
  v1 : started as auto promotion logic for cpuidle states in generic
driver
  v2 : Removed timeout_needed and rebased the code to upstream kernel
Forced-wakeup
  v1 : New patch with name of forced wakeup started
  v2 : Extending the forced wakeup logic for all states. Setting the
decrementer instead of queuing up a hrtimer to implement the logic.
  v3 : Cleanly handle setting/resetting of decrementer so as to not break
irq work
  v4 : Changed type and name of set/reset decrementer fucntion
   Handled irq_work_pending in try_set_dec_before_idle

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 +++
 drivers/cpuidle/cpuidle-powernv.c | 40 
 3 files changed, 85 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 54f4ec1f9fab..294a472ce161 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -188,6 +188,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
+extern bool try_set_dec_before_idle(u64 timeout);
+extern void try_reset_dec_after_idle(void);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 694522308cd5..d004c0d8e099 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,49 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+/*
+ * This function tries setting decrementer before entering into idle.
+ * Returns true if we have reprogrammed the decrementer for idle.
+ * Returns false if the decrementer is unchanged.
+ */
+bool try_set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   if (now + timeout > *next_tb)
+   return false;
+
+   set_dec(timeout);
+   if (test_irq_work_pending())
+   set_dec(1);
+
+   return true;
+}
+
+/*
+ * This function gets called if we have set decrementer before
+ * entering into idle. It tries to reset/restore the decrementer
+ * to its original value.
+ */
+void try_reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;
+
+   set_dec(*next_tb - now);
+   if (test_irq_work_pending())
+   set_dec(1);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe212b3..17e20e408ffe 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -46,6 +47,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   continue;
+
+   return (s->target_residency + 2 * s->exit_latency) *
+   

[PATCH v4 0/3] Forced-wakeup for stop states on Powernv

2019-07-12 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter into based on heuristics that depend on the idle history on
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up in the shallow state.

Motivation
--
In case of POWER, this is problematic, when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system. As
we might get stuck into shallow states even for hours, in absence of ticks
or interrupts.

To address this, we forcefully wake up the CPU by setting the decrementer.
The decrementer is set to a value that corresponds to the residency of
the next available state, thus firing up a timer that will forcefully
wake up the CPU. A few such iterations will essentially train the governor to
select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. Thus, the CPU will eventually end up
in the deepest possible state and we won't get stuck in a shallow state
for a long duration.

Experiment
--
For earlier versions, when this feature was meant to be only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream kernel,
it could take even more than a second to get out of stop0_lite.

case 2 : With tick retained in an upstream kernel -

Generally, we have a sched tick at 4ms (CONFIG_HZ = 250). Ideally I expected
it to take 8 sched ticks to get out of stop0_lite. Experimentally, the
observation was:

=============================================
sample      min      max      99percentile
20          4ms      12ms     4ms
=============================================

It would take at least one sched tick to get out of stop0_lite.

case 3 :  With this patch (not stopping tick, but explicitly queuing a
  timer)


sample  min max 99percentile

20  144us   192us   144us



Description of current implementation
-

We calculate timeout for the current idle state as the residency value
of the next available idle state. If the decrementer is set to be
greater than this timeout, we update the decrementer value with the
residency of next available idle state. Thus, essentially training the
governor to select the next available deeper state until we reach the
deepest state. Hence, we won't get stuck unnecessarily in shallow states
for longer duration.


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for shallow lite state in generic cpuidle driver.

v2 : Removed timeout_needed and rebased to current
upstream kernel

Then,
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up a hrtimer to implement the
logic.

v3 : 1) Cleanly handle setting the decrementer after exiting out of stop
   states.
 2) Added a disable_callback feature to compute timeout whenever a
state is enabled or disabled instead of computing every time in the fast
idle path.
 3) Use disable callback to recompute timeout whenever state usage
is changed for a state. Also, cleaned up the get_snooze_timeout
function.

v4 : Changed the type and name of the set/reset decrementer functions.
Handled pending irq work in try_set_dec_before_idle.
No change in patches 2 and 3.

Abhishek Goel (3):
  cpuidle-powernv : forced wakeup for stop states
  cpuidle : Add callback whenever a state usage is enabled/disabled
  cpuidle-powernv : Recompute the idle-state timeouts when state usage
is enabled/disabled

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 
 drivers/cpuidle/cpuidle-powernv.c | 55 +++
 drivers/cpuidle/sysfs.c   | 15 -
 include/linux/cpuidle.h   |  5 +++
 5 files changed, 106 insertions(+), 14 deletions(-)

-- 
2.17.1



Re: [PATCH v2 0/5] mm: Enable CONFIG_NODES_SPAN_OTHER_NODES by default for NUMA

2019-07-12 Thread Michal Hocko
On Thu 11-07-19 23:25:44, Hoan Tran OS wrote:
> In NUMA layout which nodes have memory ranges that span across other nodes,
> the mm driver can detect the memory node id incorrectly.
> 
> For example, with layout below
> Node 0 address:    
> Node 1 address:    
> 
> Note:
>  - Memory from low to high
>  - 0/1: Node id
>  - x: Invalid memory of a node
> 
> When mm probes the memory map, without CONFIG_NODES_SPAN_OTHER_NODES
> config, mm only checks the memory validity but not the node id.
> Because of that, Node 1 also detects the memory from node 0 as below
> when it scans from the start address to the end address of node 1.
> 
> Node 0 address:    
> Node 1 address:    
> 
> This layout could occur on any architecture. This patch enables
> CONFIG_NODES_SPAN_OTHER_NODES by default for NUMA to fix this issue.

Yes, it can occur on any arch, but most sane platforms simply do not have
overlapping physical ranges. So I do not really see any reason to
unconditionally enable the config for everybody. What is the advantage?
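
For reference, the check in question lives in mm/page_alloc.c; the
following is a paraphrased sketch of the ~v5.2 logic, not a verbatim quote
of the tree:

    /* Sketch of mm/page_alloc.c behaviour, paraphrased. */
    #ifdef CONFIG_NODES_SPAN_OTHER_NODES
    /* Ask memblock which node the pfn really belongs to. */
    static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
    {
        return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
    }
    #else
    /* Without the option, any valid pfn is assumed to belong to "node". */
    static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
    {
        return true;
    }
    #endif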

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 3/3] powerpc/module64: Use symbolic instructions names.

2019-07-12 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 08/07/2019 à 02:56, Michael Ellerman a écrit :
>> Christophe Leroy  writes:
>>> To increase readability/maintainability, replace hard coded
>>> instructions values by symbolic names.
>>>
>>> Signed-off-by: Christophe Leroy 
>>> ---
>>> v3: fixed warning by adding () in an 'if' around X | Y (unlike said in v2 
>>> history, this change was forgotten in v2)
>>> v2: rearranged comments
>>>
>>>   arch/powerpc/kernel/module_64.c | 53 
>>> +++--
>>>   1 file changed, 35 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/arch/powerpc/kernel/module_64.c 
>>> b/arch/powerpc/kernel/module_64.c
>>> index c2e1b06253b8..b33a5d5e2d35 100644
>>> --- a/arch/powerpc/kernel/module_64.c
>>> +++ b/arch/powerpc/kernel/module_64.c
>>> @@ -704,18 +711,21 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
>> ...
>>> /*
>>>  * If found, replace it with:
>>>  *  addis r2, r12, (.TOC.-func)@ha
>>>  *  addi r2, r12, (.TOC.-func)@l
>>>  */
>>> -   ((uint32_t *)location)[0] = 0x3c4c + PPC_HA(value);
>>> -   ((uint32_t *)location)[1] = 0x3842 + PPC_LO(value);
>>> +   ((uint32_t *)location)[0] = PPC_INST_ADDIS | 
>>> __PPC_RT(R2) |
>>> +   __PPC_RA(R12) | 
>>> PPC_HA(value);
>>> +   ((uint32_t *)location)[1] = PPC_INST_ADDI | 
>>> __PPC_RT(R2) |
>>> +   __PPC_RA(R12) | 
>>> PPC_LO(value);
>>> break;
>> 
>> This was crashing and it's amazing how long you can stare at a
>> disassembly and not see the difference between `r2` and `r12` :)
>
> Argh, yes. I was misled by the comment I guess. Sorry for that and
> thanks for fixing.

No worries, yes the comment was the problem. I fixed that as well.

cheers


Re: [PATCH v2] powerpc/book3s/mm: Update Oops message to print the correct translation in use

2019-07-12 Thread Christophe Leroy




Le 12/07/2019 à 08:25, Michael Ellerman a écrit :
> "Aneesh Kumar K.V"  writes:
>
>> Avoids confusion when printing Oops message like below
>>
>>   Faulting instruction address: 0xc008bdb4
>>   Oops: Kernel access of bad area, sig: 11 [#1]
>>   LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
>>
>> This was because we never clear the MMU_FTR_HPTE_TABLE feature flag
>> even if we run with radix translation. It was discussed that we should
>> look at this feature flag as an indication of the capability to run
>> hash translation and we should not clear the flag even if we run in
>> radix translation. All the code paths check for radix_enabled() check and
>> if found true consider we are running with radix translation. Follow the
>> same sequence for finding the MMU translation string to be used in Oops
>> message.
>>
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>
>> Changes from V1:
>> * Don't clear the HPTE feature flag.
>>
>>   arch/powerpc/kernel/traps.c | 15 +++
>>   1 file changed, 11 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
>> index 11caa0291254..b181d6860f28 100644
>> --- a/arch/powerpc/kernel/traps.c
>> +++ b/arch/powerpc/kernel/traps.c
>> @@ -250,15 +250,22 @@ static void oops_end(unsigned long flags, struct pt_regs *regs,
>>   }
>>   NOKPROBE_SYMBOL(oops_end);
>>
>> +static char *get_mmu_str(void)
>> +{
>> +   if (early_radix_enabled())
>> +   return " MMU=Radix";
>> +   if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
>> +   return " MMU=Hash";
>> +   return "";
>> +}
>
> We don't change MMU once we're up, so just do this logic once and stash
> it into a static string, rather than rechecking on every oops.

Do we really have oops so often that we have to worry about that ?

Christophe

> cheers
>
>>   static int __die(const char *str, struct pt_regs *regs, long err)
>>   {
>> printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
>>
>> -	printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n",
>> +   printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n",
>>    IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
>> -  PAGE_SIZE / 1024,
>> -  early_radix_enabled() ? " MMU=Radix" : "",
>> -  early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ? " MMU=Hash" : "",
>> +  PAGE_SIZE / 1024, get_mmu_str(),
>>    IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
>>    IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
>>    IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
>> --
>> 2.21.0


Re: [PATCH v9 01/10] namei: obey trailing magic-link DAC permissions

2019-07-12 Thread Al Viro
On Fri, Jul 12, 2019 at 05:14:54AM +0100, Al Viro wrote:

> That's not quite guaranteed (it is possible to bind a symlink on top
> of a regular file, and you will get LOOKUP_JUMPED on the entry into
> trailing_symlink() when looking the result up).  Moreover, why bother
> with LOOKUP_JUMPED here?  See that
>   nd->last_type = LAST_BIND;
> several lines prior?  That's precisely to be able to recognize those
> suckers.

... except that this won't work these days (once upon a time it used
to, but that had been a long time ago).  However, that doesn't change
the fact that the test is really wrong.  So let's do it right:

* set a new flag along with LOOKUP_JUMPED in nd_jump_link()
* clear it in get_link() right before
res = READ_ONCE(inode->i_link);
* check it in trailing_symlink() (or callers thereof)

The rest of comments stand, AFAICS.
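
Read as code, the plan above would touch three spots in fs/namei.c roughly
as sketched below. These are fragments, not a complete patch, and the flag
name LOOKUP_LINK_JUMPED is made up here purely for illustration.

    /* 1) nd_jump_link(): remember that we arrived via a magic link. */
    nd->flags |= LOOKUP_JUMPED | LOOKUP_LINK_JUMPED;

    /* 2) get_link(): an ordinary symlink body clears the marker... */
    nd->flags &= ~LOOKUP_LINK_JUMPED;
    res = READ_ONCE(inode->i_link);

    /* 3) trailing_symlink() (or its callers): ...so only magic links
     *    still have it set, and only they get the extra DAC check.
     *    trailing_magiclink_dac_check() is a hypothetical helper.
     */
    if (nd->flags & LOOKUP_LINK_JUMPED)
        error = trailing_magiclink_dac_check(nd);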


Re: [PATCH v4 1/8] KVM: PPC: Ultravisor: Introduce the MSR_S bit

2019-07-12 Thread Michael Ellerman
Nicholas Piggin  writes:

> Claudio Carvalho's on June 29, 2019 6:08 am:
>> From: Sukadev Bhattiprolu 
>> 
>> The ultravisor processor mode is introduced in POWER platforms that
>> support the Protected Execution Facility (PEF). Ultravisor mode is more
>> privileged than hypervisor mode.
>> 
>> In PEF enabled platforms, the MSR_S bit is used to indicate if the
>> thread is in secure state. With the MSR_S bit, the privilege state of
>> the thread is now determined by MSR_S, MSR_HV and MSR_PR, as follows:
>> 
>> S   HV  PR
>> ---
>> 0   x   1   problem
>> 1   0   1   problem
>> x   x   0   privileged
>> x   1   0   hypervisor
>> 1   1   0   ultravisor
>> 1   1   1   reserved
>
> What does this table mean? I thought 'x' meant either, but in that
> case there are several states that can apply to the same
> combination of bits.
>
> Would it be clearer to rearrange the table so the columns are the HV
> and PR bits we know and love, plus the effect of S=1 on each of them?
>
>   HV  PR  S=0 S=1
>   -
>   0   0   privileged  privileged (secure guest kernel)
>   0   1   problem problem (secure guest userspace)
>   1   0   hypervisor  ultravisor
>   1   1   problem reserved
>
> Is that accurate?

I like that format.

cheers


Re: [PATCH v2] powerpc/book3s/mm: Update Oops message to print the correct translation in use

2019-07-12 Thread Michael Ellerman
"Aneesh Kumar K.V"  writes:

> Avoids confusion when printing Oops message like below
>
>  Faulting instruction address: 0xc008bdb4
>  Oops: Kernel access of bad area, sig: 11 [#1]
>  LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
>
> This was because we never clear the MMU_FTR_HPTE_TABLE feature flag
> even if we run with radix translation. It was discussed that we should
> look at this feature flag as an indication of the capability to run
> hash translation and we should not clear the flag even if we run in
> radix translation. All the code paths check for radix_enabled() check and
> if found true consider we are running with radix translation. Follow the
> same sequence for finding the MMU translation string to be used in Oops
> message.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>
> Changes from V1:
> * Don't clear the HPTE feature flag.
>
>  arch/powerpc/kernel/traps.c | 15 +++
>  1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 11caa0291254..b181d6860f28 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -250,15 +250,22 @@ static void oops_end(unsigned long flags, struct 
> pt_regs *regs,
>  }
>  NOKPROBE_SYMBOL(oops_end);
>  
> +static char *get_mmu_str(void)
> +{
> + if (early_radix_enabled())
> + return " MMU=Radix";
> + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
> + return " MMU=Hash";
> + return "";
> +}

We don't change MMU once we're up, so just do this logic once and stash
it into a static string, rather than rechecking on every oops.
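
A minimal sketch of that suggestion, purely illustrative (the helper name
setup_mmu_str is made up here, and the eventual patch may do this
differently):

    /* Compute the MMU string once at boot instead of on every oops. */
    static const char *mmu_str = "";

    static int __init setup_mmu_str(void)
    {
        if (early_radix_enabled())
            mmu_str = " MMU=Radix";
        else if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
            mmu_str = " MMU=Hash";
        return 0;
    }
    early_initcall(setup_mmu_str);

__die() would then print mmu_str directly rather than calling get_mmu_str().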

cheers

>  static int __die(const char *str, struct pt_regs *regs, long err)
>  {
>   printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
>  
> - printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n",
> + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n",
>  IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
> -PAGE_SIZE / 1024,
> -early_radix_enabled() ? " MMU=Radix" : "",
> -early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ? " MMU=Hash" : "",
> +PAGE_SIZE / 1024, get_mmu_str(),
>  IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
>  IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
>  IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
> -- 
> 2.21.0