Re: [PATCH 4/4] mm: numa: Slow PTE scan rate if migration failures occur

2015-03-10 Thread Dave Chinner
On Mon, Mar 09, 2015 at 09:52:18AM -0700, Linus Torvalds wrote:
 On Mon, Mar 9, 2015 at 4:29 AM, Dave Chinner da...@fromorbit.com wrote:
 
  Also, is there some sane way for me to actually see this behavior on a
  regular machine with just a single socket? Dave is apparently running
  in some fake-numa setup, I'm wondering if this is easy enough to
  reproduce that I could see it myself.
 
  Should be - I don't actually use 500TB of storage to generate this -
  50GB on an SSD is all you need from the storage side. I just use a
  sparse backing file to make it look like a 500TB device. :P
 
 What's your virtual environment setup? Kernel config, and
 virtualization environment to actually get that odd fake NUMA thing
 happening?

I don't have the exact .config with me (test machines at home
are shut down because I'm half a world away), but it's pretty much
this (copied and munged from a similar test vm on my laptop):

$ cat run-vm-4.sh
sudo qemu-system-x86_64 \
-machine accel=kvm \
-no-fd-bootchk \
-localtime \
-boot c \
-serial pty \
-nographic \
-alt-grab \
-smp 16 -m 16384 \
-hda /data/vm-2/root.img \
-drive file=/vm/vm-4/vm-4-test.img,if=virtio,cache=none \
-drive file=/vm/vm-4/vm-4-scratch.img,if=virtio,cache=none \
-drive file=/vm/vm-4/vm-4-500TB.img,if=virtio,cache=none \
-kernel /vm/vm-4/vmlinuz \
-append console=ttyS0,115200 root=/dev/sda1,numa=fake=4
$

And on the host I have /vm on a ssd that is an XFS filesystem, and
I've created /vm/vm-4/vm-4-500TB.img by doing:

$ xfs_io -f -c truncate 500t -c extsize 1m /vm/vm-4/vm-4-500TB.img

and in the guest the filesystem is created with:

# mkfs.xfs -f -mcrc=1,finobt=1 /dev/vdc

And that will create a 500TB filesystem that you can then mount and
run fsmark on it, then unmount and run xfs_repair on it.

the .config I have on my laptop is from 3.18-rc something, but it
should work just with a make oldconfig update. I'ts attached below.

Hopefully this will be sufficient for you, otherwise it'll have to
wait until I get home to get the exact configs for you.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
#
# Automatically generated file; DO NOT EDIT.
# Linux/x86 3.18.0-rc1 Kernel Configuration
#
CONFIG_64BIT=y
CONFIG_X86_64=y
CONFIG_X86=y
CONFIG_INSTRUCTION_DECODER=y
CONFIG_OUTPUT_FORMAT=elf64-x86-64
CONFIG_ARCH_DEFCONFIG=arch/x86/configs/x86_64_defconfig
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_HAVE_LATENCYTOP_SUPPORT=y
CONFIG_MMU=y
CONFIG_NEED_DMA_MAP_STATE=y
CONFIG_NEED_SG_DMA_LENGTH=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_BUG=y
CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y
CONFIG_ARCH_WANT_GENERAL_HUGETLB=y
CONFIG_ZONE_DMA32=y
CONFIG_AUDIT_ARCH=y
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_X86_64_SMP=y
CONFIG_X86_HT=y
CONFIG_ARCH_HWEIGHT_CFLAGS=-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx 
-fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 
-fcall-saved-r11
CONFIG_ARCH_SUPPORTS_UPROBES=y
CONFIG_FIX_EARLYCON_MEM=y
CONFIG_DEFCONFIG_LIST=/lib/modules/$UNAME_RELEASE/.config
CONFIG_IRQ_WORK=y
CONFIG_BUILDTIME_EXTABLE_SORT=y

#
# General setup
#
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_CROSS_COMPILE=
# CONFIG_COMPILE_TEST is not set
CONFIG_LOCALVERSION=
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_HAVE_KERNEL_GZIP=y
CONFIG_HAVE_KERNEL_BZIP2=y
CONFIG_HAVE_KERNEL_LZMA=y
CONFIG_HAVE_KERNEL_XZ=y
CONFIG_HAVE_KERNEL_LZO=y
CONFIG_HAVE_KERNEL_LZ4=y
CONFIG_KERNEL_GZIP=y
# CONFIG_KERNEL_BZIP2 is not set
# CONFIG_KERNEL_LZMA is not set
# CONFIG_KERNEL_XZ is not set
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
CONFIG_DEFAULT_HOSTNAME=(none)
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_CROSS_MEMORY_ATTACH=y
# CONFIG_FHANDLE is not set
CONFIG_USELIB=y
CONFIG_AUDIT=y
CONFIG_HAVE_ARCH_AUDITSYSCALL=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_WATCH=y
CONFIG_AUDIT_TREE=y

#
# IRQ subsystem
#
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ=y
CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_IRQ_DOMAIN=y
# CONFIG_IRQ_DOMAIN_DEBUG is not set
CONFIG_IRQ_FORCED_THREADING=y
CONFIG_SPARSE_IRQ=y
CONFIG_CLOCKSOURCE_WATCHDOG=y
CONFIG_ARCH_CLOCKSOURCE_DATA=y
CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y
CONFIG_GENERIC_CMOS_UPDATE=y

#
# 

Re: [RFC PATCH 1/2] powerpc: Add a proper syscall for switching endianness

2015-03-10 Thread Benjamin Herrenschmidt
On Tue, 2015-03-10 at 18:36 +1100, Michael Ellerman wrote:
 We currently have a special syscall for switching endianness. This is
 syscall number 0x1ebe, which is handled explicitly in the 64-bit syscall
 exception entry.
 
 That has a few problems, firstly the syscall number is outside of the
 usual range, which confuses various tools. For example strace doesn't
 recognise the syscalls at all.
 
 Secondly it's handled explicitly as a special case in the syscall
 exception entry, which is complicated enough without it.
 
 As a first step toward removing the special syscall, we need to add a
 regular syscall that implements the same functionality.
 
 The logic is simple, it simply toggles the MSR_LE bit in the userspace
 MSR. This is the same as the special syscall, with the caveat that the
 special syscall clobbers fewer registers.

You can set _TIF_RESTOREALL to force a restore of all the registers on
the way back which should do the job.

Cheers,
Ben.

 
 Signed-off-by: Michael Ellerman m...@ellerman.id.au
 ---
  arch/powerpc/include/asm/systbl.h  | 1 +
  arch/powerpc/include/asm/unistd.h  | 2 +-
  arch/powerpc/include/uapi/asm/unistd.h | 1 +
  arch/powerpc/kernel/syscalls.c | 7 +++
  4 files changed, 10 insertions(+), 1 deletion(-)
 
 diff --git a/arch/powerpc/include/asm/systbl.h 
 b/arch/powerpc/include/asm/systbl.h
 index 91062eef582f..524bf5dff9f9 100644
 --- a/arch/powerpc/include/asm/systbl.h
 +++ b/arch/powerpc/include/asm/systbl.h
 @@ -367,3 +367,4 @@ SYSCALL_SPU(getrandom)
  SYSCALL_SPU(memfd_create)
  SYSCALL_SPU(bpf)
  COMPAT_SYS(execveat)
 +SYSCALL(switch_endian)
 diff --git a/arch/powerpc/include/asm/unistd.h 
 b/arch/powerpc/include/asm/unistd.h
 index 36b79c31eedd..f4f8b667d75b 100644
 --- a/arch/powerpc/include/asm/unistd.h
 +++ b/arch/powerpc/include/asm/unistd.h
 @@ -12,7 +12,7 @@
  #include uapi/asm/unistd.h
  
 
 -#define __NR_syscalls363
 +#define __NR_syscalls364
  
  #define __NR__exit __NR_exit
  #define NR_syscalls  __NR_syscalls
 diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
 b/arch/powerpc/include/uapi/asm/unistd.h
 index ef5b5b1f3123..e4aa173dae62 100644
 --- a/arch/powerpc/include/uapi/asm/unistd.h
 +++ b/arch/powerpc/include/uapi/asm/unistd.h
 @@ -385,5 +385,6 @@
  #define __NR_memfd_create360
  #define __NR_bpf 361
  #define __NR_execveat362
 +#define __NR_switch_endian   363
  
  #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
 diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
 index b2702e87db0d..0ea01957990e 100644
 --- a/arch/powerpc/kernel/syscalls.c
 +++ b/arch/powerpc/kernel/syscalls.c
 @@ -121,3 +121,10 @@ long ppc_fadvise64_64(int fd, int advice, u32 
 offset_high, u32 offset_low,
   return sys_fadvise64(fd, (u64)offset_high  32 | offset_low,
(u64)len_high  32 | len_low, advice);
  }
 +
 +long sys_switch_endian(void)
 +{
 + current-thread.regs-msr ^= MSR_LE;
 +
 + return 0;
 +}


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] powerpc: book3e_64: fix the align size for paca_struct

2015-03-10 Thread Kevin Hao
On Mon, Mar 09, 2015 at 06:31:25PM +1100, Benjamin Herrenschmidt wrote:
 On Mon, 2015-03-09 at 09:13 +0800, Kevin Hao wrote:
  On Sun, Mar 08, 2015 at 08:13:26PM +1100, Benjamin Herrenschmidt wrote:
   On Sat, 2015-03-07 at 19:14 +0800, Kevin Hao wrote:
All the cache line size of the current book3e 64bit SoCs are 64 bytes.
So we should use this size to align the member of paca_struct.
With this change we save 192 bytes. Also change it to __aligned(size)
since it is preferred over __attribute__((aligned(size))).
   
   Why should we favor the book3e CPUs over the book3s ones ?
  
  Why do you think so? This only change the align size of the paca_struct's
  members which are private to book3e CPUs, and should not have any effect
  to book3s ones. Did I miss something?
 
 No, your explanation was lacking that important detail :-)

Sorry for the confusion, I will add this information in the commit log.

Thanks,
Kevin


pgpNhzA4BHwp3.pgp
Description: PGP signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH 1/2] powerpc: Add a proper syscall for switching endianness

2015-03-10 Thread Michael Ellerman
On Tue, 2015-03-10 at 20:34 +1100, Benjamin Herrenschmidt wrote:
 On Tue, 2015-03-10 at 18:36 +1100, Michael Ellerman wrote:
  We currently have a special syscall for switching endianness. This is
  syscall number 0x1ebe, which is handled explicitly in the 64-bit syscall
  exception entry.
  
  That has a few problems, firstly the syscall number is outside of the
  usual range, which confuses various tools. For example strace doesn't
  recognise the syscalls at all.
  
  Secondly it's handled explicitly as a special case in the syscall
  exception entry, which is complicated enough without it.
  
  As a first step toward removing the special syscall, we need to add a
  regular syscall that implements the same functionality.
  
  The logic is simple, it simply toggles the MSR_LE bit in the userspace
  MSR. This is the same as the special syscall, with the caveat that the
  special syscall clobbers fewer registers.
 
 You can set _TIF_RESTOREALL to force a restore of all the registers on
 the way back which should do the job.

Right, I'd forgotten we talked about that.

I'll try that tomorrow.

cheers


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 4/4] mm: numa: Slow PTE scan rate if migration failures occur

2015-03-10 Thread Mel Gorman
On Mon, Mar 09, 2015 at 09:02:19PM +, Mel Gorman wrote:
 On Sun, Mar 08, 2015 at 08:40:25PM +, Mel Gorman wrote:
   Because if the answer is 'yes', then we can safely say: 'we regressed 
   performance because correctness [not dropping dirty bits] comes before 
   performance'.
   
   If the answer is 'no', then we still have a mystery (and a regression) 
   to track down.
   
   As a second hack (not to be applied), could we change:
   
#define _PAGE_BIT_PROTNONE  _PAGE_BIT_GLOBAL
   
   to:
   
#define _PAGE_BIT_PROTNONE  (_PAGE_BIT_GLOBAL+1)
   
  
  In itself, that's not enough. The SWP_OFFSET_SHIFT would also need updating
  as a partial revert of 21d9ee3eda7792c45880b2f11bff8e95c9a061fb but it
  can be done.
  
 
 More importantily, _PAGE_BIT_GLOBAL+1 == the special PTE bit so just
 updating the value should crash. For the purposes of testing the idea, I
 thought the straight-forward option was to break soft dirty page tracking
 and steal their bit for testing (patch below). Took most of the day to
 get access to the test machine so tests are not long running and only
 the autonuma one has completed;
 

And the xfsrepair workload also does not show any benefit from using a
different bit either

   3.19.0 4.0.0-rc1 
4.0.0-rc1 4.0.0-rc1
  vanilla   vanilla 
slowscan-v2r7protnone-v3r17
Min  real-fsmark1164.44 (  0.00%) 1157.41 (  0.60%) 1150.38 
(  1.21%) 1173.22 ( -0.75%)
Min  syst-fsmark4016.12 (  0.00%) 3998.06 (  0.45%) 3988.42 
(  0.69%) 4037.90 ( -0.54%)
Min  real-xfsrepair  442.64 (  0.00%)  497.64 (-12.43%)  456.87 
( -3.21%)  489.60 (-10.61%)
Min  syst-xfsrepair  194.97 (  0.00%)  500.61 (-156.76%)  
263.41 (-35.10%)  544.56 (-179.30%)
Ameanreal-fsmark1166.28 (  0.00%) 1166.63 ( -0.03%) 1155.97 
(  0.88%) 1183.19 ( -1.45%)
Ameansyst-fsmark4025.87 (  0.00%) 4020.94 (  0.12%) 4004.19 
(  0.54%) 4061.64 ( -0.89%)
Ameanreal-xfsrepair  447.66 (  0.00%)  507.85 (-13.45%)  459.58 
( -2.66%)  498.71 (-11.40%)
Ameansyst-xfsrepair  202.93 (  0.00%)  519.88 (-156.19%)  
281.63 (-38.78%)  569.21 (-180.50%)
Stddev   real-fsmark   1.44 (  0.00%)6.55 (-354.10%)
3.97 (-175.65%)9.20 (-537.90%)
Stddev   syst-fsmark   9.76 (  0.00%)   16.22 (-66.27%)   15.09 
(-54.69%)   17.47 (-79.13%)
Stddev   real-xfsrepair5.57 (  0.00%)   11.17 (-100.68%)
3.41 ( 38.66%)6.77 (-21.63%)
Stddev   syst-xfsrepair5.69 (  0.00%)   13.98 (-145.78%)   
19.94 (-250.49%)   20.03 (-252.05%)
CoeffVar real-fsmark   0.12 (  0.00%)0.56 (-353.96%)
0.34 (-178.11%)0.78 (-528.79%)
CoeffVar syst-fsmark   0.24 (  0.00%)0.40 (-66.48%)0.38 
(-55.53%)0.43 (-77.55%)
CoeffVar real-xfsrepair1.24 (  0.00%)2.20 (-76.89%)0.74 
( 40.25%)1.36 ( -9.17%)
CoeffVar syst-xfsrepair2.80 (  0.00%)2.69 (  4.06%)7.08 
(-152.54%)3.52 (-25.51%)
Max  real-fsmark1167.96 (  0.00%) 1171.98 ( -0.34%) 1159.25 
(  0.75%) 1195.41 ( -2.35%)
Max  syst-fsmark4039.20 (  0.00%) 4033.84 (  0.13%) 4024.53 
(  0.36%) 4079.45 ( -1.00%)
Max  real-xfsrepair  455.42 (  0.00%)  523.40 (-14.93%)  464.40 
( -1.97%)  505.82 (-11.07%)
Max  syst-xfsrepair  207.94 (  0.00%)  533.37 (-156.50%)  
309.38 (-48.78%)  593.62 (-185.48%)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC V2 03/12] i2c: at91: make use of the new infrastructure for quirks

2015-03-10 Thread Ludovic Desroches
Hi Wolfram,

You can add my 

Acked-by and Tested-By: Ludovic Desroches ludovic.desroc...@atmel.com

Tested on sama5d3, some problems with at24 eeprom on sama5d4 but it
doesn't come from the i2c quirks patch series.

Regards

Ludovic

On Sun, Mar 08, 2015 at 09:28:45AM +0100, Wolfram Sang wrote:
 On Wed, Feb 25, 2015 at 05:01:54PM +0100, Wolfram Sang wrote:
  From: Wolfram Sang wsa+rene...@sang-engineering.com
  
  Signed-off-by: Wolfram Sang wsa+rene...@sang-engineering.com
 
 Hi Ludovic,
 
 if you have a few minutes, could you please test this series? I'd like to
 include it in 4.1. and because at91 is using the quirk infrastructure in
 a more complex way, it is a really good test candidate.
 
 Thanks,
 
Wolfram
 
  ---
   drivers/i2c/busses/i2c-at91.c | 32 +++-
   1 file changed, 11 insertions(+), 21 deletions(-)
  
  diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c
  index 636fd2efad8850..b3a70e8fc653c5 100644
  --- a/drivers/i2c/busses/i2c-at91.c
  +++ b/drivers/i2c/busses/i2c-at91.c
  @@ -487,30 +487,10 @@ static int at91_twi_xfer(struct i2c_adapter *adap, 
  struct i2c_msg *msg, int num)
  if (ret  0)
  goto out;
   
  -   /*
  -* The hardware can handle at most two messages concatenated by a
  -* repeated start via it's internal address feature.
  -*/
  -   if (num  2) {
  -   dev_err(dev-dev,
  -   cannot handle more than two concatenated messages.\n);
  -   ret = 0;
  -   goto out;
  -   } else if (num == 2) {
  +   if (num == 2) {
  int internal_address = 0;
  int i;
   
  -   if (msg-flags  I2C_M_RD) {
  -   dev_err(dev-dev, first transfer must be write.\n);
  -   ret = -EINVAL;
  -   goto out;
  -   }
  -   if (msg-len  3) {
  -   dev_err(dev-dev, first message size must be = 3.\n);
  -   ret = -EINVAL;
  -   goto out;
  -   }
  -
  /* 1st msg is put into the internal address, start with 2nd */
  m_start = msg[1];
  for (i = 0; i  msg-len; ++i) {
  @@ -540,6 +520,15 @@ out:
  return ret;
   }
   
  +/*
  + * The hardware can handle at most two messages concatenated by a
  + * repeated start via it's internal address feature.
  + */
  +static struct i2c_adapter_quirks at91_twi_quirks = {
  +   .flags = I2C_AQ_COMB | I2C_AQ_COMB_WRITE_FIRST | I2C_AQ_COMB_SAME_ADDR,
  +   .max_comb_1st_msg_len = 3,
  +};
  +
   static u32 at91_twi_func(struct i2c_adapter *adapter)
   {
  return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL
  @@ -777,6 +766,7 @@ static int at91_twi_probe(struct platform_device *pdev)
  dev-adapter.owner = THIS_MODULE;
  dev-adapter.class = I2C_CLASS_DEPRECATED;
  dev-adapter.algo = at91_twi_algorithm;
  +   dev-adapter.quirks = at91_twi_quirks;
  dev-adapter.dev.parent = dev-dev;
  dev-adapter.nr = pdev-id;
  dev-adapter.timeout = AT91_I2C_TIMEOUT;
  -- 
  2.1.4
  


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2] powerpc: book3e_64: fix the align size for paca_struct

2015-03-10 Thread Kevin Hao
All the cache line size of the current book3e 64bit SoCs are 64 bytes.
So we should use this size to align the member of paca_struct.
This only change the paca_struct's members which are private to book3e
CPUs, and should not have any effect to book3s ones. With this, we save
192 bytes. Also change it to __aligned(size) since it is preferred over
__attribute__((aligned(size))).

Before:
/* size: 1920, cachelines: 30, members: 46 */
/* sum members: 1667, holes: 6, sum holes: 141 */
/* padding: 112 */

After:
/* size: 1728, cachelines: 27, members: 46 */
/* sum members: 1667, holes: 4, sum holes: 13 */
/* padding: 48 */

Signed-off-by: Kevin Hao haoke...@gmail.com
---
v2: Only update the commit log.

 arch/powerpc/include/asm/paca.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index e5f22c6c4bf9..70bd4381f8e6 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -106,9 +106,9 @@ struct paca_struct {
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
 #ifdef CONFIG_PPC_BOOK3E
-   u64 exgen[8] __attribute__((aligned(0x80)));
+   u64 exgen[8] __aligned(0x40);
/* Keep pgd in the same cacheline as the start of extlb */
-   pgd_t *pgd __attribute__((aligned(0x80))); /* Current PGD */
+   pgd_t *pgd __aligned(0x40); /* Current PGD */
pgd_t *kernel_pgd;  /* Kernel PGD */
 
/* Shared by all threads of a core -- points to tcd of first thread */
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] powerpc64: use fixed lock token for !CONFIG_PPC_SPLPAR

2015-03-10 Thread Kevin Hao
On Tue, Mar 10, 2015 at 11:15:18AM +1100, Michael Ellerman wrote:
 On Mon, 2015-03-09 at 17:53 +1100, Benjamin Herrenschmidt wrote:
  On Sat, 2015-03-07 at 19:19 +0800, Kevin Hao wrote:
   It makes no sense to use a variant lock token on a platform which
   doesn't support for shared-processor logical partitions. Actually we
   can eliminate a memory load by using a fixed lock token on these
   platforms.
  
  Does this provide an actual measurable benefit ? I found that the lock
  token was quite handy for debugging ...
 
 Yeah. It can be very useful to know which cpu holds a lock when you get into a
 dead lock.
 
 Unless you can show this is a performance boost I'm inclined to leave it 
 as-is.

I am not sure if there is more suitable benchmark for spinlock. I tried the
perf locking benchmark got from here [1]. The test was running on a t4240rdb
board with xfs rootfs. I have run the following commands four times before and
after applying this patch.
./perf record ./perf bench locking vfs; ./perf report

Before:
3.06%  locking-vfs  [kernel.kallsyms]   [k] ._raw_spin_lock
3.04%  locking-vfs  [kernel.kallsyms]   [k] ._raw_spin_lock
3.03%  locking-vfs  [kernel.kallsyms]   [k] ._raw_spin_lock
3.00%  locking-vfs  [kernel.kallsyms]   [k] ._raw_spin_lock

After:
3.05%  locking-vfs  [kernel.kallsyms]   [k] ._raw_spin_lock 
2.97%  locking-vfs  [kernel.kallsyms]   [k] ._raw_spin_lock
2.96%  locking-vfs  [kernel.kallsyms]   [k] ._raw_spin_lock
2.97%  locking-vfs  [kernel.kallsyms]   [k] ._raw_spin_lock

[1] http://lkml.iu.edu/hypermail/linux/kernel/1412.1/01419.html

Thanks,
Kevin


pgpsAPyXngmuG.pgp
Description: PGP signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 03/15] fbdev: aty128fb: replace PPC_OF with PPC

2015-03-10 Thread Tomi Valkeinen
On 27/02/15 03:05, Kevin Hao wrote:
 On Fri, Feb 27, 2015 at 11:11:15AM +1100, Benjamin Herrenschmidt wrote:
 On Sat, 2015-01-31 at 21:47 +0800, Kevin Hao wrote:
 The PPC_OF is a ppc specific option which is used to mean that the
 firmware device tree access functions are available. Since all the
 ppc platforms have a device tree, it is aways set to 'y' for ppc.
 So it makes no sense to keep a such option in the current kernel.
 Replace it with PPC.

 Signed-off-by: Kevin Hao haoke...@gmail.com

 For this and generally the whole series,

 Acked-by: Benjamin Herrenschmidt b...@kernel.crashing.org
 
 Thanks Ben.
 

 Which tree do we expect this to go through ?
 
 I just sent out a v2 [1] a few hours earlier with some minor updates. We plan
 to merge this patch series via the powerpc tree in 4.1 cycle if I can collect
 all the acks from the corresponding driver maintainers.

Fbdev changes look ok to me.

 Tomi




signature.asc
Description: OpenPGP digital signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/4] powerpc/eeh: fix powernv_eeh_wait_state delay logic

2015-03-10 Thread Gavin Shan
On Mon, Mar 09, 2015 at 11:17:31AM +0800, Wei Yang wrote:
As the comment indicates, powernv_eeh_get_state() will inform EEH core to
delay 1 second. This means the delay doesn't happen when
powernv_eeh_get_state() returns.

This patch moves the delay subtraction just before msleep(), which is the
same logic in pseries_eeh_wait_state().

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com

The patch would conflict with the patches to remove EEH chip layer done
in https://patchwork.ozlabs.org/patch/439956/. Could you please repost
with corrected function names in commit log when the dependent patches
show up in Michael's next branch?

Except that, the changes look ok to me:

Acked-by: Gavin Shan gws...@linux.vnet.ibm.com

Thanks,
Gavin

---
 arch/powerpc/platforms/powernv/eeh-powernv.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 5abb4c2..af1be8f 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -334,13 +334,13 @@ static int powernv_eeh_wait_state(struct eeh_pe *pe, int 
max_wait)
   if (ret != EEH_STATE_UNAVAILABLE)
   return ret;

-  max_wait -= mwait;
   if (max_wait = 0) {
   pr_warn(%s: Timeout getting PE#%x's state (%d)\n,
   __func__, pe-addr, max_wait);
   return EEH_STATE_NOT_SUPPORT;
   }

+  max_wait -= mwait;
   msleep(mwait);
   }

-- 
1.7.9.5


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH 1/2] powerpc: Add a proper syscall for switching endianness

2015-03-10 Thread Benjamin Herrenschmidt
On Wed, 2015-03-11 at 11:08 +0530, Anshuman Khandual wrote:
 On 03/10/2015 04:25 PM, Michael Ellerman wrote:
  On Tue, 2015-03-10 at 20:34 +1100, Benjamin Herrenschmidt wrote:
  On Tue, 2015-03-10 at 18:36 +1100, Michael Ellerman wrote:
  We currently have a special syscall for switching endianness. This is
  syscall number 0x1ebe, which is handled explicitly in the 64-bit syscall
  exception entry.
 
  That has a few problems, firstly the syscall number is outside of the
  usual range, which confuses various tools. For example strace doesn't
  recognise the syscalls at all.
 
  Secondly it's handled explicitly as a special case in the syscall
  exception entry, which is complicated enough without it.
 
  As a first step toward removing the special syscall, we need to add a
  regular syscall that implements the same functionality.
 
  The logic is simple, it simply toggles the MSR_LE bit in the userspace
  MSR. This is the same as the special syscall, with the caveat that the
  special syscall clobbers fewer registers.
 
  You can set _TIF_RESTOREALL to force a restore of all the registers on
  the way back which should do the job.
  
  Right, I'd forgotten we talked about that.
  
  I'll try that tomorrow.
 
 The test fails when we add set_thread_flag(TIF_RESTOREALL) after the MSR flip.
 Though the test passes with the original patch.

We also need to wrap the syscall like we do for fork() etc... to
save_nvgpr's.

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v12 15/21] powerpc/powernv: Reserve additional space for IOV BAR according to the number of total_pe

2015-03-10 Thread Bjorn Helgaas
On Mon, Mar 02, 2015 at 03:41:32PM +0800, Wei Yang wrote:
 On Tue, Feb 24, 2015 at 02:52:34AM -0600, Bjorn Helgaas wrote:
 On Tue, Feb 24, 2015 at 02:34:42AM -0600, Bjorn Helgaas wrote:
  From: Wei Yang weiy...@linux.vnet.ibm.com
  
  On PHB3, PF IOV BAR will be covered by M64 window to have better PE
  isolation.  The total_pe number is usually different from total_VFs, which
  can lead to a conflict between MMIO space and the PE number.
  
  For example, if total_VFs is 128 and total_pe is 256, the second half of
  M64 window will be part of other PCI device, which may already belong
  to other PEs.
 
 I'm still trying to wrap my mind around the explanation here.
 
 I *think* what's going on is that the M64 window must be a power-of-two
 size.  If the VF(n) BAR space doesn't completely fill it, we might allocate
 the leftover space to another device.  Then the M64 window for *this*
 device may cause the other device to be associated with a PE it didn't
 expect.
 
 Yes, this is the exact reason.

Can you include some of this text in your changelog, then?  I can wordsmith
it and try to make it fit together better.

  +#ifdef CONFIG_PCI_IOV
  +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
  +{
  +  struct pci_controller *hose;
  +  struct pnv_phb *phb;
  +  struct resource *res;
  +  int i;
  +  resource_size_t size;
  +  struct pci_dn *pdn;
  +
  +  if (!pdev-is_physfn || pdev-is_added)
  +  return;
  +
  +  hose = pci_bus_to_host(pdev-bus);
  +  phb = hose-private_data;
  +
  +  pdn = pci_get_pdn(pdev);
  +  pdn-max_vfs = 0;
  +
  +  for (i = 0; i  PCI_SRIOV_NUM_BARS; i++) {
  +  res = pdev-resource[i + PCI_IOV_RESOURCES];
  +  if (!res-flags || res-parent)
  +  continue;
  +  if (!pnv_pci_is_mem_pref_64(res-flags)) {
  +  dev_warn(pdev-dev, Skipping expanding VF BAR%d: 
  %pR\n,
  +   i, res);
  +  continue;
  +  }
  +
  +  dev_dbg(pdev-dev,  Fixing VF BAR%d: %pR to\n, i, res);
  +  size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
  +  res-end = res-start + size * phb-ioda.total_pe - 1;
  +  dev_dbg(pdev-dev,%pR\n, res);
  +  dev_info(pdev-dev, VF BAR%d: %pR (expanded to %d VFs for PE 
  alignment),
  +  i, res, phb-ioda.total_pe);
  +  }
  +  pdn-max_vfs = phb-ioda.total_pe;
  +}
  +
  +static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
  +{
  +  struct pci_dev *pdev;
  +  struct pci_bus *b;
  +
  +  list_for_each_entry(pdev, bus-devices, bus_list) {
  +  b = pdev-subordinate;
  +
  +  if (b)
  +  pnv_pci_ioda_fixup_sriov(b);
  +
  +  pnv_pci_ioda_fixup_iov_resources(pdev);
 
 I'm not sure this happens at the right time.  We have this call chain:
 
   pcibios_scan_phb
 pci_create_root_bus
 pci_scan_child_bus
 pnv_pci_ioda_fixup_sriov
   pnv_pci_ioda_fixup_iov_resources
  for (i = 0; i  PCI_SRIOV_NUM_BARS; i++)
increase res-size to accomodate 256 PEs (or roundup(totalVFs)
 
 so we only do the fixup_iov_resources() when we scan the PHB, and we
 wouldn't do it at all for hot-added devices.
 
 Yep, you are right :-)
 
 I had a separate patch to do this in pcibios_add_pci_devices(). Looks we could
 merge them.

Did you fix this in v13?  I don't see the change if you did.

  +  }
  +}
  +#endif /* CONFIG_PCI_IOV */
  +
   /*
* This function is supposed to be called on basis of PE from top
* to bottom style. So the the I/O or MMIO segment assigned to
  @@ -2125,6 +2180,9 @@ static void __init pnv_pci_init_ioda_phb(struct 
  device_node *np,
 ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
 ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
 ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
  +#ifdef CONFIG_PCI_IOV
  +  ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
  +#endif /* CONFIG_PCI_IOV */
 pci_add_flags(PCI_REASSIGN_ALL_RSRC);
   
 /* Reset IODA tables to a clean state */
  
 
 -- 
 Richard Yang
 Help you, Help me
 
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 1/4] powerpc/eeh: fix start/end/flags type in struct pci_io_addr_range{}

2015-03-10 Thread Gavin Shan
On Mon, Mar 09, 2015 at 11:17:29AM +0800, Wei Yang wrote:
struct pci_io_addr_range{} stores the information of pci resources. It
would be better to keep these related fields have the same type as in
struct resource{}.

This patch fixes the start/end/flags type in struct pci_io_addr_range{} to
have the same type as in struct resource{}.

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com

Acked-by: Gavin Shan gws...@linux.vnet.ibm.com

Thanks,
Gavin

---
 arch/powerpc/kernel/eeh_cache.c |   16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 07d8a24..a30ed67 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -48,11 +48,11 @@
  */
 struct pci_io_addr_range {
   struct rb_node rb_node;
-  unsigned long addr_lo;
-  unsigned long addr_hi;
+  resource_size_t addr_lo;
+  resource_size_t addr_hi;
   struct eeh_dev *edev;
   struct pci_dev *pcidev;
-  unsigned int flags;
+  unsigned long flags;
 };

 static struct pci_io_addr_cache {
@@ -125,8 +125,8 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache 
*cache)

 /* Insert address range into the rb tree. */
 static struct pci_io_addr_range *
-eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
-unsigned long ahi, unsigned int flags)
+eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
+resource_size_t ahi, unsigned long flags)
 {
   struct rb_node **p = pci_io_addr_cache_root.rb_root.rb_node;
   struct rb_node *parent = NULL;
@@ -200,9 +200,9 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev 
*dev)

   /* Walk resources on this device, poke them into the tree */
   for (i = 0; i  DEVICE_COUNT_RESOURCE; i++) {
-  unsigned long start = pci_resource_start(dev,i);
-  unsigned long end = pci_resource_end(dev,i);
-  unsigned int flags = pci_resource_flags(dev,i);
+  resource_size_t start = pci_resource_start(dev,i);
+  resource_size_t end = pci_resource_end(dev,i);
+  unsigned long flags = pci_resource_flags(dev,i);

   /* We are interested only bus addresses, not dma or other stuff 
 */
   if (0 == (flags  (IORESOURCE_IO | IORESOURCE_MEM)))
-- 
1.7.9.5


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v12 14/21] powerpc/powernv: Allocate struct pnv_ioda_pe iommu_table dynamically

2015-03-10 Thread Bjorn Helgaas
On Mon, Mar 02, 2015 at 03:50:37PM +0800, Wei Yang wrote:
 On Tue, Feb 24, 2015 at 02:46:53AM -0600, Bjorn Helgaas wrote:
 On Tue, Feb 24, 2015 at 02:34:35AM -0600, Bjorn Helgaas wrote:
  From: Wei Yang weiy...@linux.vnet.ibm.com
  
  Current iommu_table of a PE is a static field.  This will have a problem
  when iommu_free_table() is called.
  
  Allocate iommu_table dynamically.
 
 I'd like a little more explanation about why we're calling
 iommu_free_table() now when we didn't call it before.  Maybe this happens
 when we disable SR-IOV and the VFs go away?
 
 Yes, it is called in disable path.
 
 pcibios_sriov_disable
 pnv_pci_sriov_disable
 pnv_ioda_release_vf_PE
   pnv_pci_ioda2_release_dma_pe
   iommu_free_table--- here it is invoked
 
 
 
 Is there a hotplug remove path where we should also be calling
 iommu_free_table()?
 
 When VF is not introduced, no one calls this on powernv platform.
 
 Each PCI bus is a PE and it has its own iommu table, even a device is
 hotpluged, the iommu table will not be released.

None of this explanation made it into the v13 patch.  And I don't quite
understand it anyway.

Something like Previously the iommu_table had the same lifetime as a
struct pnv_ioda_pe and was embedded in it.  The pnv_ioda_pe was allocated
when XXX and freed when YYY.  This no longer works: we can't allocate the
iommu_table at the same time as the pnv_ioda_pe because XXX, so we allocate
it when XXX and free it when YYY.

Bjorn
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 2/4] powerpc/eeh: fix comment for wait_state()

2015-03-10 Thread Gavin Shan
On Mon, Mar 09, 2015 at 11:17:30AM +0800, Wei Yang wrote:
To retrieve the PCI slot state, EEH driver would set a timeout for that.
While current comment is not aligned to what the code does.

This patch fixes those comments according to the code.

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com

Please repost after https://patchwork.ozlabs.org/patch/439956/ shows up
in PPC next branch. The changes look good to me:

Acked-by: Gavin Shan gws...@linux.vnet.ibm.com

Thanks,
Gavin

---
 arch/powerpc/kernel/eeh_driver.c |2 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |2 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c 
b/arch/powerpc/kernel/eeh_driver.c
index d099540..cc8d1db 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -682,7 +682,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
   eeh_pe_dev_traverse(pe, eeh_report_error, result);

   /* Get the current PCI slot state. This can take a long time,
-   * sometimes over 3 seconds for certain systems.
+   * sometimes over 300 seconds for certain systems.
*/
   rc = eeh_ops-wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
   if (rc  0 || rc == EEH_STATE_NOT_SUPPORT) {
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 7a5021b..5abb4c2 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -313,7 +313,7 @@ static int powernv_eeh_reset(struct eeh_pe *pe, int option)
 /**
  * powernv_eeh_wait_state - Wait for PE state
  * @pe: EEH PE
- * @max_wait: maximal period in microsecond
+ * @max_wait: maximal period in millisecond
  *
  * Wait for the state of associated PE. It might take some time
  * to retrieve the PE's state.
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index a6c7e19..9de46c5 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -538,7 +538,7 @@ static int pseries_eeh_reset(struct eeh_pe *pe, int option)
 /**
  * pseries_eeh_wait_state - Wait for PE state
  * @pe: EEH PE
- * @max_wait: maximal period in microsecond
+ * @max_wait: maximal period in millisecond
  *
  * Wait for the state of associated PE. It might take some time
  * to retrieve the PE's state.
-- 
1.7.9.5


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [REGRESSION in 3.18][PPC] PA Semi fails to boot after: of/base: Fix PowerPC address parsing hack

2015-03-10 Thread Steven Rostedt
On Tue, 10 Mar 2015 11:28:03 +1100
Michael Ellerman m...@ellerman.id.au wrote:

 
 Mine is running:
 
   CFE version PAS-2.0.29 for ATHENA (64bit,MP,BE,PPC)
   Build Date: Mon Jun 30 11:47:25 PDT 2008 (mpl@mitch-1)
 
 Steve is your CFE older than that?

Seems so:

CFE version PAS-2.0.20 for ELECTRA (64bit,MP,BE,PPC)
Build Date: Tue Nov  6 22:35:48 PST 2007 (mpl@mitch-1)

-- Steve
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC V2 04/12] i2c: opal: make use of the new infrastructure for quirks

2015-03-10 Thread Neelesh Gupta

I tested the i2c opal driver after updating the patch as below.
Basically I think we can also support write-then-{read/write}
for the number of messages = 2.
Ben, any issues if we support both write plus read/write in the
opal driver ?

Regards,
Neelesh


 drivers/i2c/busses/i2c-opal.c |   20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/i2c/busses/i2c-opal.c b/drivers/i2c/busses/i2c-opal.c
index 16f90b1..85412ba 100644
--- a/drivers/i2c/busses/i2c-opal.c
+++ b/drivers/i2c/busses/i2c-opal.c
@@ -104,18 +104,8 @@ static int i2c_opal_master_xfer(struct i2c_adapter *adap, 
struct i2c_msg *msgs,
req.buffer_ra = cpu_to_be64(__pa(msgs[0].buf));
break;
case 2:
-   /* For two messages, we basically support only simple
-* smbus transactions of a write plus a read. We might
-* want to allow also two writes but we'd have to bounce
-* the data into a single buffer.
-*/
-   if ((msgs[0].flags  I2C_M_RD) || !(msgs[1].flags  I2C_M_RD))
-   return -EOPNOTSUPP;
-   if (msgs[0].len  4)
-   return -EOPNOTSUPP;
-   if (msgs[0].addr != msgs[1].addr)
-   return -EOPNOTSUPP;
-   req.type = OPAL_I2C_SM_READ;
+   req.type = (msgs[1].flags  I2C_M_RD) ?
+   OPAL_I2C_SM_READ : OPAL_I2C_SM_WRITE;
req.addr = cpu_to_be16(msgs[0].addr);
req.subaddr_sz = msgs[0].len;
for (i = 0; i  msgs[0].len; i++)
@@ -210,6 +200,11 @@ static const struct i2c_algorithm i2c_opal_algo = {
.functionality  = i2c_opal_func,
 };

+static struct i2c_adapter_quirks i2c_opal_quirks = {
+   .flags = I2C_AQ_COMB | I2C_AQ_COMB_WRITE_FIRST | I2C_AQ_COMB_SAME_ADDR,
+   .max_comb_1st_msg_len = 4,
+};
+
 static int i2c_opal_probe(struct platform_device *pdev)
 {
struct i2c_adapter  *adapter;
@@ -232,6 +227,7 @@ static int i2c_opal_probe(struct platform_device *pdev)

adapter-algo = i2c_opal_algo;
adapter-algo_data = (void *)(unsigned long)opal_id;
+   adapter-quirks = i2c_opal_quirks;
adapter-dev.parent = pdev-dev;
adapter-dev.of_node = of_node_get(pdev-dev.of_node);
pname = of_get_property(pdev-dev.of_node, ibm,port-name, NULL);



On 02/25/2015 09:31 PM, Wolfram Sang wrote:

From: Wolfram Sang wsa+rene...@sang-engineering.com

Signed-off-by: Wolfram Sang wsa+rene...@sang-engineering.com
---
  drivers/i2c/busses/i2c-opal.c | 22 +++---
  1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/i2c/busses/i2c-opal.c b/drivers/i2c/busses/i2c-opal.c
index 16f90b1a750894..b2788ecad5b3cb 100644
--- a/drivers/i2c/busses/i2c-opal.c
+++ b/drivers/i2c/busses/i2c-opal.c
@@ -104,17 +104,6 @@ static int i2c_opal_master_xfer(struct i2c_adapter *adap, 
struct i2c_msg *msgs,
req.buffer_ra = cpu_to_be64(__pa(msgs[0].buf));
break;
case 2:
-   /* For two messages, we basically support only simple
-* smbus transactions of a write plus a read. We might
-* want to allow also two writes but we'd have to bounce
-* the data into a single buffer.
-*/
-   if ((msgs[0].flags  I2C_M_RD) || !(msgs[1].flags  I2C_M_RD))
-   return -EOPNOTSUPP;
-   if (msgs[0].len  4)
-   return -EOPNOTSUPP;
-   if (msgs[0].addr != msgs[1].addr)
-   return -EOPNOTSUPP;
req.type = OPAL_I2C_SM_READ;
req.addr = cpu_to_be16(msgs[0].addr);
req.subaddr_sz = msgs[0].len;
@@ -210,6 +199,16 @@ static const struct i2c_algorithm i2c_opal_algo = {
.functionality  = i2c_opal_func,
  };
  
+/* For two messages, we basically support only simple

+ * smbus transactions of a write plus a read. We might
+ * want to allow also two writes but we'd have to bounce
+ * the data into a single buffer.
+ */
+static struct i2c_adapter_quirks i2c_opal_quirks = {
+   .flags = I2C_AQ_COMB_WRITE_THEN_READ,
+   .max_comb_1st_msg_len = 4,
+};
+
  static int i2c_opal_probe(struct platform_device *pdev)
  {
struct i2c_adapter  *adapter;
@@ -232,6 +231,7 @@ static int i2c_opal_probe(struct platform_device *pdev)
  
  	adapter-algo = i2c_opal_algo;

adapter-algo_data = (void *)(unsigned long)opal_id;
+   adapter-quirks = i2c_opal_quirks;
adapter-dev.parent = pdev-dev;
adapter-dev.of_node = of_node_get(pdev-dev.of_node);
pname = of_get_property(pdev-dev.of_node, ibm,port-name, NULL);


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/fsl-booke: Add T4080 SVR value

2015-03-10 Thread Igal . Liberman
From: Madalin Bucur madalin.bu...@freescale.com

Signed-off-by: Madalin Bucur madalin.bu...@freescale.com
---
 arch/powerpc/include/asm/mpc85xx.h |1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/mpc85xx.h 
b/arch/powerpc/include/asm/mpc85xx.h
index 3bef74a..213f3a8 100644
--- a/arch/powerpc/include/asm/mpc85xx.h
+++ b/arch/powerpc/include/asm/mpc85xx.h
@@ -61,6 +61,7 @@
 #define SVR_T4240  0x824000
 #define SVR_T4120  0x824001
 #define SVR_T4160  0x824100
+#define SVR_T4080  0x824102
 #define SVR_C291   0x85
 #define SVR_C292   0x850020
 #define SVR_C293   0x850030
-- 
1.7.9.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH 1/2] powerpc: Add a proper syscall for switching endianness

2015-03-10 Thread Anshuman Khandual
On 03/10/2015 04:25 PM, Michael Ellerman wrote:
 On Tue, 2015-03-10 at 20:34 +1100, Benjamin Herrenschmidt wrote:
 On Tue, 2015-03-10 at 18:36 +1100, Michael Ellerman wrote:
 We currently have a special syscall for switching endianness. This is
 syscall number 0x1ebe, which is handled explicitly in the 64-bit syscall
 exception entry.

 That has a few problems, firstly the syscall number is outside of the
 usual range, which confuses various tools. For example strace doesn't
 recognise the syscalls at all.

 Secondly it's handled explicitly as a special case in the syscall
 exception entry, which is complicated enough without it.

 As a first step toward removing the special syscall, we need to add a
 regular syscall that implements the same functionality.

 The logic is simple, it simply toggles the MSR_LE bit in the userspace
 MSR. This is the same as the special syscall, with the caveat that the
 special syscall clobbers fewer registers.

 You can set _TIF_RESTOREALL to force a restore of all the registers on
 the way back which should do the job.
 
 Right, I'd forgotten we talked about that.
 
 I'll try that tomorrow.

The test fails when we add set_thread_flag(TIF_RESTOREALL) after the MSR flip.
Though the test passes with the original patch.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC V2 04/12] i2c: opal: make use of the new infrastructure for quirks

2015-03-10 Thread Neelesh Gupta


On 03/11/2015 04:42 AM, Benjamin Herrenschmidt wrote:

On Tue, 2015-03-10 at 22:43 +0530, Neelesh Gupta wrote:

I tested the i2c opal driver after updating the patch as below.
Basically I think we can also support write-then-{read/write}
for the number of messages = 2.
Ben, any issues if we support both write plus read/write in the
opal driver ?

Nope, in fact it's a good idea, I found myself having to expoes such
an interface to some userspace tool of ours.

However...


diff --git a/drivers/i2c/busses/i2c-opal.c b/drivers/i2c/busses/i2c-opal.c
index 16f90b1..85412ba 100644
--- a/drivers/i2c/busses/i2c-opal.c
+++ b/drivers/i2c/busses/i2c-opal.c
@@ -104,18 +104,8 @@ static int i2c_opal_master_xfer(struct i2c_adapter *adap, 
struct i2c_msg *msgs,
req.buffer_ra = cpu_to_be64(__pa(msgs[0].buf));
break;
case 2:
-   /* For two messages, we basically support only simple
-* smbus transactions of a write plus a read. We might
-* want to allow also two writes but we'd have to bounce
-* the data into a single buffer.
-*/
-   if ((msgs[0].flags  I2C_M_RD) || !(msgs[1].flags  I2C_M_RD))
-   return -EOPNOTSUPP;

Don't we still want to enforce that the first message is a write ?
Somebody may not be looking at the quirks...


-   if (msgs[0].len  4)
-   return -EOPNOTSUPP;

And that the len is supported...


-   if (msgs[0].addr != msgs[1].addr)
-   return -EOPNOTSUPP;

Same...

Ie, the quirk indicates to the callers what we support, but we should
still check that we aren't called with something that doesn't match.


Quirk *also* return error to the user if any of the conditions mismatch with
what we have indicated through the quriks structure...

I think we can't land up here by-passing the check for quirks so above 
checks

are duplicated here..

Neelesh.




-   req.type = OPAL_I2C_SM_READ;
+   req.type = (msgs[1].flags  I2C_M_RD) ?
+   OPAL_I2C_SM_READ : OPAL_I2C_SM_WRITE;
req.addr = cpu_to_be16(msgs[0].addr);
req.subaddr_sz = msgs[0].len;
for (i = 0; i  msgs[0].len; i++)
@@ -210,6 +200,11 @@ static const struct i2c_algorithm i2c_opal_algo = {
.functionality  = i2c_opal_func,
   };

+static struct i2c_adapter_quirks i2c_opal_quirks = {
+   .flags = I2C_AQ_COMB | I2C_AQ_COMB_WRITE_FIRST | I2C_AQ_COMB_SAME_ADDR,
+   .max_comb_1st_msg_len = 4,
+};
+
   static int i2c_opal_probe(struct platform_device *pdev)
   {
struct i2c_adapter  *adapter;
@@ -232,6 +227,7 @@ static int i2c_opal_probe(struct platform_device *pdev)

adapter-algo = i2c_opal_algo;
adapter-algo_data = (void *)(unsigned long)opal_id;
+   adapter-quirks = i2c_opal_quirks;
adapter-dev.parent = pdev-dev;
adapter-dev.of_node = of_node_get(pdev-dev.of_node);
pname = of_get_property(pdev-dev.of_node, ibm,port-name, NULL);



On 02/25/2015 09:31 PM, Wolfram Sang wrote:

From: Wolfram Sang wsa+rene...@sang-engineering.com

Signed-off-by: Wolfram Sang wsa+rene...@sang-engineering.com
---
   drivers/i2c/busses/i2c-opal.c | 22 +++---
   1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/i2c/busses/i2c-opal.c b/drivers/i2c/busses/i2c-opal.c
index 16f90b1a750894..b2788ecad5b3cb 100644
--- a/drivers/i2c/busses/i2c-opal.c
+++ b/drivers/i2c/busses/i2c-opal.c
@@ -104,17 +104,6 @@ static int i2c_opal_master_xfer(struct i2c_adapter *adap, 
struct i2c_msg *msgs,
req.buffer_ra = cpu_to_be64(__pa(msgs[0].buf));
break;
case 2:
-   /* For two messages, we basically support only simple
-* smbus transactions of a write plus a read. We might
-* want to allow also two writes but we'd have to bounce
-* the data into a single buffer.
-*/
-   if ((msgs[0].flags  I2C_M_RD) || !(msgs[1].flags  I2C_M_RD))
-   return -EOPNOTSUPP;
-   if (msgs[0].len  4)
-   return -EOPNOTSUPP;
-   if (msgs[0].addr != msgs[1].addr)
-   return -EOPNOTSUPP;
req.type = OPAL_I2C_SM_READ;
req.addr = cpu_to_be16(msgs[0].addr);
req.subaddr_sz = msgs[0].len;
@@ -210,6 +199,16 @@ static const struct i2c_algorithm i2c_opal_algo = {
.functionality  = i2c_opal_func,
   };
   
+/* For two messages, we basically support only simple

+ * smbus transactions of a write plus a read. We might
+ * want to allow also two writes but we'd have to bounce
+ * the data into a single buffer.
+ */
+static struct i2c_adapter_quirks i2c_opal_quirks = {
+   .flags = I2C_AQ_COMB_WRITE_THEN_READ,
+   .max_comb_1st_msg_len = 4,
+};
+
   static int 

Re: [PATCH 03/15] fbdev: aty128fb: replace PPC_OF with PPC

2015-03-10 Thread Kevin Hao
On Tue, Mar 10, 2015 at 02:23:12PM +0200, Tomi Valkeinen wrote:
  I just sent out a v2 [1] a few hours earlier with some minor updates. We 
  plan
  to merge this patch series via the powerpc tree in 4.1 cycle if I can 
  collect
  all the acks from the corresponding driver maintainers.
 
 Fbdev changes look ok to me.

Hi Tomi,

I assume that I can add a Acked-by: Tomi Valkeinen tomi.valkei...@ti.com
for all the following patches, right?
  http://patchwork.ozlabs.org/patch/443890/
  http://patchwork.ozlabs.org/patch/443891/
  http://patchwork.ozlabs.org/patch/443892/
  http://patchwork.ozlabs.org/patch/443893/
  http://patchwork.ozlabs.org/patch/443894/
  http://patchwork.ozlabs.org/patch/443895/
  http://patchwork.ozlabs.org/patch/443897/

Thanks,
Kevin


pgpfCYL10pVvd.pgp
Description: PGP signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v12 10/21] PCI: Consider additional PF's IOV BAR alignment in sizing and assigning

2015-03-10 Thread Bjorn Helgaas
On Mon, Mar 02, 2015 at 03:32:47PM +0800, Wei Yang wrote:
 On Tue, Feb 24, 2015 at 02:41:52AM -0600, Bjorn Helgaas wrote:
 On Tue, Feb 24, 2015 at 02:34:06AM -0600, Bjorn Helgaas wrote:
  From: Wei Yang weiy...@linux.vnet.ibm.com
  
  When sizing and assigning resources, we divide the resources into two
  lists: the requested list and the additional list.  We don't consider the
  alignment of additional VF(n) BAR space.
  
  This is reasonable because the alignment required for the VF(n) BAR space
  is the size of an individual VF BAR, not the size of the space for *all*
  VFs.  But some platforms, e.g., PowerNV, require additional alignment.
  
  Consider the additional IOV BAR alignment when sizing and assigning
  resources.  When there is not enough system MMIO space, the PF's IOV BAR
  alignment will not contribute to the bridge.  When there is enough system
  MMIO space, the additional alignment will contribute to the bridge.
 
 I don't understand the when there is not enough system MMIO space part.
 How do we tell if there's enough MMIO space?
 
 
 In __assign_resources_sorted(), it has two resources list, one for requested
 (head) and one for additional (realloc_head). This function will first try to
 combine them and assign. If failed, this means we don't have enough MMIO
 space.

How about this text:

  This is because the alignment required for the VF(n) BAR space is the size
  of an individual VF BAR, not the size of the space for *all* VFs.  But we
  want additional alignment to support partitioning on PowerNV.

  Consider the additional IOV BAR alignment when sizing and assigning
  resources.  When there is not enough system MMIO space to accomodate both
  the requested list and the additional list, the PF's IOV BAR alignment will
  not contribute to the bridge.  When there is enough system MMIO space for
  both lists, the additional alignment will contribute to the bridge.

We're doing something specifically for PowerNV.  I would really like to be
able to read this patch and say Oh, here's the hook where we get the
PowerNV behavior, and it's obvious that other platforms are unaffected.
But I don't see a pcibios or similar hook, so I don't know where that
PowerNV behavior is.

Is it something to do with get_res_add_align()?  That uses min_align, but I
don't know how that's connected ...  ah, I see, add_align is computed
from pci_resource_alignment(), which has this path:

  pci_resource_alignment
pci_sriov_resource_alignment
  pcibios_iov_resource_alignment

and powerpc has a special pcibios_iov_resource_alignment() for PowerNV.

  Also, take advantage of pci_dev_resource::min_align to store this
  additional alignment.
 
 This comment doesn't seem to make sense; this patch doesn't save anything
 in min_align.
 
 At the end of this patch:
 
add_to_list(realloc_head, bus-self, b_res, size1-size0, add_align);
 
 The add_align is stored in pci_dev_resource::min_align in add_to_list(). And
 retrieved by get_res_add_align() in below code. This field is not used
 previously, so I took advantage of this field to store the alignment of the
 additional resources.

Hmm.  pci_dev_resource::min_align *is* already used in
reassign_resources_sorted().  Maybe there's no overlap; I gave up the
analysis before I could convince myself.

The changelog needs to mention the add_to_list() connection.

  +  /*
  +   * There are two kinds of additional resources in the list:
  +   * 1. bridge resource  -- IORESOURCE_STARTALIGN
  +   * 2. SR-IOV resource   -- IORESOURCE_SIZEALIGN
  +   * Here just fix the additional alignment for bridge
  +   */
  +  if (!(dev_res-res-flags  IORESOURCE_STARTALIGN))
  +  continue;
  +
  +  add_align = get_res_add_align(realloc_head, dev_res-res);
  +
  +  /* Reorder the list by their alignment */
 
 Why do we need to reorder the list by alignment?
 
 Resource list head is sorted by the alignment, while the alignment would be
 changed after we considering the additional resource.
 
 Take powernv platform as an example. The IOV BAR is expanded and need to be
 aligned with its total size instead of the individual VF BAR size. If we don't
 reorder it, the IOV BAR would be assigned after some other resources, which
 may cause the real assignment fail even the total size is enough.

This is worthy of a comment in the code.

Bjorn
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 29/29] vfio: powerpc/spapr: Support Dynamic DMA windows

2015-03-10 Thread Alex Williamson
On Tue, 2015-03-10 at 01:07 +1100, Alexey Kardashevskiy wrote:
 This adds create/remove window ioctls to create and remove DMA windows.
 sPAPR defines a Dynamic DMA windows capability which allows
 para-virtualized guests to create additional DMA windows on a PCI bus.
 The existing linux kernels use this new window to map the entire guest
 memory and switch to the direct DMA operations saving time on map/unmap
 requests which would normally happen in a big amounts.
 
 This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
 VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
 Up to 2 windows are supported now by the hardware and by this driver.
 
 This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
 information such as a number of supported windows and maximum number
 levels of TCE tables.
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
 Changes:
 v4:
 * moved code to tce_iommu_create_window()/tce_iommu_remove_window()
 helpers
 * added docs
 ---
  Documentation/vfio.txt  |  19 +
  arch/powerpc/include/asm/iommu.h|   2 +-
  drivers/vfio/vfio_iommu_spapr_tce.c | 165 
 +++-
  include/uapi/linux/vfio.h   |  24 +-
  4 files changed, 207 insertions(+), 3 deletions(-)
 
 diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
 index 791e85c..61ce393 100644
 --- a/Documentation/vfio.txt
 +++ b/Documentation/vfio.txt
 @@ -446,6 +446,25 @@ the memory block.
  The user space is not expected to call these often and the block descriptors
  are stored in a linked list in the kernel.
  
 +6) sPAPR specification allows guests to have an ddditional DMA window(s) on
 +a PCI bus with a variable page size. Two ioctls have been added to support
 +this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
 +The platform has to support the functionality or error will be returned to
 +the userspace. The existing hardware supports up to 2 DMA windows, one is
 +2GB long, uses 4K pages and called default 32bit window; the other can
 +be as big as entire RAM, use different page size, it is optional - guests
 +create those in run-time if the guest driver supports 64bit DMA.
 +
 +VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
 +a number of TCE table levels (if a TCE table is going to be big enough and
 +the kernel may not be able to allocate enough of physicall contiguous 
 memory).
 +It creates a new window in the available slot and returns the bus address 
 where
 +the new window starts. Due to hardware limitation, the user space cannot 
 choose
 +the location of DMA windows.
 +
 +VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
 +and removes it.
 +
  
 ---
  
  [1] VFIO was originally an acronym for Virtual Function I/O in its
 diff --git a/arch/powerpc/include/asm/iommu.h 
 b/arch/powerpc/include/asm/iommu.h
 index 04f72ac..de82b61 100644
 --- a/arch/powerpc/include/asm/iommu.h
 +++ b/arch/powerpc/include/asm/iommu.h
 @@ -138,7 +138,7 @@ extern void iommu_free_table(struct iommu_table *tbl, 
 const char *node_name);
  extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
   int nid);
  
 -#define IOMMU_TABLE_GROUP_MAX_TABLES 1
 +#define IOMMU_TABLE_GROUP_MAX_TABLES 2
  
  struct iommu_table_group;
  
 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
 b/drivers/vfio/vfio_iommu_spapr_tce.c
 index 3a0b5fe..7aa4141b 100644
 --- a/drivers/vfio/vfio_iommu_spapr_tce.c
 +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
 @@ -96,6 +96,7 @@ struct tce_container {
   struct list_head mem_list;
   struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
   struct list_head group_list;
 + bool v2;
  };
  
  struct tce_iommu_group {
 @@ -333,6 +334,20 @@ static struct iommu_table *spapr_tce_find_table(
   return ret;
  }
  
 +static int spapr_tce_find_free_table(struct tce_container *container)
 +{
 + int i;
 +
 + for (i = 0; i  IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 + struct iommu_table *tbl = container-tables[i];
 +
 + if (!tbl-it_size)
 + return i;
 + }
 +
 + return -1;


Why not use a real errno here?

 +}
 +
  static int tce_iommu_enable(struct tce_container *container)
  {
   int ret = 0;
 @@ -432,6 +447,8 @@ static void *tce_iommu_open(unsigned long arg)
   INIT_LIST_HEAD_RCU(container-mem_list);
   INIT_LIST_HEAD_RCU(container-group_list);
  
 + container-v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 +

Ah, here v2 actually provides some enforced differentiation, right?  ...
oh wait, nobody ever uses this.
   return container;
  }
  
 @@ -605,11 +622,90 @@ static long tce_iommu_build(struct tce_container 
 *container,
   return ret;
  }
  
 +static long tce_iommu_create_window(struct tce_container *container,
 + __u32 page_shift, __u64 

Re: [PATCH v12 17/21] powerpc/powernv: Shift VF resource with an offset

2015-03-10 Thread Bjorn Helgaas
On Wed, Mar 04, 2015 at 11:01:24AM +0800, Wei Yang wrote:
 On Tue, Feb 24, 2015 at 03:00:37AM -0600, Bjorn Helgaas wrote:
 On Tue, Feb 24, 2015 at 02:34:57AM -0600, Bjorn Helgaas wrote:
  From: Wei Yang weiy...@linux.vnet.ibm.com
  
  On PowerNV platform, resource position in M64 implies the PE# the resource
  belongs to.  In some cases, adjustment of a resource is necessary to locate
  it to a correct position in M64.
  
  Add pnv_pci_vf_resource_shift() to shift the 'real' PF IOV BAR address
  according to an offset.
  
  [bhelgaas: rework loops, rework overlap check, index resource[]
  conventionally, remove pci_regs.h include, squashed with next patch]
  Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
  Signed-off-by: Bjorn Helgaas bhelg...@google.com
 
 ...
 
  +#ifdef CONFIG_PCI_IOV
  +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
  +{
  +  struct pci_dn *pdn = pci_get_pdn(dev);
  +  int i;
  +  struct resource *res, res2;
  +  resource_size_t size;
  +  u16 vf_num;
  +
  +  if (!dev-is_physfn)
  +  return -EINVAL;
  +
  +  /*
  +   * offset is in VFs.  The M64 windows are sized so that when they
  +   * are segmented, each segment is the same size as the IOV BAR.
  +   * Each segment is in a separate PE, and the high order bits of the
  +   * address are the PE number.  Therefore, each VF's BAR is in a
  +   * separate PE, and changing the IOV BAR start address changes the
  +   * range of PEs the VFs are in.
  +   */
  +  vf_num = pdn-vf_pes;
  +  for (i = 0; i  PCI_SRIOV_NUM_BARS; i++) {
  +  res = dev-resource[i + PCI_IOV_RESOURCES];
  +  if (!res-flags || !res-parent)
  +  continue;
  +
  +  if (!pnv_pci_is_mem_pref_64(res-flags))
  +  continue;
  +
  +  /*
  +   * The actual IOV BAR range is determined by the start address
  +   * and the actual size for vf_num VFs BAR.  This check is to
  +   * make sure that after shifting, the range will not overlap
  +   * with another device.
  +   */
  +  size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
  +  res2.flags = res-flags;
  +  res2.start = res-start + (size * offset);
  +  res2.end = res2.start + (size * vf_num) - 1;
  +
  +  if (res2.end  res-end) {
  +  dev_err(dev-dev, VF BAR%d: %pR would extend past %pR 
  (trying to enable %d VFs shifted by %d)\n,
  +  i, res2, res, vf_num, offset);
  +  return -EBUSY;
  +  }
  +  }
  +
  +  for (i = 0; i  PCI_SRIOV_NUM_BARS; i++) {
  +  res = dev-resource[i + PCI_IOV_RESOURCES];
  +  if (!res-flags || !res-parent)
  +  continue;
  +
  +  if (!pnv_pci_is_mem_pref_64(res-flags))
  +  continue;
  +
  +  size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
  +  res2 = *res;
  +  res-start += size * offset;
 
 I'm still not happy about this fiddling with res-start.
 
 Increasing res-start means that in principle, the size * offset bytes
 that we just removed from res are now available for allocation to somebody
 else.  I don't think we *will* give that space to anything else because of
 the alignment restrictions you're enforcing, but res now doesn't
 correctly describe the real resource map.
 
 Would you be able to just update the BAR here while leaving the struct
 resource alone?  In that case, it would look a little funny that lspci
 would show a BAR value in the middle of the region in /proc/iomem, but
 the /proc/iomem region would be more correct.
 
 Bjorn,
 
 I did some tests, while the result is not good.
 
 What I did is still write the shifted resource address to the device by
 pci_update_resource(), but I revert the res-start to the original one. If
 this step is not correct, please let me know.
 
 This can't work since after we revert the res-start, those VFs will be given
 resources from res-start instead of (res-start + offset * size). This is not
 what we expect.

Hmm, yes, I suppose we'd have to have a hook in pci_bus_alloc_from_region()
or something.  That's getting a little messy.  I still don't like messing
with the resource after it's in the resource tree, but I don't have a
better idea right now.  So let's just go with what you have.

  +
  +  dev_info(dev-dev, VF BAR%d: %pR shifted to %pR (enabling %d 
  VFs shifted by %d)\n,
  +   i, res2, res, vf_num, offset);
  +  pci_update_resource(dev, i + PCI_IOV_RESOURCES);
  +  }
  +  pdn-max_vfs -= offset;
  +  return 0;
  +}
  +#endif /* CONFIG_PCI_IOV */
 
 -- 
 Richard Yang
 Help you, Help me
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-pci in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
___
Linuxppc-dev mailing list

Re: [PATCH 4/4] powerpc/eeh: remove unused macro IS_BRIDGE

2015-03-10 Thread Gavin Shan
On Mon, Mar 09, 2015 at 11:17:32AM +0800, Wei Yang wrote:
Currently, the macro IS_BRIDGE is not used any where.
This patch just removes it.

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com

Acked-by: Gavin Shan gws...@linux.vnet.ibm.com

Thanks,
Gavin

---
 arch/powerpc/kernel/eeh.c |2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3b2252e..60df70c 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -144,8 +144,6 @@ struct eeh_stats {

 static struct eeh_stats eeh_stats;

-#define IS_BRIDGE(class_code) (((class_code)16) == PCI_BASE_CLASS_BRIDGE)
-
 static int __init eeh_setup(char *str)
 {
   if (!strcmp(str, off))
-- 
1.7.9.5


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/4] powerpc/eeh: fix powernv_eeh_wait_state delay logic

2015-03-10 Thread Wei Yang
On Wed, Mar 11, 2015 at 04:13:46PM +1100, Gavin Shan wrote:
On Mon, Mar 09, 2015 at 11:17:31AM +0800, Wei Yang wrote:
As the comment indicates, powernv_eeh_get_state() will inform EEH core to
delay 1 second. This means the delay doesn't happen when
powernv_eeh_get_state() returns.

This patch moves the delay subtraction just before msleep(), which is the
same logic in pseries_eeh_wait_state().

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com

The patch would conflict with the patches to remove EEH chip layer done
in https://patchwork.ozlabs.org/patch/439956/. Could you please repost
with corrected function names in commit log when the dependent patches
show up in Michael's next branch?

Except that, the changes look ok to me:

Acked-by: Gavin Shan gws...@linux.vnet.ibm.com

Thanks, I will repost it accordingly.


Thanks,
Gavin

---
 arch/powerpc/platforms/powernv/eeh-powernv.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 5abb4c2..af1be8f 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -334,13 +334,13 @@ static int powernv_eeh_wait_state(struct eeh_pe *pe, 
int max_wait)
  if (ret != EEH_STATE_UNAVAILABLE)
  return ret;

- max_wait -= mwait;
  if (max_wait = 0) {
  pr_warn(%s: Timeout getting PE#%x's state (%d)\n,
  __func__, pe-addr, max_wait);
  return EEH_STATE_NOT_SUPPORT;
  }

+ max_wait -= mwait;
  msleep(mwait);
  }

-- 
1.7.9.5


-- 
Richard Yang
Help you, Help me

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 03/29] vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size

2015-03-10 Thread Alex Williamson
On Tue, 2015-03-10 at 01:06 +1100, Alexey Kardashevskiy wrote:
 This checks that the TCE table page size is not bigger that the size of
 a page we just pinned and going to put its physical address to the table.
 
 Otherwise the hardware gets unwanted access to physical memory between
 the end of the actual page and the end of the aligned up TCE page.
 
 Since compound_order() and compound_head() work correctly on non-huge
 pages, there is no need for additional check whether the page is huge.
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
 Changes:
 v4:
 * s/tce_check_page_size/tce_page_is_contained/
 ---
  drivers/vfio/vfio_iommu_spapr_tce.c | 22 ++
  1 file changed, 22 insertions(+)
 
 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
 b/drivers/vfio/vfio_iommu_spapr_tce.c
 index 756831f..91e7599 100644
 --- a/drivers/vfio/vfio_iommu_spapr_tce.c
 +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
 @@ -49,6 +49,22 @@ struct tce_container {
   bool enabled;
  };
  
 +static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 +{
 + unsigned shift;
 +
 + /*
 +  * Check that the TCE table granularity is not bigger than the size of
 +  * a page we just found. Otherwise the hardware can get access to
 +  * a bigger memory chunk that it should.
 +  */
 + shift = PAGE_SHIFT + compound_order(compound_head(page));
 + if (shift = page_shift)
 + return true;
 +
 + return false;

nit, simplified:

return (PAGE_SHIFT + compound_order(compound_head(page) = page_shift);

 +}
 +
  static int tce_iommu_enable(struct tce_container *container)
  {
   int ret = 0;
 @@ -197,6 +213,12 @@ static long tce_iommu_build(struct tce_container 
 *container,
   ret = -EFAULT;
   break;
   }
 +
 + if (!tce_page_is_contained(page, tbl-it_page_shift)) {
 + ret = -EPERM;
 + break;
 + }
 +
   hva = (unsigned long) page_address(page) +
   (tce  IOMMU_PAGE_MASK(tbl)  ~PAGE_MASK);
  



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 0/4] Support registering specific reset handler

2015-03-10 Thread Gavin Shan
On Fri, Mar 06, 2015 at 12:38:59PM -0600, Bjorn Helgaas wrote:
On Fri, Feb 20, 2015 at 04:53:08PM +1100, Gavin Shan wrote:
 On Thu, Feb 19, 2015 at 04:57:47PM -0200, casca...@linux.vnet.ibm.com wrote:
 On Tue, Feb 17, 2015 at 09:36:47AM +1100, Gavin Shan wrote:
  On Mon, Feb 16, 2015 at 11:14:27AM -0200, casca...@linux.vnet.ibm.com 
  wrote:
  On Fri, Feb 13, 2015 at 03:54:55PM +1100, Gavin Shan wrote:
   VFIO PCI infrastructure depends on pci_reset_function() to do reset on
   PCI devices so that they would be in clean state when host or guest 
   grabs
   them. Unfortunately, the function doesn't work (or not well) on some 
   PCI
   devices that require EEH PE reset.
   
   The patchset extends the quirk for PCI device speicific reset methods 
   to
   allow dynamically registration. With it, we can translate reset 
   requests
   for those special PCI devcies to EEH PE reset, which is only avaialble 
   on
   64-bits PowerPC platforms.
   
  
  Hi, Gavin.
  
  I like your approach overall. That allows us to confine these quirks to
  the platforms where they are relevant. I would make the quirks more
  specific, though, instead of doing them for all IBM and Mellanox
  devices.
  
  
  Yeah, we need have more specific vendor/device IDs for PATCH[4/4]. Could
  you please take a look on PATCH[4/4] and then suggest the specific devices
  that requries the platform-dependent reset quirk? Especially the device 
  IDs
  for IBM/Mellanox we need put add quirks for.
  
  I wonder if we should not have some form of domain reset, where we would
  reset all the devices on the same group, and use that on vfio. Grouping
  the devices would then be made platform-dependent, as well as the reset
  method. On powernv, we would group by IOMMU group and issue a
  fundamental reset.
  
  
  I'm assuming domain reset is PE reset, which is the specific reset 
  handler
  tries to do on PowerNV platform. The reason why we need platform specific
  reset handler is some adapters can't support function level reset methods
  (except pci_dev_specific_reset()) in __pci_dev_reset().
  
 
 Well, in the case of Power servers, this would be the PE reset, I am not
 sure what this would be on other platforms. No other platform implements
 pci_set_pcie_reset_state, which I think would be one possible to
 implement such a reset.
 
 What I am saying is that we should consider doing this on all platforms
 and for all adapters, because this is not specific to Power and this is
 not specific to this particular set of adapters. Otherwise, one can
 simply program one adapter in the guest to write to host memory,
 shutdown, and since no reset will take place, the card will simply write
 to host memory when the guest is finished with and IOMMU is off.
 
 
 Yes, I believe your way will make things simpler and we don't need the
 code to support registering reset handler dynamically at atll. Please
 confirm if following idea is what you're suggesting: 
 
 - Introduce function drivers/pci/quirks.c::pci_dev_specific_reset(), which
   will be added to pci_dev_reset_methods[] for various vendor/device IDs.
 - pci_dev_specific_reset() routes the reset (or probe) request to
   pci_set_pcie_reset_state(), which needs one more syntax for reset probing.
   In turn, pci_set_pcie_reset_state() calls pcibios_set_pcie_reset_state(),
   which returns -ETTY by default.
 
 For PowerPC, pcibios_set_pcie_reset_state() will do PE reset, which can't
 be ported to other platforms as it depends on PowerPC unique EEH feature.
 So other platforms have to override this function and do things similar to
 PowerPC PE reset there.

Dropping for now because it sounds like you are considering reworking based
on Cascardo's ideas.  If not, please re-post these so they show up in
patchwork again.


Yes, I'll rework on this according to Cascardo's input and repost.

Thanks,
Gavin

Bjorn
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[REGRESSION in 3.18][PPC] PA Semi fails to boot after: of/base: Fix PowerPC address parsing hack

2015-03-10 Thread Christian Zigotzky

On 10/03/2015 01:33 a.m., Olof Johansson wrote:

* Electra: First development/eval board. Funky USB on localbus, plenty
of PCI-e. Two GigE, one 10GigE XAUI. CompactFlash and IDE on localbus
too. Usually shipped with a PCI-e SATA card and a USB card.

* Chitra: Second edition dev/eval board. Moved SATA and USB on-board,
and removed some of the localbus hardware. Might have routed three
GigE out instead of 2, can't remember.

* Athena: Never released board with a smaller package chip, there's
only a few of these around. Can't comment too much on the specifics,
but it's similar to Chitra, and the silicon is the same.



@All

FYI:

* Nemo: Released as AmigaONE X1000. With co-processor: Xena 500 MHz 
XCore XS1-L2 124 SDS

and SB600 southbridge.

Linux support:

http://forum.hyperion-entertainment.biz/viewforum.php?f=35sid=7849bde9bae455730f3b95ad207bb6e9
http://www.supertuxkart-amiga.de/amiga/x1000.html

Is still in stock: http://amigakit.leamancomputing.com/x1000/?webpage=buy.

It means you could buy a new PA6T system with active Linux support. It 
runs the latest kernel 4.0-rc3

and the new ubuntu MATE 15.04 PowerPC on this board.


Mine is running:

   CFE version PAS-2.0.29 for ATHENA (64bit,MP,BE,PPC)
   Build Date: Mon Jun 30 11:47:25 PDT 2008 (mpl@mitch-1)



FYI: Nemo: CFE version PAS-2.0.30

Rgds,

Christian
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[RFC PATCH 1/2] powerpc: Add a proper syscall for switching endianness

2015-03-10 Thread Michael Ellerman
We currently have a special syscall for switching endianness. This is
syscall number 0x1ebe, which is handled explicitly in the 64-bit syscall
exception entry.

That has a few problems, firstly the syscall number is outside of the
usual range, which confuses various tools. For example strace doesn't
recognise the syscalls at all.

Secondly it's handled explicitly as a special case in the syscall
exception entry, which is complicated enough without it.

As a first step toward removing the special syscall, we need to add a
regular syscall that implements the same functionality.

The logic is simple, it simply toggles the MSR_LE bit in the userspace
MSR. This is the same as the special syscall, with the caveat that the
special syscall clobbers fewer registers.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 arch/powerpc/include/asm/systbl.h  | 1 +
 arch/powerpc/include/asm/unistd.h  | 2 +-
 arch/powerpc/include/uapi/asm/unistd.h | 1 +
 arch/powerpc/kernel/syscalls.c | 7 +++
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
index 91062eef582f..524bf5dff9f9 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -367,3 +367,4 @@ SYSCALL_SPU(getrandom)
 SYSCALL_SPU(memfd_create)
 SYSCALL_SPU(bpf)
 COMPAT_SYS(execveat)
+SYSCALL(switch_endian)
diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index 36b79c31eedd..f4f8b667d75b 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
 #include uapi/asm/unistd.h
 
 
-#define __NR_syscalls  363
+#define __NR_syscalls  364
 
 #define __NR__exit __NR_exit
 #define NR_syscalls__NR_syscalls
diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
b/arch/powerpc/include/uapi/asm/unistd.h
index ef5b5b1f3123..e4aa173dae62 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -385,5 +385,6 @@
 #define __NR_memfd_create  360
 #define __NR_bpf   361
 #define __NR_execveat  362
+#define __NR_switch_endian 363
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index b2702e87db0d..0ea01957990e 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -121,3 +121,10 @@ long ppc_fadvise64_64(int fd, int advice, u32 offset_high, 
u32 offset_low,
return sys_fadvise64(fd, (u64)offset_high  32 | offset_low,
 (u64)len_high  32 | len_low, advice);
 }
+
+long sys_switch_endian(void)
+{
+   current-thread.regs-msr ^= MSR_LE;
+
+   return 0;
+}
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[RFC PATCH 2/2] selftests/powerpc: Add a test of the switch_endian() syscall

2015-03-10 Thread Michael Ellerman
This adds a test of the switch_endian() syscall we added in the previous
commit.

We test it by calling the endian switch syscall, and then executing some
code in the other endian which writes to stdout and then does exit(0).

If the endian switch failed to happen that code sequence will be illegal
and cause the test to abort.

Signed-off-by: Michael Ellerman m...@ellerman.id.au
---
 tools/testing/selftests/powerpc/Makefile   |  2 +-
 tools/testing/selftests/powerpc/syscalls/Makefile  | 15 
 .../selftests/powerpc/syscalls/endian-test.S   | 43 ++
 3 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/syscalls/Makefile
 create mode 100644 tools/testing/selftests/powerpc/syscalls/endian-test.S

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 1d5e7ad2c460..5da93b7d1330 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -13,7 +13,7 @@ CFLAGS := -Wall -O2 -flto -Wall -Werror 
-DGIT_VERSION='$(GIT_VERSION)' -I$(CUR
 
 export CC CFLAGS
 
-TARGETS = pmu copyloops mm tm primitives stringloops
+TARGETS = pmu copyloops mm tm primitives stringloops syscalls
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile 
b/tools/testing/selftests/powerpc/syscalls/Makefile
new file mode 100644
index ..b74201fa4f15
--- /dev/null
+++ b/tools/testing/selftests/powerpc/syscalls/Makefile
@@ -0,0 +1,15 @@
+PROGS := endian-test
+
+endian-test: ASFLAGS += -O2 -Wall -g -nostdlib -m64
+
+all: $(PROGS)
+
+run_tests: all
+   @-for PROG in $(PROGS); do \
+   ./$$PROG; \
+   done;
+
+clean:
+   rm -f $(PROGS)
+
+.PHONY: all run_tests clean
diff --git a/tools/testing/selftests/powerpc/syscalls/endian-test.S 
b/tools/testing/selftests/powerpc/syscalls/endian-test.S
new file mode 100644
index ..ba596eb9925f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/syscalls/endian-test.S
@@ -0,0 +1,43 @@
+#include ppc-asm.h
+#include asm/unistd.h
+
+   .data
+   .balign 8
+message:
+   .ascii success: endian-test\n
+
+   .text
+FUNC_START(_start)
+   ld r14, message@got(%r2)
+
+   /* Call the syscall to flip endian */
+   li r0, 363
+   sc
+
+   /*
+* The longs below encode the following code snippet, in the other
+* endian. We preloaded the address of message into r14 in case it
+* moves.
+*
+*  li r0, __NR_write
+*  li r3, 1# stdout
+*  mr r4, r14  # Get message
+*  li r5, 21   # strlen(message)
+*  sc
+*  li r0, __NR_exit
+*  li r3, 0# exit(0)
+*  sc
+*  b .
+*
+* In both cases, if the endian has not been flipped the processor will
+* see an illegal instruction and the process will exit abnormally.
+*/
+   .long 0x0438
+   .long 0x01006038
+   .long 0x7873c47d
+   .long 0x1500a038
+   .long 0x0244
+   .long 0x0138
+   .long 0x6038
+   .long 0x0244
+   .long 0x0048
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 03/29] vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size

2015-03-10 Thread Alex Williamson
On Wed, 2015-03-11 at 09:57 +1100, Alexey Kardashevskiy wrote:
 On 03/11/2015 06:56 AM, Alex Williamson wrote:
  On Tue, 2015-03-10 at 01:06 +1100, Alexey Kardashevskiy wrote:
  This checks that the TCE table page size is not bigger that the size of
  a page we just pinned and going to put its physical address to the table.
 
  Otherwise the hardware gets unwanted access to physical memory between
  the end of the actual page and the end of the aligned up TCE page.
 
  Since compound_order() and compound_head() work correctly on non-huge
  pages, there is no need for additional check whether the page is huge.
 
  Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
  ---
  Changes:
  v4:
  * s/tce_check_page_size/tce_page_is_contained/
  ---
drivers/vfio/vfio_iommu_spapr_tce.c | 22 ++
1 file changed, 22 insertions(+)
 
  diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
  b/drivers/vfio/vfio_iommu_spapr_tce.c
  index 756831f..91e7599 100644
  --- a/drivers/vfio/vfio_iommu_spapr_tce.c
  +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
  @@ -49,6 +49,22 @@ struct tce_container {
 bool enabled;
};
 
  +static bool tce_page_is_contained(struct page *page, unsigned page_shift)
  +{
  +  unsigned shift;
  +
  +  /*
  +   * Check that the TCE table granularity is not bigger than the size of
  +   * a page we just found. Otherwise the hardware can get access to
  +   * a bigger memory chunk that it should.
  +   */
  +  shift = PAGE_SHIFT + compound_order(compound_head(page));
  +  if (shift = page_shift)
  +  return true;
  +
  +  return false;
 
  nit, simplified:
 
  return (PAGE_SHIFT + compound_order(compound_head(page) = page_shift);
 
 This won't be bool though.

Yes, it will.

  This will (I'll do this)
 
 shift = PAGE_SHIFT + compound_order(compound_head(page));
 return (shift = page_shift);
 
 
 
 
 
  +}
  +
static int tce_iommu_enable(struct tce_container *container)
{
 int ret = 0;
  @@ -197,6 +213,12 @@ static long tce_iommu_build(struct tce_container 
  *container,
 ret = -EFAULT;
 break;
 }
  +
  +  if (!tce_page_is_contained(page, tbl-it_page_shift)) {
  +  ret = -EPERM;
  +  break;
  +  }
  +
 hva = (unsigned long) page_address(page) +
 (tce  IOMMU_PAGE_MASK(tbl)  ~PAGE_MASK);
 
 
 
 
 
 



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC V2 04/12] i2c: opal: make use of the new infrastructure for quirks

2015-03-10 Thread Benjamin Herrenschmidt
On Tue, 2015-03-10 at 22:43 +0530, Neelesh Gupta wrote:
 I tested the i2c opal driver after updating the patch as below.
 Basically I think we can also support write-then-{read/write}
 for the number of messages = 2.
 Ben, any issues if we support both write plus read/write in the
 opal driver ?

Nope, in fact it's a good idea, I found myself having to expoes such
an interface to some userspace tool of ours.

However...

 diff --git a/drivers/i2c/busses/i2c-opal.c b/drivers/i2c/busses/i2c-opal.c
 index 16f90b1..85412ba 100644
 --- a/drivers/i2c/busses/i2c-opal.c
 +++ b/drivers/i2c/busses/i2c-opal.c
 @@ -104,18 +104,8 @@ static int i2c_opal_master_xfer(struct i2c_adapter 
 *adap, struct i2c_msg *msgs,
   req.buffer_ra = cpu_to_be64(__pa(msgs[0].buf));
   break;
   case 2:
 - /* For two messages, we basically support only simple
 -  * smbus transactions of a write plus a read. We might
 -  * want to allow also two writes but we'd have to bounce
 -  * the data into a single buffer.
 -  */
 - if ((msgs[0].flags  I2C_M_RD) || !(msgs[1].flags  I2C_M_RD))
 - return -EOPNOTSUPP;

Don't we still want to enforce that the first message is a write ?
Somebody may not be looking at the quirks...

 - if (msgs[0].len  4)
 - return -EOPNOTSUPP;

And that the len is supported...

 - if (msgs[0].addr != msgs[1].addr)
 - return -EOPNOTSUPP;

Same...

Ie, the quirk indicates to the callers what we support, but we should
still check that we aren't called with something that doesn't match.

 - req.type = OPAL_I2C_SM_READ;
 + req.type = (msgs[1].flags  I2C_M_RD) ?
 + OPAL_I2C_SM_READ : OPAL_I2C_SM_WRITE;
   req.addr = cpu_to_be16(msgs[0].addr);
   req.subaddr_sz = msgs[0].len;
   for (i = 0; i  msgs[0].len; i++)
 @@ -210,6 +200,11 @@ static const struct i2c_algorithm i2c_opal_algo = {
   .functionality  = i2c_opal_func,
   };
 
 +static struct i2c_adapter_quirks i2c_opal_quirks = {
 + .flags = I2C_AQ_COMB | I2C_AQ_COMB_WRITE_FIRST | I2C_AQ_COMB_SAME_ADDR,
 + .max_comb_1st_msg_len = 4,
 +};
 +
   static int i2c_opal_probe(struct platform_device *pdev)
   {
   struct i2c_adapter  *adapter;
 @@ -232,6 +227,7 @@ static int i2c_opal_probe(struct platform_device *pdev)
 
   adapter-algo = i2c_opal_algo;
   adapter-algo_data = (void *)(unsigned long)opal_id;
 + adapter-quirks = i2c_opal_quirks;
   adapter-dev.parent = pdev-dev;
   adapter-dev.of_node = of_node_get(pdev-dev.of_node);
   pname = of_get_property(pdev-dev.of_node, ibm,port-name, NULL);
 
 
 
 On 02/25/2015 09:31 PM, Wolfram Sang wrote:
  From: Wolfram Sang wsa+rene...@sang-engineering.com
 
  Signed-off-by: Wolfram Sang wsa+rene...@sang-engineering.com
  ---
drivers/i2c/busses/i2c-opal.c | 22 +++---
1 file changed, 11 insertions(+), 11 deletions(-)
 
  diff --git a/drivers/i2c/busses/i2c-opal.c b/drivers/i2c/busses/i2c-opal.c
  index 16f90b1a750894..b2788ecad5b3cb 100644
  --- a/drivers/i2c/busses/i2c-opal.c
  +++ b/drivers/i2c/busses/i2c-opal.c
  @@ -104,17 +104,6 @@ static int i2c_opal_master_xfer(struct i2c_adapter 
  *adap, struct i2c_msg *msgs,
  req.buffer_ra = cpu_to_be64(__pa(msgs[0].buf));
  break;
  case 2:
  -   /* For two messages, we basically support only simple
  -* smbus transactions of a write plus a read. We might
  -* want to allow also two writes but we'd have to bounce
  -* the data into a single buffer.
  -*/
  -   if ((msgs[0].flags  I2C_M_RD) || !(msgs[1].flags  I2C_M_RD))
  -   return -EOPNOTSUPP;
  -   if (msgs[0].len  4)
  -   return -EOPNOTSUPP;
  -   if (msgs[0].addr != msgs[1].addr)
  -   return -EOPNOTSUPP;
  req.type = OPAL_I2C_SM_READ;
  req.addr = cpu_to_be16(msgs[0].addr);
  req.subaddr_sz = msgs[0].len;
  @@ -210,6 +199,16 @@ static const struct i2c_algorithm i2c_opal_algo = {
  .functionality  = i2c_opal_func,
};

  +/* For two messages, we basically support only simple
  + * smbus transactions of a write plus a read. We might
  + * want to allow also two writes but we'd have to bounce
  + * the data into a single buffer.
  + */
  +static struct i2c_adapter_quirks i2c_opal_quirks = {
  +   .flags = I2C_AQ_COMB_WRITE_THEN_READ,
  +   .max_comb_1st_msg_len = 4,
  +};
  +
static int i2c_opal_probe(struct platform_device *pdev)
{
  struct i2c_adapter  *adapter;
  @@ -232,6 +231,7 @@ static int i2c_opal_probe(struct platform_device *pdev)

  adapter-algo = i2c_opal_algo;
  adapter-algo_data = (void *)(unsigned long)opal_id;
  +   adapter-quirks = i2c_opal_quirks;
  adapter-dev.parent 

Re: [PATCH v5 03/29] vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size

2015-03-10 Thread Alex Williamson
On Wed, 2015-03-11 at 10:14 +1100, Benjamin Herrenschmidt wrote:
 On Tue, 2015-03-10 at 17:03 -0600, Alex Williamson wrote:
return (PAGE_SHIFT + compound_order(compound_head(page) = page_shift);
   
   This won't be bool though.
  
  Yes, it will.
 
 Don't you have your parenthesis in the wrong place, Alex ? :-)

Probably, but the compiler will warn about that.

This will (I'll do this)
   
   shift = PAGE_SHIFT + compound_order(compound_head(page));
   return (shift = page_shift);
 
 



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 03/29] vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size

2015-03-10 Thread Alexey Kardashevskiy

On 03/11/2015 06:56 AM, Alex Williamson wrote:

On Tue, 2015-03-10 at 01:06 +1100, Alexey Kardashevskiy wrote:

This checks that the TCE table page size is not bigger that the size of
a page we just pinned and going to put its physical address to the table.

Otherwise the hardware gets unwanted access to physical memory between
the end of the actual page and the end of the aligned up TCE page.

Since compound_order() and compound_head() work correctly on non-huge
pages, there is no need for additional check whether the page is huge.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* s/tce_check_page_size/tce_page_is_contained/
---
  drivers/vfio/vfio_iommu_spapr_tce.c | 22 ++
  1 file changed, 22 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 756831f..91e7599 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -49,6 +49,22 @@ struct tce_container {
bool enabled;
  };

+static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+{
+   unsigned shift;
+
+   /*
+* Check that the TCE table granularity is not bigger than the size of
+* a page we just found. Otherwise the hardware can get access to
+* a bigger memory chunk that it should.
+*/
+   shift = PAGE_SHIFT + compound_order(compound_head(page));
+   if (shift = page_shift)
+   return true;
+
+   return false;


nit, simplified:

return (PAGE_SHIFT + compound_order(compound_head(page) = page_shift);


This won't be bool though. This will (I'll do this)

shift = PAGE_SHIFT + compound_order(compound_head(page));
return (shift = page_shift);







+}
+
  static int tce_iommu_enable(struct tce_container *container)
  {
int ret = 0;
@@ -197,6 +213,12 @@ static long tce_iommu_build(struct tce_container 
*container,
ret = -EFAULT;
break;
}
+
+   if (!tce_page_is_contained(page, tbl-it_page_shift)) {
+   ret = -EPERM;
+   break;
+   }
+
hva = (unsigned long) page_address(page) +
(tce  IOMMU_PAGE_MASK(tbl)  ~PAGE_MASK);








--
Alexey
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 03/29] vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size

2015-03-10 Thread Benjamin Herrenschmidt
On Tue, 2015-03-10 at 17:03 -0600, Alex Williamson wrote:
   return (PAGE_SHIFT + compound_order(compound_head(page) = page_shift);
  
  This won't be bool though.
 
 Yes, it will.

Don't you have your parenthesis in the wrong place, Alex ? :-)

   This will (I'll do this)
  
  shift = PAGE_SHIFT + compound_order(compound_head(page));
  return (shift = page_shift);


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 07/29] vfio: powerpc/spapr: Moving pinning/unpinning to helpers

2015-03-10 Thread Alex Williamson
On Tue, 2015-03-10 at 01:07 +1100, Alexey Kardashevskiy wrote:
 This is a pretty mechanical patch to make next patches simpler.
 
 New tce_iommu_unuse_page() helper does put_page() now but it might skip
 that after the memory registering patch applied.
 
 As we are here, this removes unnecessary checks for a value returned
 by pfn_to_page() as it cannot possibly return NULL.
 
 This moves tce_iommu_disable() later to let tce_iommu_clear() know if
 the container has been enabled because if it has not been, then
 put_page() must not be called on TCEs from the TCE table. This situation
 is not yet possible but it will after KVM acceleration patchset is
 applied.
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
  drivers/vfio/vfio_iommu_spapr_tce.c | 70 
 -
  1 file changed, 54 insertions(+), 16 deletions(-)
 
 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
 b/drivers/vfio/vfio_iommu_spapr_tce.c
 index d3ab34f..ca396e5 100644
 --- a/drivers/vfio/vfio_iommu_spapr_tce.c
 +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
 @@ -204,7 +204,6 @@ static void tce_iommu_release(void *iommu_data)
   struct iommu_table *tbl = container-tbl;
  
   WARN_ON(tbl  !tbl-it_group);
 - tce_iommu_disable(container);
  
   if (tbl) {
   tce_iommu_clear(container, tbl, tbl-it_offset, tbl-it_size);
 @@ -212,63 +211,102 @@ static void tce_iommu_release(void *iommu_data)
   if (tbl-it_group)
   tce_iommu_detach_group(iommu_data, tbl-it_group);
   }
 +
 + tce_iommu_disable(container);
 +
   mutex_destroy(container-lock);
  
   kfree(container);
  }
  
 +static void tce_iommu_unuse_page(struct tce_container *container,
 + unsigned long oldtce)
 +{
 + struct page *page;
 +
 + if (!(oldtce  (TCE_PCI_READ | TCE_PCI_WRITE)))
 + return;
 +
 + /*
 +  * VFIO cannot map/unmap when a container is not enabled so
 +  * we would not need this check but KVM could map/unmap and if
 +  * this happened, we must not put pages as KVM does not get them as
 +  * it expects memory pre-registation to do this part.
 +  */
 + if (!container-enabled)
 + return;
 +
 + page = pfn_to_page(__pa(oldtce)  PAGE_SHIFT);
 +
 + if (oldtce  TCE_PCI_WRITE)
 + SetPageDirty(page);
 +
 + put_page(page);
 +}
 +
  static int tce_iommu_clear(struct tce_container *container,
   struct iommu_table *tbl,
   unsigned long entry, unsigned long pages)
  {
   unsigned long oldtce;
 - struct page *page;
  
   for ( ; pages; --pages, ++entry) {
   oldtce = iommu_clear_tce(tbl, entry);
   if (!oldtce)
   continue;
  
 - page = pfn_to_page(oldtce  PAGE_SHIFT);
 - WARN_ON(!page);
 - if (page) {
 - if (oldtce  TCE_PCI_WRITE)
 - SetPageDirty(page);
 - put_page(page);
 - }
 + tce_iommu_unuse_page(container, (unsigned long) __va(oldtce));
   }
  
   return 0;
  }
  
 +static unsigned long tce_get_hva(struct tce_container *container,
 + unsigned page_shift, unsigned long tce)
 +{
 + long ret;
 + struct page *page = NULL;
 + unsigned long hva;
 + enum dma_data_direction direction = iommu_tce_direction(tce);
 +
 + ret = get_user_pages_fast(tce  PAGE_MASK, 1,
 + direction != DMA_TO_DEVICE, page);
 + if (unlikely(ret != 1))
 + return -1;
 +
 + hva = (unsigned long) page_address(page);
 +
 + return hva;
 +}


It's a bit crude to return -1 for an unsigned long function.  You might
want to later think about cleaning this up to return int with a proper
error code and return hva via a pointer.  We don't really need to store
'ret' here either.

 +
  static long tce_iommu_build(struct tce_container *container,
   struct iommu_table *tbl,
   unsigned long entry, unsigned long tce, unsigned long pages)
  {
   long i, ret = 0;
 - struct page *page = NULL;
 + struct page *page;
   unsigned long hva;
   enum dma_data_direction direction = iommu_tce_direction(tce);
  
   for (i = 0; i  pages; ++i) {
 - ret = get_user_pages_fast(tce  PAGE_MASK, 1,
 - direction != DMA_TO_DEVICE, page);
 - if (unlikely(ret != 1)) {
 + hva = tce_get_hva(container, tbl-it_page_shift, tce);
 + if (hva == -1) {
   ret = -EFAULT;
   break;
   }
  
 + page = pfn_to_page(__pa(hva)  PAGE_SHIFT);
   if (!tce_page_is_contained(page, tbl-it_page_shift)) {
   ret = -EPERM;
   break;
   }
  
 - hva = (unsigned long) page_address(page) +
 - (tce  IOMMU_PAGE_MASK(tbl)  

Re: [PATCH v5 03/29] vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size

2015-03-10 Thread Alexey Kardashevskiy

On 03/11/2015 10:03 AM, Alex Williamson wrote:

On Wed, 2015-03-11 at 09:57 +1100, Alexey Kardashevskiy wrote:

On 03/11/2015 06:56 AM, Alex Williamson wrote:

On Tue, 2015-03-10 at 01:06 +1100, Alexey Kardashevskiy wrote:

This checks that the TCE table page size is not bigger that the size of
a page we just pinned and going to put its physical address to the table.

Otherwise the hardware gets unwanted access to physical memory between
the end of the actual page and the end of the aligned up TCE page.

Since compound_order() and compound_head() work correctly on non-huge
pages, there is no need for additional check whether the page is huge.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* s/tce_check_page_size/tce_page_is_contained/
---
   drivers/vfio/vfio_iommu_spapr_tce.c | 22 ++
   1 file changed, 22 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 756831f..91e7599 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -49,6 +49,22 @@ struct tce_container {
bool enabled;
   };

+static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+{
+   unsigned shift;
+
+   /*
+* Check that the TCE table granularity is not bigger than the size of
+* a page we just found. Otherwise the hardware can get access to
+* a bigger memory chunk that it should.
+*/
+   shift = PAGE_SHIFT + compound_order(compound_head(page));
+   if (shift = page_shift)
+   return true;
+
+   return false;


nit, simplified:

return (PAGE_SHIFT + compound_order(compound_head(page) = page_shift);


This won't be bool though.


Yes, it will.


Ah, misread as ... - page_shift. And you missed one bracket :)





  This will (I'll do this)

shift = PAGE_SHIFT + compound_order(compound_head(page));
return (shift = page_shift);







+}
+
   static int tce_iommu_enable(struct tce_container *container)
   {
int ret = 0;
@@ -197,6 +213,12 @@ static long tce_iommu_build(struct tce_container 
*container,
ret = -EFAULT;
break;
}
+
+   if (!tce_page_is_contained(page, tbl-it_page_shift)) {
+   ret = -EPERM;
+   break;
+   }
+
hva = (unsigned long) page_address(page) +
(tce  IOMMU_PAGE_MASK(tbl)  ~PAGE_MASK);















--
Alexey
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v3] powerpc/numa: set node_possible_map to only node_online_map during boot

2015-03-10 Thread Nishanth Aravamudan
On 10.03.2015 [10:55:05 +1100], Michael Ellerman wrote:
 On Thu, 2015-03-05 at 21:27 -0800, Nishanth Aravamudan wrote:
  diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
  index 0257a7d659ef..0c1716cd271f 100644
  --- a/arch/powerpc/mm/numa.c
  +++ b/arch/powerpc/mm/numa.c
  @@ -958,6 +958,13 @@ void __init initmem_init(void)
   
  memblock_dump_all();
   
  +   /*
  +* Reduce the possible NUMA nodes to the online NUMA nodes,
  +* since we do not support node hotplug. This ensures that  we
  +* lower the maximum NUMA node ID to what is actually present.
  +*/
  +   node_possible_map = node_online_map;
 
 That looks nice, but is it generating what we want?
 
 ie. is the content of node_online_map being *copied* into node_possible_map.
 
 Or are we changing node_possible_map to point at node_online_map?

I think it ends up being the latter, which is probably fine in practice
(I think node_online_map is static on power after boot), but perhaps it
would be better to do:

nodes_and(node_possible_map, node_possible_map, node_online_map);

?

e.g.:


powerpc/numa: reset node_possible_map to only node_online_map

Raghu noticed an issue with excessive memory allocation on power with a
simple cgroup test, specifically, in mem_cgroup_css_alloc -
for_each_node - alloc_mem_cgroup_per_zone_info(), which ends up blowing
up the kmalloc-2048 slab (to the order of 200MB for 400 cgroup
directories).

The underlying issue is that NODES_SHIFT on power is 8 (256 NUMA nodes
possible), which defines node_possible_map, which in turn defines the
value of nr_node_ids in setup_nr_node_ids and the iteration of
for_each_node.

In practice, we never see a system with 256 NUMA nodes, and in fact, we
do not support node hotplug on power in the first place, so the nodes
that are online when we come up are the nodes that will be present for
the lifetime of this kernel. So let's, at least, drop the NUMA possible
map down to the online map at runtime. This is similar to what x86 does
in its initialization routines.

mem_cgroup_css_alloc should also be fixed to only iterate over
memory-populated nodes and handle hotplug, but that is a separate
change.

Signed-off-by: Nishanth Aravamudan n...@linux.vnet.ibm.com
To: Michael Ellerman m...@ellerman.id.au
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Tejun Heo t...@kernel.org
Cc: David Rientjes rient...@google.com
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Paul Mackerras pau...@samba.org
Cc: Anton Blanchard an...@samba.org
Cc: Raghavendra K T raghavendra...@linux.vnet.ibm.com

---
v1 - v2:
  Rather than clear node_possible_map and set it nid-by-nid, just
  directly assign node_online_map to it, as suggested by Michael
  Ellerman and Tejun Heo.

v2 - v3:
  Rather than direct assignment (which is just repointing the pointer),
  modify node_possible_map in-place.

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0257a7d659ef..1a118b08fad2 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -958,6 +958,13 @@ void __init initmem_init(void)
 
memblock_dump_all();
 
+   /*
+* Reduce the possible NUMA nodes to the online NUMA nodes,
+* since we do not support node hotplug. This ensures that  we
+* lower the maximum NUMA node ID to what is actually present.
+*/
+   nodes_and(node_possible_map, node_possible_map, node_online_map);
+
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 4/4] mm: numa: Slow PTE scan rate if migration failures occur

2015-03-10 Thread Linus Torvalds
On Mon, Mar 9, 2015 at 12:19 PM, Dave Chinner da...@fromorbit.com wrote:
 On Mon, Mar 09, 2015 at 09:52:18AM -0700, Linus Torvalds wrote:

 What's your virtual environment setup? Kernel config, and
 virtualization environment to actually get that odd fake NUMA thing
 happening?

 I don't have the exact .config with me (test machines at home
 are shut down because I'm half a world away), but it's pretty much
 this (copied and munged from a similar test vm on my laptop):

[ snip snip ]

Ok, I hate debugging by symptoms anyway, so I didn't do any of this,
and went back to actually *thinking* about the code instead of trying
to reproduce this and figure things out by trial and error.

And I think I figured it out. Of course, since I didn't actually test
anything, what do I know, but I feel good about it, because I think I
can explain why that patch that on the face of it shouldn't change
anything actually did.

So, the old code just did all those manual page table changes,
clearing the present bit and setting the NUMA bit instead.

The new code _ostensibly_ does the same, except it clears the present
bit and sets the PROTNONE bit instead.

However, rather than playing special games with just those two bits,
it uses the normal pte accessor functions, and in particular uses
vma-vm_page_prot to reset the protections back. Which is a nice
cleanup and really makes the code look saner, and does the same thing.

Except it really isn't the same thing at all.

Why?

The protection bits in the page tables are *not* the same as
vma-vm_page_prot. Yes, they start out that way, but they don't stay
that way. And no, I'm not talking about dirty and accessed bits.

The difference? COW. Any private mapping is marked read-only in
vma-vm_page_prot, and then the COW (or the initial write) makes it
read-write.

And so, when we did

-   pte = pte_mknonnuma(pte);
+   /* Make it present again */
+   pte = pte_modify(pte, vma-vm_page_prot);
+   pte = pte_mkyoung(pte);

that isn't equivalent at all - it makes the page read-only, because it
restores it to its original state.

Now, that isn't actually what hurts most, I suspect. Judging by the
profiles, we don't suddenly take a lot of new COW faults. No, what
hurts most is that the NUMA balancing code does this:

/*
 * Avoid grouping on DSO/COW pages in specific and RO pages
 * in general, RO pages shouldn't hurt as much anyway since
 * they can be in shared cache state.
 */
if (!pte_write(pte))
flags |= TNF_NO_GROUP;

and that !pte_write(pte) is basically now *always* true for private
mappings (which is 99% of all mappings).

In other words, I think the patch unintentionally made the NUMA code
basically always do the TNF_NO_GROUP case.

I think that a quick hack for testing might be to just replace that
!pte_write() with !pte_dirty(), and seeing how that acts.

Comments?

  Linus
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 26/29] vfio: powerpc/spapr: Define v2 IOMMU

2015-03-10 Thread Alex Williamson
On Tue, 2015-03-10 at 01:07 +1100, Alexey Kardashevskiy wrote:
 The existing IOMMU code takes/releases ownership over the existing IOMMU
 tables created by the platform code, i.e. the tables remain in memory
 all the time. Also, the existing IOMMU requires VFIO_IOMMU_ENABLE call to
 start working as that's where we check the rlimit for locked pages.
 
 New IOMMU will take over the IOMMU group completely which means the IOMMU
 tables created by the platform code are going to be disposed and VFIO
 will create its own tables. Also, with the DMA memory pre-registration
 feature, the userspace will not need to call VFIO_IOMMU_ENABLE as
 the locked pages accounting will be done by
 VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
 
 In order to inform the userspace that VFIO supports new capabilities,
 this adds a new SPAPR TCE IOMMU v2 type.

Shouldn't we block some of these new features on v1?  Otherwise v1 now
supports them too and by the property of we don't break userspace v1
and v2 are now one-in-the-same.  You'd need to support everything
through both versions.  If that's what you intend to do, maybe this
should be a capability rather than an IOMMU type.

 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
  drivers/vfio/vfio_iommu_spapr_tce.c | 3 ++-
  include/uapi/linux/vfio.h   | 2 ++
  2 files changed, 4 insertions(+), 1 deletion(-)
 
 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
 b/drivers/vfio/vfio_iommu_spapr_tce.c
 index e572c28..d665ddc 100644
 --- a/drivers/vfio/vfio_iommu_spapr_tce.c
 +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
 @@ -404,7 +404,7 @@ static void *tce_iommu_open(unsigned long arg)
  {
   struct tce_container *container;
  
 - if (arg != VFIO_SPAPR_TCE_IOMMU) {
 + if ((arg != VFIO_SPAPR_TCE_IOMMU)  (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
   pr_err(tce_vfio: Wrong IOMMU type\n);
   return ERR_PTR(-EINVAL);
   }
 @@ -588,6 +588,7 @@ static long tce_iommu_ioctl(void *iommu_data,
   case VFIO_CHECK_EXTENSION:
   switch (arg) {
   case VFIO_SPAPR_TCE_IOMMU:
 + case VFIO_SPAPR_TCE_v2_IOMMU:
   ret = 1;
   break;
   default:
 diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
 index b17e120..fbc5286 100644
 --- a/include/uapi/linux/vfio.h
 +++ b/include/uapi/linux/vfio.h
 @@ -36,6 +36,8 @@
  /* Two-stage IOMMU */
  #define VFIO_TYPE1_NESTING_IOMMU 6   /* Implies v2 */
  
 +#define VFIO_SPAPR_TCE_v2_IOMMU  7
 +
  /*
   * The IOCTL interface is designed for extensibility by embedding the
   * structure length (argsz) and flags into structures passed between



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v5 27/29] vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework ownership

2015-03-10 Thread Alex Williamson
On Tue, 2015-03-10 at 01:07 +1100, Alexey Kardashevskiy wrote:
 Before the IOMMU user would take control over the IOMMU table belonging to
 a specific IOMMU group. This approach did not allow sharing tables between
 IOMMU groups attached to the same container.
 
 This introduces a new IOMMU ownership flavour when the user can not
 just control the existing IOMMU table but remove/create tables on demand.
 If an IOMMU supports a set_ownership() callback, that lets the user have
 full control over the IOMMU group. When the ownership is taken,
 the platform code removes all the windows so the caller must create them.
 Before returning the ownership back to the platform code, the user
 has to unprogram and remove all the tables it created.

We have no ability to enforce that requirement on the user.  VFIO needs
to do the cleanup if the user fails to.

 Old-style ownership is still supported allowing VFIO to run on older
 P5IOC2 and IODA IO controllers.
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
  arch/powerpc/platforms/powernv/pci-ioda.c | 30 +++---
  drivers/vfio/vfio_iommu_spapr_tce.c   | 51 
 ---
  2 files changed, 66 insertions(+), 15 deletions(-)
 
 diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
 b/arch/powerpc/platforms/powernv/pci-ioda.c
 index 07857c4..afb6906 100644
 --- a/arch/powerpc/platforms/powernv/pci-ioda.c
 +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
 @@ -1620,11 +1620,33 @@ static void pnv_ioda2_set_ownership(struct 
 iommu_table_group *table_group,
  {
   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
   table_group);
 - if (enable)
 - iommu_take_ownership(table_group);
 - else
 - iommu_release_ownership(table_group);
 + if (enable) {
 + pnv_pci_ioda2_unset_window(pe-table_group, 0);
 + pnv_pci_free_table(pe-table_group.tables[0]);
 + } else {
 + struct iommu_table *tbl = pe-table_group.tables[0];
 + int64_t rc;
  
 + rc = pnv_pci_ioda2_create_table(pe-table_group, 0,
 + IOMMU_PAGE_SHIFT_4K,
 + pe-phb-ioda.m32_pci_base,
 + POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
 + if (rc) {
 + pe_err(pe, Failed to create 32-bit TCE table, err %ld,
 + rc);
 + return;
 + }
 +
 + iommu_init_table(tbl, pe-phb-hose-node);
 +
 + rc = pnv_pci_ioda2_set_window(pe-table_group, 0, tbl);
 + if (rc) {
 + pe_err(pe, Failed to configure 32-bit TCE table, err 
 %ld\n,
 + rc);
 + pnv_pci_free_table(tbl);
 + return;
 + }
 + }
   pnv_pci_ioda2_set_bypass(pe, !enable);
  }
  
 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
 b/drivers/vfio/vfio_iommu_spapr_tce.c
 index d665ddc..3bc0645 100644
 --- a/drivers/vfio/vfio_iommu_spapr_tce.c
 +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
 @@ -426,18 +426,11 @@ static int tce_iommu_clear(struct tce_container 
 *container,
  static void tce_iommu_release(void *iommu_data)
  {
   struct tce_container *container = iommu_data;
 - struct iommu_table *tbl;
 - struct iommu_table_group *table_group;
  
   WARN_ON(container-grp);
  
 - if (container-grp) {
 - table_group = iommu_group_get_iommudata(container-grp);
 - tbl = table_group-tables[0];
 - tce_iommu_clear(container, tbl, tbl-it_offset, tbl-it_size);
 -
 + if (container-grp)
   tce_iommu_detach_group(iommu_data, container-grp);
 - }
  
   tce_mem_unregister_all(container);
   tce_iommu_disable(container);
 @@ -826,14 +819,24 @@ static int tce_iommu_attach_group(void *iommu_data,
  
   if (!table_group-ops || !table_group-ops-set_ownership) {
   ret = iommu_take_ownership(table_group);
 + } else if (!table_group-ops-create_table ||
 + !table_group-ops-set_window) {
 + WARN_ON_ONCE(1);
 + ret = -EFAULT;
   } else {
   /*
* Disable iommu bypass, otherwise the user can DMA to all of
* our physical memory via the bypass window instead of just
* the pages that has been explicitly mapped into the iommu
*/
 + struct iommu_table tbltmp = { 0 }, *tbl = tbltmp;
 +
   table_group-ops-set_ownership(table_group, true);
 - ret = 0;
 + ret = table_group-ops-create_table(table_group, 0,
 + IOMMU_PAGE_SHIFT_4K,
 + table_group-tce32_size, 1, tbl);
 + if (!ret)
 + ret = table_group-ops-set_window(table_group, 0, tbl);
   }
  

Re: [PATCH v5 27/29] vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework ownership

2015-03-10 Thread Alexey Kardashevskiy

On 03/11/2015 11:09 AM, Alex Williamson wrote:

On Tue, 2015-03-10 at 01:07 +1100, Alexey Kardashevskiy wrote:

Before the IOMMU user would take control over the IOMMU table belonging to
a specific IOMMU group. This approach did not allow sharing tables between
IOMMU groups attached to the same container.

This introduces a new IOMMU ownership flavour when the user can not
just control the existing IOMMU table but remove/create tables on demand.
If an IOMMU supports a set_ownership() callback, that lets the user have
full control over the IOMMU group. When the ownership is taken,
the platform code removes all the windows so the caller must create them.
Before returning the ownership back to the platform code, the user
has to unprogram and remove all the tables it created.


We have no ability to enforce that requirement on the user.  VFIO needs
to do the cleanup if the user fails to.


Ah. Wrong commit log, VFIO always does cleanup (as I end my guests by c-a-x 
most of the time :) ), will fix it.






Old-style ownership is still supported allowing VFIO to run on older
P5IOC2 and IODA IO controllers.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
  arch/powerpc/platforms/powernv/pci-ioda.c | 30 +++---
  drivers/vfio/vfio_iommu_spapr_tce.c   | 51 ---
  2 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 07857c4..afb6906 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1620,11 +1620,33 @@ static void pnv_ioda2_set_ownership(struct 
iommu_table_group *table_group,
  {
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
-   if (enable)
-   iommu_take_ownership(table_group);
-   else
-   iommu_release_ownership(table_group);
+   if (enable) {
+   pnv_pci_ioda2_unset_window(pe-table_group, 0);
+   pnv_pci_free_table(pe-table_group.tables[0]);
+   } else {
+   struct iommu_table *tbl = pe-table_group.tables[0];
+   int64_t rc;

+   rc = pnv_pci_ioda2_create_table(pe-table_group, 0,
+   IOMMU_PAGE_SHIFT_4K,
+   pe-phb-ioda.m32_pci_base,
+   POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
+   if (rc) {
+   pe_err(pe, Failed to create 32-bit TCE table, err %ld,
+   rc);
+   return;
+   }
+
+   iommu_init_table(tbl, pe-phb-hose-node);
+
+   rc = pnv_pci_ioda2_set_window(pe-table_group, 0, tbl);
+   if (rc) {
+   pe_err(pe, Failed to configure 32-bit TCE table, err 
%ld\n,
+   rc);
+   pnv_pci_free_table(tbl);
+   return;
+   }
+   }
pnv_pci_ioda2_set_bypass(pe, !enable);
  }

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index d665ddc..3bc0645 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -426,18 +426,11 @@ static int tce_iommu_clear(struct tce_container 
*container,
  static void tce_iommu_release(void *iommu_data)
  {
struct tce_container *container = iommu_data;
-   struct iommu_table *tbl;
-   struct iommu_table_group *table_group;

WARN_ON(container-grp);

-   if (container-grp) {
-   table_group = iommu_group_get_iommudata(container-grp);
-   tbl = table_group-tables[0];
-   tce_iommu_clear(container, tbl, tbl-it_offset, tbl-it_size);
-
+   if (container-grp)
tce_iommu_detach_group(iommu_data, container-grp);
-   }

tce_mem_unregister_all(container);
tce_iommu_disable(container);
@@ -826,14 +819,24 @@ static int tce_iommu_attach_group(void *iommu_data,

if (!table_group-ops || !table_group-ops-set_ownership) {
ret = iommu_take_ownership(table_group);
+   } else if (!table_group-ops-create_table ||
+   !table_group-ops-set_window) {
+   WARN_ON_ONCE(1);
+   ret = -EFAULT;
} else {
/*
 * Disable iommu bypass, otherwise the user can DMA to all of
 * our physical memory via the bypass window instead of just
 * the pages that has been explicitly mapped into the iommu
 */
+   struct iommu_table tbltmp = { 0 }, *tbl = tbltmp;
+
table_group-ops-set_ownership(table_group, true);
-   ret = 0;
+   ret = table_group-ops-create_table(table_group, 0,
+   IOMMU_PAGE_SHIFT_4K,
+