Re: [PATCH v2 3/6] net/wan/fsl_ucc_hdlc: Adding ARPHRD_ETHER

2018-08-31 Thread David Miller
From: David Gounaris 
Date: Wed, 29 Aug 2018 15:13:25 +0200

> @@ -513,6 +517,8 @@ static int hdlc_rx_done(struct ucc_hdlc_private *priv, 
> int rx_work_limit)
>   break;
>  
>   case ARPHRD_PPP:
> + case ARPHRD_ETHER:
> + 

Please don't add such an extraneous empty line.

Thanks.


Re: v4.17 regression: PowerMac G3 won't boot, was Re: [PATCH v5 1/3] of: cache phandle nodes to reduce cost of of_find_node_by_phandle()

2018-08-31 Thread Benjamin Herrenschmidt
On Sat, 2018-09-01 at 09:36 +1000, Finn Thain wrote:
> > The patched kernel (Finn's vmlinux-4.18.0-1-gd44cf7e41c19) boots 
> > normally on my Beige G3 Desktop using BootX.
> > 
> 
> Ben sent two patches, so I picked the most recent one and applied it by 
> hand due to corrupted whitespace.

Yup, evolution seems to have broken copy/pasting of patches in a
preformatted email lately ... ugh.

> The patch you tested was the one below.



Re: v4.17 regression: PowerMac G3 won't boot, was Re: [PATCH v5 1/3] of: cache phandle nodes to reduce cost of of_find_node_by_phandle()

2018-08-31 Thread Benjamin Herrenschmidt
On Fri, 2018-08-31 at 06:28 -0600, Mac User wrote:
> On 8/30/18 10:49 PM, Benjamin Herrenschmidt wrote:
> 
> > On Fri, 2018-08-31 at 14:35 +1000, Benjamin Herrenschmidt wrote:
> > 
> > ...
> > Assuming you are using BootX (or miBoot), can you try this patch ?
> 
> Yes, I'm using BootX.
> 
> Thanks to Finn for applying the patch (I wouldn't have been sure
> which source tree to apply it to).
> 
> The patched kernel (Finn's vmlinux-4.18.0-1-gd44cf7e41c19)
> boots normally on my Beige G3 Desktop using BootX.
> 
> Thanks for working on this!

Well it's still a sign of something wrong with the new code, my patch
just band-aids by creating fake phandles. It would be interesting to
figure out what causes the new code to barf.

Cheers,
Ben.




Re: v4.17 regression: PowerMac G3 won't boot, was Re: [PATCH v5 1/3] of: cache phandle nodes to reduce cost of of_find_node_by_phandle()

2018-08-31 Thread Finn Thain
On Fri, 31 Aug 2018, Mac User wrote:

> On 8/30/18 10:49 PM, Benjamin Herrenschmidt wrote:
> 
> > On Fri, 2018-08-31 at 14:35 +1000, Benjamin Herrenschmidt wrote:
> >
> > ...
> > Assuming you are using BootX (or miBoot), can you try this patch ?
> 
> Yes, I'm using BootX.
> 
> Thanks to Finn for applying the patch (I wouldn't have been sure which 
> source tree to apply it to).
> 
> The patched kernel (Finn's vmlinux-4.18.0-1-gd44cf7e41c19) boots 
> normally on my Beige G3 Desktop using BootX.
> 

Ben sent two patches, so I picked the most recent one and applied it by 
hand due to corrupted whitespace.

The patch you tested was the one below.

> Thanks for working on this!
> 

Thanks for testing!

> -Stan
> 

diff --git a/arch/powerpc/platforms/powermac/bootx_init.c 
b/arch/powerpc/platforms/powermac/bootx_init.c
index 3b3b0b9b3577..7fb1c7bb5835 100644
--- a/arch/powerpc/platforms/powermac/bootx_init.c
+++ b/arch/powerpc/platforms/powermac/bootx_init.c
@@ -37,6 +37,7 @@ static unsigned long __initdata bootx_dt_strend;
 static unsigned long __initdata bootx_node_chosen;
 static boot_infos_t * __initdata bootx_info;
 static char __initdata bootx_disp_path[256];
+static int __initdata bootx_phandle;
 
 /* Is boot-info compatible ? */
 #define BOOT_INFO_IS_COMPATIBLE(bi) \
@@ -258,6 +259,8 @@ static void __init bootx_scan_dt_build_strings(unsigned 
long base,
namep = pp->name ? (char *)(base + pp->name) : NULL;
if (namep == NULL || strcmp(namep, "name") == 0)
goto next;
+   if (!strcmp(namep, "phandle") || !strcmp(namep, 
"linux,phandle"))
+   bootx_phandle = -1;
/* get/create string entry */
soff = bootx_dt_find_string(namep);
if (soff == 0)
@@ -330,6 +333,12 @@ static void __init bootx_scan_dt_build_struct(unsigned 
long base,
ppp = &pp->next;
}
 
+   /* add a phandle */
+   if (bootx_phandle > 0) {
+   bootx_dt_add_prop("phandle", &bootx_phandle, 4, mem_end);
+   bootx_phandle++;
+   }
+
if (node == bootx_node_chosen) {
bootx_add_chosen_props(base, mem_end);
if (bootx_info->dispDeviceRegEntryOffset == 0)
@@ -385,6 +394,8 @@ static unsigned long __init bootx_flatten_dt(unsigned long 
start)
bootx_dt_add_string("linux,bootx-height", &mem_end);
bootx_dt_add_string("linux,bootx-linebytes", &mem_end);
bootx_dt_add_string("linux,bootx-addr", &mem_end);
+   if (bootx_phandle > 0)
+   bootx_dt_add_string("phandle", &mem_end);
/* Wrap up strings */
hdr->off_dt_strings = bootx_dt_strbase - mem_start;
hdr->dt_strings_size = bootx_dt_strend - bootx_dt_strbase;
@@ -482,6 +493,7 @@ void __init bootx_init(unsigned long r3, unsigned long r4)
bootx_dt_strbase = bootx_dt_strend = 0;
bootx_node_chosen = 0;
bootx_disp_path[0] = 0;
+   bootx_phandle = 1;
 
if (!BOOT_INFO_IS_V2_COMPATIBLE(bi))
bi->logicalDisplayBase = bi->dispDeviceBase;

-- 


Re: [PATCH RFCv2 0/6] mm: online/offline_pages called w.o. mem_hotplug_lock

2018-08-31 Thread Oscar Salvador
On Tue, Aug 21, 2018 at 12:44:12PM +0200, David Hildenbrand wrote:
> This is the same approach as in the first RFC, but this time without
> exporting device_hotplug_lock (requested by Greg) and with some more
> details and documentation regarding locking. Tested only on x86 so far.

Hi David,

I would like to review this, but I am on vacation, so I will not be able to get
to it soon. I plan to do it once I am back.

Thanks
-- 
Oscar Salvador
SUSE L3


Re: [PATCH 3/5] drivers: clk-qoriq: Add clockgen support for lx2160a

2018-08-31 Thread Scott Wood
On Fri, 2018-08-31 at 06:12 +, Andy Tang wrote:
> Hi Scott,
> 
> Please see my reply inline.
> 
> > -Original Message-
> > From: linux-arm-kernel 
> > On Behalf Of Scott Wood
> > Sent: 2018年8月31日 1:43
> > To: Vabhav Sharma ;
> > linux-ker...@vger.kernel.org; devicet...@vger.kernel.org;
> > robh...@kernel.org; mark.rutl...@arm.com;
> > linuxppc-dev@lists.ozlabs.org; linux-arm-ker...@lists.infradead.org;
> > mturque...@baylibre.com; sb...@kernel.org; r...@rjwysocki.net;
> > viresh.ku...@linaro.org; linux-...@vger.kernel.org;
> > linux...@vger.kernel.org; linux-kernel-ow...@vger.kernel.org;
> > catalin.mari...@arm.com; will.dea...@arm.com;
> > gre...@linuxfoundation.org; a...@arndb.de;
> > kstew...@linuxfoundation.org; yamada.masah...@socionext.com
> > Cc: Yogesh Narayan Gaur ; Andy Tang
> > ; li...@armlinux.org.uk; Varun Sethi
> > ; Udit Kumar 
> > Subject: Re: [PATCH 3/5] drivers: clk-qoriq: Add clockgen support for
> > lx2160a
> > 
> > On Thu, 2018-08-30 at 12:39 -0500, Scott Wood wrote:
> > > On Thu, 2018-08-30 at 07:36 +, Vabhav Sharma wrote:
> > > > > 
> > > > > Why are you increasing NUM_CMUX beyond 8 for a chip that only has
> > > > > 8 entries in cmux_to_group?
> > > > 
> > > > Configuration is 16 cores,8 cluster with 2 cores in each cluster
> > > 
> > > So?  This is about cmuxes, not cores.  You're increasing the array
> > > without ever using the new size.
> > 
> > Oh, and you also broke p4080 which has 8 cmuxes but no -1 terminator,
> > because the array was of length 8.  Probably the array should be changed
> > to NUM_CMUX+1 so every array can be -1 terminated.
> > 
> 
> [Andy] How about we add -1 terminator to p4080 and increase NUM_CMUX to 16?

Why 16?  What does such a change have to do with this chip, which according to
the rest of the patch has 8 cmuxes?

> We don't want to increase NUM_CMUX each time new soc with more cmuxes added.

You don't want to have to make a trivial change each time you exceed a limit
that has yet to be exceeded once since NUM_CMUX was added?  This isn't ABI or
in any other way hard to change.  It's right in the same file as the chip
description you'd be adding.

And even if a chip did come along with 16 cmuxes, you'd then need to increase
the array to 17 to hold the -1 if you don't want to leave a situation like the
p4080 is in now, where a chip's cmux array could be broken by increasing
NUM_CMUX further.

-Scott



Re: [PATCH RFCv2 1/6] mm/memory_hotplug: make remove_memory() take the device_hotplug_lock

2018-08-31 Thread David Hildenbrand
On 30.08.2018 21:35, Pasha Tatashin wrote:
>> +
>> +void __ref remove_memory(int nid, u64 start, u64 size)
> 
> Remove __ref, otherwise looks good:

Indeed, will do.

Thanks for the review. Will resend in two weeks when I'm back from vacation.

Cheers!

> 
> Reviewed-by: Pavel Tatashin 
> 
>> +{
>> +lock_device_hotplug();
>> +__remove_memory(nid, start, size);
>> +unlock_device_hotplug();
>> +}
>>  EXPORT_SYMBOL_GPL(remove_memory);
>>  #endif /* CONFIG_MEMORY_HOTREMOVE */


-- 

Thanks,

David / dhildenb


Re: [PATCH 2/2] sched/topology: Expose numa_mask set/clear functions to arch

2018-08-31 Thread Srikar Dronamraju
* Peter Zijlstra  [2018-08-31 13:26:39]:

> On Fri, Aug 31, 2018 at 01:12:53PM +0200, Peter Zijlstra wrote:
> > NAK, not until you've fixed every cpu_to_node() user in the kernel to
> > deal with that mask changing.
> 
> Also, what happens if userspace reads that information; uses libnuma and
> then you go and shift the world underneath their feet?
> 
> > This is absolutely insane.
> 

The topology events are supposed to be very rare.
From whatever small experiments I have done till now, unless tasks are
bound to both cpu and memory, they seem to be coping well with topology
updates. I know things weren't optimal after a topology change but they
worked. Now after 051f3ca02e46 "Introduce NUMA identity node sched
domain", systems stall. I am only exploring ways to keep them working
as much as they were before that commit.

-- 
Thanks and Regards
Srikar Dronamraju



Re: [PATCH 2/2] sched/topology: Expose numa_mask set/clear functions to arch

2018-08-31 Thread Peter Zijlstra
On Fri, Aug 31, 2018 at 04:53:50AM -0700, Srikar Dronamraju wrote:

> The topology events are supposed to be very rare.
> From whatever small experiments I have done till now, unless tasks are
> bound to both cpu and memory, they seem to be coping well with topology
> updates.

IOW, if you're not using NUMA, it works if you change the NUMA setup.

You don't see anything wrong with that?!

Those programs would work as well if you didn't expose the NUMA stuff,
because they're not using it anyway.


Re: [PATCH 2/2] sched/topology: Expose numa_mask set/clear functions to arch

2018-08-31 Thread Peter Zijlstra
On Fri, Aug 31, 2018 at 04:53:50AM -0700, Srikar Dronamraju wrote:
> * Peter Zijlstra  [2018-08-31 13:26:39]:
> 
> > On Fri, Aug 31, 2018 at 01:12:53PM +0200, Peter Zijlstra wrote:
> > > NAK, not until you've fixed every cpu_to_node() user in the kernel to
> > > deal with that mask changing.
> > 
> > Also, what happens if userspace reads that information; uses libnuma and
> > then you go and shift the world underneath their feet?
> > 
> > > This is absolutely insane.
> > 
> 
> The topology events are supposed to be very rare.
> From whatever small experiments I have done till now, unless tasks are
> bound to both cpu and memory, they seem to be coping well with topology
> updates. I know things weren't optimal after a topology change but they
> worked. Now after 051f3ca02e46 "Introduce NUMA identity node sched
> domain", systems stall. I am only exploring ways to keep them working
> as much as they were before that commit.

I'm saying things were fundamentally buggered and this just made it show.

If you cannot guarantee cpu:node relations, you do not have NUMA, end of
story.


Re: [PATCH] sched/topology: Use Identity node only if required

2018-08-31 Thread Peter Zijlstra
On Fri, Aug 31, 2018 at 04:56:18PM +0530, Srikar Dronamraju wrote:
> This was the same in my previous posting too. Before the topology update
> happened, all the cpus would be in SMT, DIE. The topology updates can be
> disabled using a kernel parameter topology_updates=off. It's documented under
> https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html as
> 
>   topology_updates= [KNL, PPC, NUMA] Format: {off} Specify if the kernel
>   should ignore (off) topology updates sent by the hypervisor to this
>   LPAR.
> 
> and is not something new in powerpc.

Doesn't mean it isn't utterly broken.


Re: [PATCH v1] mm: relax deferred struct page requirements

2018-08-31 Thread Jiri Slaby
On 08/31/2018, 02:10 PM, Pasha Tatashin wrote:
> Thanks Jiri, I am now able to reproduce it with your new config.
> 
> I have tried yesterday to enable sparsemem and deferred_struct_init on
> x86_32, and that kernel booted fine, there must be something else in
> your config that helps to trigger this problem. I am studying it now.
> 
> [0.051245] Initializing CPU#0
> [0.051682] Initializing HighMem for node 0 (000367fe:0007ffe0)
> [0.067499] BUG: unable to handle kernel NULL pointer dereference at
> 0028
> [0.068452] *pdpt =  *pde = f000ff53f000ff53
> [0.069105] Oops:  [#1] PREEMPT SMP PTI
> [0.069595] CPU: 0 PID: 0 Comm: swapper Not tainted
> 4.19.0-rc1-pae_pt_jiri #1
> [0.070382] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS 1.11.0-20171110_100015-anatol 04/01/2014
> [0.071545] EIP: free_unref_page_prepare.part.70+0x2c/0x50
> [0.072178] Code: 19 e9 ff 89 d1 55 c1 ea 11 c1 e9 07 8b 14 d5 44 52
> fd d6 81 e1 fc 03 00 00 89 e5 56 53 89 cb be 1d 00 00 00 c1 eb 05 83 e1
> 1f <8b> 14 9a 29 ce 89 f1 d3 ea 83 e2 07 89 50 10 b8 01 00 00 00 5b 5e
> [0.074296] EAX: f4cfa000 EBX: 000a ECX: 0010 EDX: 
> [0.075005] ESI: 001d EDI: 0007ffe0 EBP: d6d41ed0 ESP: d6d41ec8
> [0.075714] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210002
> [0.076508] CR0: 80050033 CR2: 0028 CR3: 16f2 CR4: 000406b0
> [0.077242] DR0:  DR1:  DR2:  DR3: 
> [0.077934] DR6: fffe0ff0 DR7: 0400
> [0.078380] Call Trace:
> [0.078670]  free_unref_page+0x3a/0x90
> [0.079136]  __free_pages+0x25/0x30
> [0.079533]  free_highmem_page+0x1e/0x50
> [0.079978]  add_highpages_with_active_regions+0xd1/0x11f
> [0.080592]  set_highmem_pages_init+0x67/0x7d
> [0.081076]  mem_init+0x30/0x1fc

page_to_pfn(pfn_to_page(pfn)) != pfn with my .config on pfns >= 0x60000:

[0.157667] add_highpages_with_active_regions: pfn=5fffb pg=f55f9f4c
pfn(pg(pfn)=5fffb sec=2
[0.159231] add_highpages_with_active_regions: pfn=5fffc pg=f55f9f70
pfn(pg(pfn)=5fffc sec=2
[0.161020] add_highpages_with_active_regions: pfn=5fffd pg=f55f9f94
pfn(pg(pfn)=5fffd sec=2
[0.163149] add_highpages_with_active_regions: pfn=5fffe pg=f55f9fb8
pfn(pg(pfn)=5fffe sec=2
[0.165204] add_highpages_with_active_regions: pfn=5ffff pg=f55f9fdc
pfn(pg(pfn)=5ffff sec=2
[0.167216] add_highpages_with_active_regions: pfn=60000 pg=f4cfa000
pfn(pg(pfn)=c716a800 sec=3

So add_highpages_with_active_regions passes down page to
free_highmem_page and later, free_unref_page does page_to_pfn(page) and
__get_pfnblock_flags_mask operates on this modified pfn leading to crash
– __pfn_to_section(pfn)->pageblock_flags is NULL!

Note that __pfn_to_section(pfn)->pageblock_flags on the original pfn
returns a valid bitmap.

thanks,
-- 
js
suse labs


Re: [PATCH v1] mm: relax deferred struct page requirements

2018-08-31 Thread Pasha Tatashin
Thanks Jiri, I am now able to reproduce it with your new config.

I have tried yesterday to enable sparsemem and deferred_struct_init on
x86_32, and that kernel booted fine, there must be something else in
your config that helps to trigger this problem. I am studying it now.

[0.051245] Initializing CPU#0
[0.051682] Initializing HighMem for node 0 (000367fe:0007ffe0)
[0.067499] BUG: unable to handle kernel NULL pointer dereference at
0028
[0.068452] *pdpt =  *pde = f000ff53f000ff53
[0.069105] Oops:  [#1] PREEMPT SMP PTI
[0.069595] CPU: 0 PID: 0 Comm: swapper Not tainted
4.19.0-rc1-pae_pt_jiri #1
[0.070382] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.11.0-20171110_100015-anatol 04/01/2014
[0.071545] EIP: free_unref_page_prepare.part.70+0x2c/0x50
[0.072178] Code: 19 e9 ff 89 d1 55 c1 ea 11 c1 e9 07 8b 14 d5 44 52
fd d6 81 e1 fc 03 00 00 89 e5 56 53 89 cb be 1d 00 00 00 c1 eb 05 83 e1
1f <8b> 14 9a 29 ce 89 f1 d3 ea 83 e2 07 89 50 10 b8 01 00 00 00 5b 5e
[0.074296] EAX: f4cfa000 EBX: 000a ECX: 0010 EDX: 
[0.075005] ESI: 001d EDI: 0007ffe0 EBP: d6d41ed0 ESP: d6d41ec8
[0.075714] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210002
[0.076508] CR0: 80050033 CR2: 0028 CR3: 16f2 CR4: 000406b0
[0.077242] DR0:  DR1:  DR2:  DR3: 
[0.077934] DR6: fffe0ff0 DR7: 0400
[0.078380] Call Trace:
[0.078670]  free_unref_page+0x3a/0x90
[0.079136]  __free_pages+0x25/0x30
[0.079533]  free_highmem_page+0x1e/0x50
[0.079978]  add_highpages_with_active_regions+0xd1/0x11f
[0.080592]  set_highmem_pages_init+0x67/0x7d
[0.081076]  mem_init+0x30/0x1fc
[0.081434]  start_kernel+0x1cc/0x44c
[0.081874]  i386_start_kernel+0x98/0x9c
[0.082401]  startup_32_smp+0x164/0x168
[0.082873] Modules linked in:
[0.083228] CR2: 0028
[0.083606] ---[ end trace a5990d9ace2ec990 ]---
[0.084128] EIP: free_unref_page_prepare.part.70+0x2c/0x50
[0.084747] Code: 19 e9 ff 89 d1 55 c1 ea 11 c1 e9 07 8b 14 d5 44 52
fd d6 81 e1 fc 03 00 00 89 e5 56 53 89 cb be 1d 00 00 00 c1 eb 05 83 e1
1f <8b> 14 9a 29 ce 89 f1 d3 ea 83 e2 07 89 50 10 b8 01 00 00 00 5b 5e
[0.086874] EAX: f4cfa000 EBX: 000a ECX: 0010 EDX: 
[0.087581] ESI: 001d EDI: 0007ffe0 EBP: d6d41ed0 ESP: d6f27efc
[0.088287] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210002
[0.089139] CR0: 80050033 CR2: 0028 CR3: 16f2 CR4: 000406b0
[0.089850] DR0:  DR1:  DR2:  DR3: 
[0.090557] DR6: fffe0ff0 DR7: 0400
[0.090992] Kernel panic - not syncing: Attempted to kill the idle task!


Pavel

On 8/31/18 7:29 AM, Jiri Slaby wrote:
> On 08/31/2018, 01:26 PM, Jiri Slaby wrote:
>> On 08/30/2018, 05:45 PM, Pasha Tatashin wrote:
>>> Hi Jiri,
>>>
>>> I believe this bug is fixed with this change:
>>>
>>> d39f8fb4b7776dcb09ec3bf7a321547083078ee3
>>> mm: make DEFERRED_STRUCT_PAGE_INIT explicitly depend on SPARSEMEM
>>
>> Hi,
>>
>> it only shifted. Enabling only SPARSEMEM works fine, enabling also
>> DEFERRED_STRUCT_PAGE_INIT doesn't even boot – immediately reboots
>> (config attached).
> 
> Wow, earlyprintk is up at the moment of crash already:
> [0.00] Linux version 4.19.0-rc1-pae (jslaby@kunlun) (gcc version
> 4.8.5 (SUSE Linux)) #4 SMP PREEMPT Fri Aug 31 13:18:33 CEST 2018
> [0.00] x86/fpu: x87 FPU will use FXSAVE
> [0.00] BIOS-provided physical RAM map:
> [0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
> [0.00] BIOS-e820: [mem 0x0009fc00-0x0009]
> reserved
> [0.00] BIOS-e820: [mem 0x000f-0x000f]
> reserved
> [0.00] BIOS-e820: [mem 0x0010-0x7cfd] usable
> [0.00] BIOS-e820: [mem 0x7cfe-0x7cff]
> reserved
> [0.00] BIOS-e820: [mem 0xfeffc000-0xfeff]
> reserved
> [0.00] BIOS-e820: [mem 0xfffc-0x]
> reserved
> [0.00] bootconsole [earlyser0] enabled
> [0.00] NX (Execute Disable) protection: active
> [0.00] SMBIOS 2.8 present.
> [0.00] DMI: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> 1.0.0-prebuilt.qemu-project.org 04/01/2014
> [0.00] Hypervisor detected: KVM
> [0.00] kvm-clock: Using msrs 4b564d01 and 4b564d00
> [0.02] kvm-clock: cpu 0, msr 1d12c001, primary cpu clock
> [0.02] kvm-clock: using sched offset of 1597117996 cycles
> [0.001395] clocksource: kvm-clock: mask: 0x
> max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns
> [0.006245] tsc: Detected 2808.000 MHz processor
> [0.010055] last_pfn = 0x7cfe0 max_arch_pfn = 0x100
> [0.011483] x86/PAT: PAT not supported by CPU.
> [0.012580] x86/PAT: Configuration [0-7]: WB  WT  UC- UC  WB  WT  UC-
> UC
> [  

RE: [PATCH 0/7 v6] Support for fsl-mc bus and its devices in SMMU

2018-08-31 Thread Nipun Gupta
Hi Joerg/Robin,

Can you please let me know when these patches will be applied onto the tree.
Is there anything else pending from my side.

Thanks,
Nipun

> -Original Message-
> From: Nipun Gupta
> Sent: Monday, July 9, 2018 4:48 PM
> To: robin.mur...@arm.com; will.dea...@arm.com; robh...@kernel.org;
> r...@kernel.org; mark.rutl...@arm.com; catalin.mari...@arm.com;
> gre...@linuxfoundation.org; Laurentiu Tudor ;
> bhelg...@google.com; h...@lst.de
> Cc: j...@8bytes.org; m.szyprow...@samsung.com; shawn...@kernel.org;
> frowand.l...@gmail.com; io...@lists.linux-foundation.org; linux-
> ker...@vger.kernel.org; devicet...@vger.kernel.org; linux-arm-
> ker...@lists.infradead.org; linuxppc-dev@lists.ozlabs.org; linux-
> p...@vger.kernel.org; Bharat Bhushan ;
> stuyo...@gmail.com; Leo Li ; Nipun Gupta
> 
> Subject: [PATCH 0/7 v6] Support for fsl-mc bus and its devices in SMMU
> 
> This patchset defines IOMMU DT binding for fsl-mc bus and adds
> support in SMMU for fsl-mc bus.
> 
> The patch series is based on top of dma-mapping tree (for-next branch):
> http://git.infradead.org/users/hch/dma-mapping.git
> 
> These patches
>   - Define property 'iommu-map' for fsl-mc bus (patch 1)
>   - Integrates the fsl-mc bus with the SMMU using this
> IOMMU binding (patch 2,3,4)
>   - Adds the dma configuration support for fsl-mc bus (patch 5, 6)
>   - Updates the fsl-mc device node with iommu/dma related changes (patch 7)
> 
> Changes in v2:
>   - use iommu-map property for fsl-mc bus
>   - rebase over patchset https://patchwork.kernel.org/patch/10317337/
> and make corresponding changes for dma configuration of devices on
> fsl-mc bus
> 
> Changes in v3:
>   - move of_map_rid in drivers/of/address.c
> 
> Changes in v4:
>   - move of_map_rid in drivers/of/base.c
> 
> Changes in v5:
>   - break patch 5 in two separate patches (now patch 5/7 and patch 6/7)
>   - add changelog text in patch 3/7 and patch 5/7
>   - typo fix
> 
> Changes in v6:
>   - Updated fsl_mc_device_group() API to be more rational
>   - Added dma-coherent property in the LS2 smmu device node
>   - Minor fixes in the device-tree documentation
> 
> Nipun Gupta (7):
>   Documentation: fsl-mc: add iommu-map device-tree binding for fsl-mc
> bus
>   iommu/of: make of_pci_map_rid() available for other devices too
>   iommu/of: support iommu configuration for fsl-mc devices
>   iommu/arm-smmu: Add support for the fsl-mc bus
>   bus: fsl-mc: support dma configure for devices on fsl-mc bus
>   bus: fsl-mc: set coherent dma mask for devices on fsl-mc bus
>   arm64: dts: ls208xa: comply with the iommu map binding for fsl_mc
> 
>  .../devicetree/bindings/misc/fsl,qoriq-mc.txt  |  39 
>  arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi |   7 +-
>  drivers/bus/fsl-mc/fsl-mc-bus.c|  16 +++-
>  drivers/iommu/arm-smmu.c   |   7 ++
>  drivers/iommu/iommu.c  |  13 +++
>  drivers/iommu/of_iommu.c   |  25 -
>  drivers/of/base.c  | 102 
> +
>  drivers/of/irq.c   |   5 +-
>  drivers/pci/of.c   | 101 
>  include/linux/fsl/mc.h |   8 ++
>  include/linux/iommu.h  |   2 +
>  include/linux/of.h |  11 +++
>  include/linux/of_pci.h |  10 --
>  13 files changed, 224 insertions(+), 122 deletions(-)
> 
> --
> 1.9.1



Re: [PATCH v1] mm: relax deferred struct page requirements

2018-08-31 Thread Jiri Slaby
On 08/31/2018, 01:26 PM, Jiri Slaby wrote:
> On 08/30/2018, 05:45 PM, Pasha Tatashin wrote:
>> Hi Jiri,
>>
>> I believe this bug is fixed with this change:
>>
>> d39f8fb4b7776dcb09ec3bf7a321547083078ee3
>> mm: make DEFERRED_STRUCT_PAGE_INIT explicitly depend on SPARSEMEM
> 
> Hi,
> 
> it only shifted. Enabling only SPARSEMEM works fine, enabling also
> DEFERRED_STRUCT_PAGE_INIT doesn't even boot – immediately reboots
> (config attached).

Wow, earlyprintk is up at the moment of crash already:
[0.00] Linux version 4.19.0-rc1-pae (jslaby@kunlun) (gcc version
4.8.5 (SUSE Linux)) #4 SMP PREEMPT Fri Aug 31 13:18:33 CEST 2018
[0.00] x86/fpu: x87 FPU will use FXSAVE
[0.00] BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
[0.00] BIOS-e820: [mem 0x0009fc00-0x0009]
reserved
[0.00] BIOS-e820: [mem 0x000f-0x000f]
reserved
[0.00] BIOS-e820: [mem 0x0010-0x7cfd] usable
[0.00] BIOS-e820: [mem 0x7cfe-0x7cff]
reserved
[0.00] BIOS-e820: [mem 0xfeffc000-0xfeff]
reserved
[0.00] BIOS-e820: [mem 0xfffc-0x]
reserved
[0.00] bootconsole [earlyser0] enabled
[0.00] NX (Execute Disable) protection: active
[0.00] SMBIOS 2.8 present.
[0.00] DMI: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.0.0-prebuilt.qemu-project.org 04/01/2014
[0.00] Hypervisor detected: KVM
[0.00] kvm-clock: Using msrs 4b564d01 and 4b564d00
[0.02] kvm-clock: cpu 0, msr 1d12c001, primary cpu clock
[0.02] kvm-clock: using sched offset of 1597117996 cycles
[0.001395] clocksource: kvm-clock: mask: 0x
max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns
[0.006245] tsc: Detected 2808.000 MHz processor
[0.010055] last_pfn = 0x7cfe0 max_arch_pfn = 0x100
[0.011483] x86/PAT: PAT not supported by CPU.
[0.012580] x86/PAT: Configuration [0-7]: WB  WT  UC- UC  WB  WT  UC-
UC
[0.020644] found SMP MP-table at [mem 0x000f5d20-0x000f5d2f] mapped
at [(ptrval)]
[0.023528] Scanning 1 areas for low memory corruption
[0.025047] ACPI: Early table checksum verification disabled
[0.026581] ACPI: RSDP 0x000F5B40 14 (v00 BOCHS )
[0.028031] ACPI: RSDT 0x7CFE157C 30 (v01 BOCHS  BXPCRSDT
0001 BXPC 0001)
[0.029996] ACPI: FACP 0x7CFE1458 74 (v01 BOCHS  BXPCFACP
0001 BXPC 0001)
[0.032234] ACPI: DSDT 0x7CFE0040 001418 (v01 BOCHS  BXPCDSDT
0001 BXPC 0001)
[0.034662] ACPI: FACS 0x7CFE 40
[0.036126] ACPI: APIC 0x7CFE14CC 78 (v01 BOCHS  BXPCAPIC
0001 BXPC 0001)
[0.038235] ACPI: HPET 0x7CFE1544 38 (v01 BOCHS  BXPCHPET
0001 BXPC 0001)
[0.040373] No NUMA configuration found
[0.041407] Faking a node at [mem 0x-0x7cfd]
[0.043306] NODE_DATA(0) allocated [mem 0x367fc000-0x367fcfff]
[0.044958] 1127MB HIGHMEM available.
[0.045940] 871MB LOWMEM available.
[0.046978]   mapped low ram: 0 - 367fe000
[0.048200]   low ram: 0 - 367fe000
[0.050830] Zone ranges:
[0.051625]   DMA  [mem 0x1000-0x00ff]
[0.053295]   Normal   [mem 0x0100-0x367fdfff]
[0.054921]   HighMem  [mem 0x367fe000-0x7cfd]
[0.056408] Movable zone start for each node
[0.057452] Early memory node ranges
[0.058377]   node   0: [mem 0x1000-0x0009efff]
[0.059946]   node   0: [mem 0x0010-0x7cfd]
[0.061825] Reserved but unavailable: 12418 pages
[0.061828] Initmem setup node 0 [mem
0x1000-0x7cfd]
[0.074252] Using APIC driver default
[0.075615] ACPI: PM-Timer IO Port: 0x608
[0.076574] ACPI: LAPIC_NMI (acpi_id[0xff] dfl dfl lint[0x1])
[0.077995] IOAPIC[0]: apic_id 0, version 17, address 0xfec0, GSI
0-23
[0.079610] ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
[0.08] ACPI: INT_SRC_OVR (bus 0 bus_irq 5 global_irq 5 high level)
[0.082786] ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
[0.084297] ACPI: INT_SRC_OVR (bus 0 bus_irq 10 global_irq 10 high level)
[0.085933] ACPI: INT_SRC_OVR (bus 0 bus_irq 11 global_irq 11 high level)
[0.087729] Using ACPI (MADT) for SMP configuration information
[0.089119] ACPI: HPET id: 0x8086a201 base: 0xfed0
[0.090351] smpboot: Allowing 1 CPUs, 0 hotplug CPUs
[0.091561] PM: Registered nosave memory: [mem 0x-0x0fff]
[0.093361] PM: Registered nosave memory: [mem 0x0009f000-0x0009]
[0.096382] PM: Registered nosave memory: [mem 0x000a-0x000e]
[0.098130] PM: Registered nosave memory: [mem 0x000f-0x000f]
[0.099729] [mem 0x7d00-0xfeffbfff] 

Re: [PATCH 2/2] sched/topology: Expose numa_mask set/clear functions to arch

2018-08-31 Thread Peter Zijlstra
On Fri, Aug 31, 2018 at 01:12:53PM +0200, Peter Zijlstra wrote:
> NAK, not until you've fixed every cpu_to_node() user in the kernel to
> deal with that mask changing.

Also, what happens if userspace reads that information; uses libnuma and
then you go and shift the world underneath their feet?

> This is absolutely insane.



Re: [PATCH] sched/topology: Use Identity node only if required

2018-08-31 Thread Srikar Dronamraju
* Peter Zijlstra  [2018-08-31 12:41:15]:

> On Fri, Aug 31, 2018 at 03:22:48AM -0700, Srikar Dronamraju wrote:
> 
> > At boot: Before topology update.
> 
> How does that work; you do SMP bringup _before_ you know the topology !?
> 

If you look at the other mail that I sent, the system boots to its regular
state with a certain topology. The hypervisor might detect and push topology
updates after the system has been booted and initialized.  This topology
update can happen much much later after boot. We boot with a particular
topology and, at a later point in time, the topology update event occurs.


> > After topology update.
> > 
> > For CPU 0
> > domain-0: span=0-7 level=SMT
> >  groups: 0:{ span=0 }, 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 
> > }, 5:{ span=5 }, 6:{ span=6 }, 7:{ span=7 }
> >  domain-1: 
> > span=0-7,32-39,64-71,96-103,128-135,160-167,192-199,224-231,256-263,288-295 
> > level=DIE
> >   groups: 0:{ span=0-7 cap=8192 }, 32:{ span=32-39 cap=8192 }, 64:{ 
> > span=64-71 cap=8192 }, 96:{ span=96-103 cap=8192 }, 128:{ span=128-135 
> > cap=8192 }, 160:{ span=160-167 cap=8192 }, 192:{ span=192-199 cap=8192 }, 
> > 224:{ span=224-231 cap=8192 }, 256:{ span=256-263 cap=8192 }, 288:{ 
> > span=288-295 cap=8192 }
> >   domain-2: span=0-303 level=NODE
> >groups: 0:{ 
> > span=0-7,32-39,64-71,96-103,128-135,160-167,192-199,224-231,256-263,288-295 
> > cap=81920 }, 8:{ 
> > span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303
> >  cap=81920 }, 16:{ 
> > span=16-23,48-55,80-87,112-119,144-151,176-183,208-215,240-247,272-279 
> > cap=73728 }, 24:{ 
> > span=24-31,56-63,88-95,120-127,152-159,184-191,216-223,248-255,280-287 
> > cap=73728 }
> > 

> > For CPU 9
> >  domain-0: span=8-15 level=SMT
> >   groups: 9:{ span=9 }, 10:{ span=10 }, 11:{ span=11 }, 12:{ span=12 }, 
> > 13:{ span=13 }, 14:{ span=14 }, 15:{ span=15 }, 8:{ span=8 }
> >   domain-1: 
> > span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303
> >  level=DIE
> >groups: 8:{ span=8-15 cap=8192 }, 40:{ span=40-47 cap=8192 }, 72:{ 
> > span=72-79 cap=8192 }, 104:{ span=104-111 cap=8192 }, 136:{ span=136-143 
> > cap=8192 }, 168:{ span=168-175 cap=8192 }, 200:{ span=200-207 cap=8192 }, 
> > 232:{ span=232-239 cap=8192 }, 264:{ span=264-271 cap=8192 }, 296:{ 
> > span=296-303 cap=8192 }
> >domain-2: 
> > span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303
> >  level=NODE
> > groups: 8:{ 
> > span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303
> >  cap=81920 }
> > domain-3: span=0-303 level=NUMA
> >  groups: 8:{ 
> > span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303
> >  cap=81920 }, 16:{ 
> > span=16-23,48-55,80-87,112-119,144-151,176-183,208-215,240-247,272-279 
> > cap=73728 }, 24:{ 
> > span=24-31,56-63,88-95,120-127,152-159,184-191,216-223,248-255,280-287 
> > cap=73728 }
> > ERROR: groups don't span domain->span
> 
> This is all very confused... and does not include the error we saw
> earlier.

> 
> CPU 0 has: SMT, DIE, NODE
> CPU 8 has: SMT, DIE, NODE, NUMA
> 

This was the same in my previous posting too. Before the topology update
happened, all the cpus would be in SMT, DIE. The topology updates can be
disabled using a kernel parameter topology_updates=off. It's documented under
https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html as

  topology_updates= [KNL, PPC, NUMA] Format: {off} Specify if the kernel
  should ignore (off) topology updates sent by the hypervisor to this
  LPAR.

and is not something new in powerpc.

> Something is completely buggered in your topology setup.
> 



Re: [PATCH 2/2] sched/topology: Expose numa_mask set/clear functions to arch

2018-08-31 Thread Peter Zijlstra
On Fri, Aug 31, 2018 at 03:27:24AM -0700, Srikar Dronamraju wrote:
> * Peter Zijlstra  [2018-08-29 10:02:19]:


> Powerpc lpars running on Phyp have 2 modes. Dedicated and shared.
> 
> Dedicated lpars are similar to kvm guest with vcpupin.

Like I know what that means... I'm not big on virt. I suppose you're
saying it has a fixed virt to phys mapping.

> Shared  lpars are similar to kvm guest without any pinning. When running
> shared lpar mode, Phyp allows overcommitting. Now if more lpars are
> created/destroyed, Phyp will internally move / consolidate the cores. The
> objective is similar to what autonuma tries to achieve on the host but with a
> different approach (consolidating to optimal nodes to achieve the best
> possible output).  This would mean that the actual underlying cpus/node
> mapping has changed.

AFAIK Linux can _not_ handle cpu:node relations changing. And I'm pretty
sure I told you that before.

> Phyp will propogate upwards an event to the lpar.  The
> lpar / os can choose to ignore or act on the same.
> 
> We have found that acting on the event will provide upto 40% improvement
> over ignoring the event. Acting on the event would mean moving the cpu from
> one node to the other, and topology_work_fn exactly does that.

How? Last time I checked there was a ton of code that relies on
cpu_to_node() not changing during the runtime of the kernel.

Stuff like the per-cpu memory allocations are done using the boot time
cpu_to_node() map for instance. Similarly, kthread creation uses the
cpu_to_node() map at the time of creation.

A lot of stuff is not re-evaluated. If you're dynamically changing the
node map, you're in for a world of hurt.

> In the case where we didn't have the NUMA sched domain, we would build the
> independent (aka overlap) sched_groups. With NUMA  sched domain
> introduction, we try to reuse sched_groups (aka non-overlay). This results
> in the above, which I thought I tried to explain in
> https://lwn.net/ml/linux-kernel/20180810164533.gb42...@linux.vnet.ibm.com

That email was a ton of confusion; you show an error and you don't
explain how you get there.

> In the typical case above, lets take 2 node, 8 core each having SMT 8
> threads.  Initially all the 8 cores might come from node 0.  Hence
> sched_domains_numa_masks[NODE][node1] and
> sched_domains_numa_mask[NUMA][node1] is set at sched_init_numa will have
> blank cpumasks.
> 
> Let say Phyp decides to move some of the load to another node, node 1, which
> till now has 0 cpus.  Hence we will see
> 
> "BUG: arch topology borken \n the DIE domain not a subset of the NODE
> domain"   which is probably okay. This problem is even present even before
> NODE domain was created and systems still booted and ran.

No that is _NOT_ OKAY. The fact that it boots and runs just means we
cope with it, but it violates a base assumption when building domains.

> However with the introduction of NODE sched_domain,
> init_sched_groups_capacity() gets called for non-overlay sched_domains which
> gets us into even worse problems. Here we will end up in a situation where
> sgA->sgB->sgC-sgD->sgA gets converted into sgA->sgB->sgC->sgB which ends up
> creating cpu stalls.
> 
> So the request is to expose the sched_domains_numa_masks_set /
> sched_domains_numa_masks_clear to arch, so that on topology update i.e event
> from phyp, arch set the mask correctly. The scheduler seems to take care of
> everything else.

NAK, not until you've fixed every cpu_to_node() user in the kernel to
deal with that mask changing.

This is absolutely insane.


Re: [PATCH] sched/topology: Use Identity node only if required

2018-08-31 Thread Peter Zijlstra
On Fri, Aug 31, 2018 at 03:22:48AM -0700, Srikar Dronamraju wrote:

> At boot: Before topology update.

How does that work; you do SMP bringup _before_ you know the topology !?

> After topology update.
> 
> For CPU 0
> domain-0: span=0-7 level=SMT
>  groups: 0:{ span=0 }, 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 
> }, 5:{ span=5 }, 6:{ span=6 }, 7:{ span=7 }
>  domain-1: 
> span=0-7,32-39,64-71,96-103,128-135,160-167,192-199,224-231,256-263,288-295 
> level=DIE
>   groups: 0:{ span=0-7 cap=8192 }, 32:{ span=32-39 cap=8192 }, 64:{ 
> span=64-71 cap=8192 }, 96:{ span=96-103 cap=8192 }, 128:{ span=128-135 
> cap=8192 }, 160:{ span=160-167 cap=8192 }, 192:{ span=192-199 cap=8192 }, 
> 224:{ span=224-231 cap=8192 }, 256:{ span=256-263 cap=8192 }, 288:{ 
> span=288-295 cap=8192 }
>   domain-2: span=0-303 level=NODE
>groups: 0:{ 
> span=0-7,32-39,64-71,96-103,128-135,160-167,192-199,224-231,256-263,288-295 
> cap=81920 }, 8:{ 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> cap=81920 }, 16:{ 
> span=16-23,48-55,80-87,112-119,144-151,176-183,208-215,240-247,272-279 
> cap=73728 }, 24:{ 
> span=24-31,56-63,88-95,120-127,152-159,184-191,216-223,248-255,280-287 
> cap=73728 }
> 
> For CPU 1
> domain-0: span=0-7 level=SMT
>  groups: 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 
> }, 6:{ span=6 }, 7:{ span=7 }, 0:{ span=0 }
>  domain-1: 
> span=0-7,32-39,64-71,96-103,128-135,160-167,192-199,224-231,256-263,288-295 
> level=DIE
>   groups: 0:{ span=0-7 cap=8192 }, 32:{ span=32-39 cap=8192 }, 64:{ 
> span=64-71 cap=8192 }, 96:{ span=96-103 cap=8192 }, 128:{ span=128-135 
> cap=8192 }, 160:{ span=160-167 cap=8192 }, 192:{ span=192-199 cap=8192 }, 
> 224:{ span=224-231 cap=8192 }, 256:{ span=256-263 cap=8192 }, 288:{ 
> span=288-295 cap=8192 }
>   domain-2: span=0-303 level=NODE
>groups: 0:{ 
> span=0-7,32-39,64-71,96-103,128-135,160-167,192-199,224-231,256-263,288-295 
> cap=81920 }, 8:{ 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> cap=81920 }, 16:{ 
> span=16-23,48-55,80-87,112-119,144-151,176-183,208-215,240-247,272-279 
> cap=73728 }, 24:{ 
> span=24-31,56-63,88-95,120-127,152-159,184-191,216-223,248-255,280-287 
> cap=73728 }
> 
> 
> For CPU 8
>  domain-0: span=8-15 level=SMT
>   groups: 8:{ span=8 }, 9:{ span=9 }, 10:{ span=10 }, 11:{ span=11 }, 12:{ 
> span=12 }, 13:{ span=13 }, 14:{ span=14 }, 15:{ span=15 }
>   domain-1: 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> level=DIE
>groups: 8:{ span=8-15 cap=8192 }, 40:{ span=40-47 cap=8192 }, 72:{ 
> span=72-79 cap=8192 }, 104:{ span=104-111 cap=8192 }, 136:{ span=136-143 
> cap=8192 }, 168:{ span=168-175 cap=8192 }, 200:{ span=200-207 cap=8192 }, 
> 232:{ span=232-239 cap=8192 }, 264:{ span=264-271 cap=8192 }, 296:{ 
> span=296-303 cap=8192 }
>domain-2: 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> level=NODE
> groups: 8:{ 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> cap=81920 }
> domain-3: span=0-303 level=NUMA
>  groups: 8:{ 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> cap=81920 }, 16:{ 
> span=16-23,48-55,80-87,112-119,144-151,176-183,208-215,240-247,272-279 
> cap=73728 }, 24:{ 
> span=24-31,56-63,88-95,120-127,152-159,184-191,216-223,248-255,280-287 
> cap=73728 }
> ERROR: groups don't span domain->span
> 
> For CPU 9
>  domain-0: span=8-15 level=SMT
>   groups: 9:{ span=9 }, 10:{ span=10 }, 11:{ span=11 }, 12:{ span=12 }, 13:{ 
> span=13 }, 14:{ span=14 }, 15:{ span=15 }, 8:{ span=8 }
>   domain-1: 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> level=DIE
>groups: 8:{ span=8-15 cap=8192 }, 40:{ span=40-47 cap=8192 }, 72:{ 
> span=72-79 cap=8192 }, 104:{ span=104-111 cap=8192 }, 136:{ span=136-143 
> cap=8192 }, 168:{ span=168-175 cap=8192 }, 200:{ span=200-207 cap=8192 }, 
> 232:{ span=232-239 cap=8192 }, 264:{ span=264-271 cap=8192 }, 296:{ 
> span=296-303 cap=8192 }
>domain-2: 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> level=NODE
> groups: 8:{ 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> cap=81920 }
> domain-3: span=0-303 level=NUMA
>  groups: 8:{ 
> span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303 
> cap=81920 }, 16:{ 
> span=16-23,48-55,80-87,112-119,144-151,176-183,208-215,240-247,272-279 
> cap=73728 }, 24:{ 
> span=24-31,56-63,88-95,120-127,152-159,184-191,216-223,248-255,280-287 
> cap=73728 }
> ERROR: groups don't span domain->span

This is all very confused... and does not include the error we saw
earlier.

CPU 0 has: SMT, DIE, NODE
CPU 8 has: SMT, DIE, NODE, NUMA

Something is completely buggered in your topology setup.


Re: [PATCH 2/2] sched/topology: Expose numa_mask set/clear functions to arch

2018-08-31 Thread Srikar Dronamraju
* Peter Zijlstra  [2018-08-29 10:02:19]:

> On Fri, Aug 10, 2018 at 10:30:19PM +0530, Srikar Dronamraju wrote:
> > With commit 051f3ca02e46 ("sched/topology: Introduce NUMA identity node
> > sched domain") scheduler introduces an new numa level. However on shared
> > lpars like powerpc, this extra sched domain creation can lead to
> > repeated rcu stalls, sometimes even causing unresponsive systems on
> > boot. On such stalls, it was noticed that init_sched_groups_capacity()
> > (sg != sd->groups is always true).
> > 
> > INFO: rcu_sched self-detected stall on CPU
> >  1-: (240039 ticks this GP) idle=c32/1/4611686018427387906 
> > softirq=782/782 fqs=80012
> >   (t=240039 jiffies g=6272 c=6271 q=263040)
> > NMI backtrace for cpu 1
> 
> > --- interrupt: 901 at __bitmap_weight+0x70/0x100
> > LR = __bitmap_weight+0x78/0x100
> > [c0832132f9b0] [c09bb738] __func__.61127+0x0/0x20 (unreliable)
> > [c0832132fa00] [c016c178] build_sched_domains+0xf98/0x13f0
> > [c0832132fb30] [c016d73c] partition_sched_domains+0x26c/0x440
> > [c0832132fc20] [c01ee284] rebuild_sched_domains_locked+0x64/0x80
> > [c0832132fc50] [c01f11ec] rebuild_sched_domains+0x3c/0x60
> > [c0832132fc80] [c007e1c4] topology_work_fn+0x24/0x40
> > [c0832132fca0] [c0126704] process_one_work+0x1a4/0x470
> > [c0832132fd30] [c0126a68] worker_thread+0x98/0x540
> > [c0832132fdc0] [c012f078] kthread+0x168/0x1b0
> > [c0832132fe30] [c000b65c]
> > ret_from_kernel_thread+0x5c/0x80
> > 
> > Similar problem was earlier also reported at
> > https://lwn.net/ml/linux-kernel/20180512100233.GB3738@osiris/
> > 
> > Allow arch to set and clear masks corresponding to numa sched domain.
> 

> What this Changelog fails to do is explain the problem and motivate why
> this is the right solution.
> 
> As-is, this reads like, something's buggered, I changed this random thing
> and it now works.
> 
> So what is causing that domain construction error?
> 

Powerpc lpars running on Phyp have 2 modes. Dedicated and shared.

Dedicated lpars are similar to kvm guest with vcpupin.

Shared  lpars are similar to kvm guest without any pinning. When running
shared lpar mode, Phyp allows overcommitting. Now if more lpars are
created/destroyed, Phyp will internally move / consolidate the cores. The
objective is similar to what autonuma tries to achieve on the host but with a
different approach (consolidating to optimal nodes to achieve the best
possible output).  This would mean that the actual underlying cpus/node
mapping has changed. Phyp will propagate an event upwards to the lpar.  The
lpar / os can choose to ignore or act on the same.

We have found that acting on the event will provide up to 40% improvement
over ignoring the event. Acting on the event would mean moving the cpu from
one node to the other, and topology_work_fn exactly does that.

In the case where we didn't have the NUMA sched domain, we would build the
independent (aka overlap) sched_groups. With NUMA  sched domain
introduction, we try to reuse sched_groups (aka non-overlay). This results
in the above, which I thought I tried to explain in
https://lwn.net/ml/linux-kernel/20180810164533.gb42...@linux.vnet.ibm.com

In the typical case above, let's take 2 node, 8 core each having SMT 8
threads.  Initially all the 8 cores might come from node 0.  Hence
sched_domains_numa_masks[NODE][node1] and
sched_domains_numa_masks[NUMA][node1], which are set at sched_init_numa, will
have blank cpumasks.

Let's say Phyp decides to move some of the load to another node, node 1, which
till now has 0 cpus.  Hence we will see

"BUG: arch topology borken \n the DIE domain not a subset of the NODE
domain"   which is probably okay. This problem was present even before the
NODE domain was created, and systems still booted and ran.

However with the introduction of NODE sched_domain,
init_sched_groups_capacity() gets called for non-overlay sched_domains which
gets us into even worse problems. Here we will end up in a situation where
sgA->sgB->sgC->sgD->sgA gets converted into sgA->sgB->sgC->sgB which ends up
creating cpu stalls.

So the request is to expose the sched_domains_numa_masks_set /
sched_domains_numa_masks_clear to arch, so that on topology update i.e event
from phyp, arch set the mask correctly. The scheduler seems to take care of
everything else.

-- 
Thanks and Regards
Srikar Dronamraju



Re: [PATCH] sched/topology: Use Identity node only if required

2018-08-31 Thread Srikar Dronamraju
* Peter Zijlstra  [2018-08-29 10:43:48]:

> On Fri, Aug 10, 2018 at 09:45:33AM -0700, Srikar Dronamraju wrote:
> 
> > 
> > CPU302 attaching NULL sched-domain.
> > CPU303 attaching NULL sched-domain.
> > BUG: arch topology borken
> >  the DIE domain not a subset of the NODE domain
> 
> ^ CLUE!!
> 
> but nowhere did you show what it thinks the DIE mask is.
> 
> >  CPU0 attaching sched-domain(s):
> >domain-2: sdA, span=0-303 level=NODE
> > groups: sg=sgL 0:{ 
> > span=0-7,32-39,64-71,96-103,128-135,160-167,192-199,224-231,256-263,288-295 
> > cap=81920 }, sgM 8:{ 
> > span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303
> >  cap=81920 }, sdN 16:{ 
> > span=16-23,48-55,80-87,112-119,144-151,176-183,208-215,240-247,272-279 
> > cap=73728 }, sgO 24:{ 
> > span=24-31,56-63,88-95,120-127,152-159,184-191,216-223,248-255,280-287 
> > cap=73728 }
> > CPU1  attaching sched-domain(s):
> >domain-2: sdB, span=0-303 level=NODE
> > [  367.739387] groups: sg=sgL 0:{ 
> > span=0-7,32-39,64-71,96-103,128-135,160-167,192-199,224-231,256-263,288-295 
> > cap=81920 }, sgM 8:{ 
> > span=8-15,40-47,72-79,104-111,136-143,168-175,200-207,232-239,264-271,296-303
> >  cap=81920 }, sdN 16:{ 
> > span=16-23,48-55,80-87,112-119,144-151,176-183,208-215,240-247,272-279 
> > cap=73728 }, sgO 24:{ 
> > span=24-31,56-63,88-95,120-127,152-159,184-191,216-223,248-255,280-287 
> > cap=73728 }
> 
> You forgot to provide the rest of it... what's domain-[01] look like?

At boot: Before topology update.

For  CPU 0 
domain-0: span=0-7 level=SMT
 groups: 0:{ span=0 }, 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 
5:{ span=5 }, 6:{ span=6 }, 7:{ span=7 }
 domain-1: span=0-303 level=DIE
  groups: 0:{ span=0-7 cap=8192 }, 8:{ span=8-15 cap=8192 }, 16:{ span=16-23 
cap=8192 }, 24:{ span=24-31 cap=8192 }, 32:{ span=32-39 cap=8192 }, 40:{ 
span=40-47 cap=8192 }, 48:{ span=48-55 cap=8192 }, 56:{ span=56-63 cap=8192 }, 
64:{ span=64-71 cap=8192 }, 72:{ span=72-79 cap=8192 }, 80:{ span=80-87 
cap=8192 }, 88:{ span=88-95 cap=8192 }, 96:{ span=96-103 cap=8192 }, 104:{ 
span=104-111 cap=8192 }, 112:{ span=112-119 cap=8192 }, 120:{ span=120-127 
cap=8192 }, 128:{ span=128-135 cap=8192 }, 136:{ span=136-143 cap=8192 }, 144:{ 
span=144-151 cap=8192 }, 152:{ span=152-159 cap=8192 }, 160:{ span=160-167 
cap=8192 }, 168:{ span=168-175 cap=8192 }, 176:{ span=176-183 cap=8192 }, 184:{ 
span=184-191 cap=8192 }, 192:{ span=192-199 cap=8192 }, 200:{ span=200-207 
cap=8192 }, 208:{ span=208-215 cap=8192 }, 216:{ span=216-223 cap=8192 }, 224:{ 
span=224-231 cap=8192 }, 232:{ span=232-239 cap=8192 }, 240:{ span=240-247 
cap=8192 }, 248:{ span=248-255 cap=8192 }, 256:{ span=256-263 cap=8192 }, 264:{ 
span=264-271 cap=8192 }, 272:{ span=272-279 cap=8192 }, 280:{ span=280-287 
cap=8192 }, 288:{ span=288-295 cap=8192 }, 296:{ span=296-303 cap=8192 }

For  CPU 1 
domain-0: span=0-7 level=SMT
 groups: 1:{ span=1 }, 2:{ span=2 }, 3:{ span=3 }, 4:{ span=4 }, 5:{ span=5 }, 
6:{ span=6 }, 7:{ span=7 }, 0:{ span=0 }
 domain-1: span=0-303 level=DIE
  groups: 0:{ span=0-7 cap=8192 }, 8:{ span=8-15 cap=8192 }, 16:{ span=16-23 
cap=8192 }, 24:{ span=24-31 cap=8192 }, 32:{ span=32-39 cap=8192 }, 40:{ 
span=40-47 cap=8192 }, 48:{ span=48-55 cap=8192 }, 56:{ span=56-63 cap=8192 }, 
64:{ span=64-71 cap=8192 }, 72:{ span=72-79 cap=8192 }, 80:{ span=80-87 
cap=8192 }, 88:{ span=88-95 cap=8192 }, 96:{ span=96-103 cap=8192 }, 104:{ 
span=104-111 cap=8192 }, 112:{ span=112-119 cap=8192 }, 120:{ span=120-127 
cap=8192 }, 128:{ span=128-135 cap=8192 }, 136:{ span=136-143 cap=8192 }, 144:{ 
span=144-151 cap=8192 }, 152:{ span=152-159 cap=8192 }, 160:{ span=160-167 
cap=8192 }, 168:{ span=168-175 cap=8192 }, 176:{ span=176-183 cap=8192 }, 184:{ 
span=184-191 cap=8192 }, 192:{ span=192-199 cap=8192 }, 200:{ span=200-207 
cap=8192 }, 208:{ span=208-215 cap=8192}, 216:{ span=216-223 cap=8192 }, 224:{ 
span=224-231 cap=8192 }, 232:{ span=232-239 cap=8192 }, 240:{ span=240-247 
cap=8192 }, 248:{ span=248-255 cap=8192 }, 256:{ span=256-263 cap=8192 }, 264:{ 
span=264-271 cap=8192 }, 272:{ span=272-279 cap=8192 }, 280:{ span=280-287 
cap=8192 }, 288:{ span=288-295 cap=8192 }, 296:{ span=296-303 cap=8192 }


For  CPU 8
domain-0: span=8-15 level=SMT
 groups: 8:{ span=8 }, 9:{ span=9 }, 10:{ span=10 }, 11:{ span=11 }, 12:{ 
span=12 }, 13:{ span=13 }, 14:{ span=14 }, 15:{ span=15 }
 domain-1: span=0-303 level=DIE
  groups: 8:{ span=8-15 cap=8192 }, 16:{ span=16-23 cap=8192 }, 24:{ span=24-31 
cap=8192 }, 32:{ span=32-39 cap=8192 }, 40:{ span=40-47 cap=8192 }, 48:{ 
span=48-55 cap=8192 }, 56:{ span=56-63 cap=8192 }, 64:{ span=64-71 cap=8192 }, 
72:{ span=72-79 cap=8192 }, 80:{ span=80-87 cap=8192 }, 88:{ span=88-95 
cap=8192 }, 96:{ span=96-103 cap=8192 }, 104:{ span=104-111 cap=8192 }, 112:{ 
span=112-119 cap=8192 }, 120:{ span=120-127 cap=8192 }, 128:{ span=128-135 
cap=8192 }, 136:{ span=136-143 cap=8192 }, 144:{ span=144-151 cap=8192 }, 152:{ 

[PATCH 2/2] powerpc/kexec: avoid hard coding when automatically allocating mem for crashkernel

2018-08-31 Thread Pingfan Liu
If no start address is specified for crashkernel, the current code hard-codes
it as: crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
This limits the candidate memory region, and may cause a failure even when
there is enough memory for the crashkernel. This patch instead finds a suitable
memory chunk with memblock_find_in_range().

Signed-off-by: Pingfan Liu 
Cc: Benjamin Herrenschmidt 
Cc: Michael Ellerman 
Cc: Hari Bathini 
Cc: Mahesh Salgaonkar 
Cc: Anton Blanchard 
---
 arch/powerpc/kernel/machine_kexec.c | 24 +++-
 arch/powerpc/kernel/prom.c  |  7 +--
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/machine_kexec.c 
b/arch/powerpc/kernel/machine_kexec.c
index 63f5a93..78005bf 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -22,6 +22,9 @@
 #include 
 #include 
 #include 
+#include 
+
+#include "setup.h"
 
 void machine_kexec_mask_interrupts(void) {
unsigned int i;
@@ -117,6 +120,7 @@ void machine_kexec(struct kimage *image)
 void __init reserve_crashkernel(void)
 {
unsigned long long crash_size, crash_base;
+   phys_addr_t start, up_boundary;
int ret;
 
/* use common parsing */
@@ -146,22 +150,24 @@ void __init reserve_crashkernel(void)
 #else
if (!crashk_res.start) {
 #ifdef CONFIG_PPC64
-   /*
-* On 64bit we split the RMO in half but cap it at half of
-* a small SLB (128MB) since the crash kernel needs to place
-* itself and some stacks to be in the first segment.
-*/
-   crashk_res.start = min(0x800ULL, (ppc64_rma_size / 2));
+   up_boundary = min(ppc64_bolted_size(), ppc64_rma_size);
+   start = memblock_find_in_range(KDUMP_KERNELBASE, up_boundary,
+   crash_size, PAGE_SIZE);
+   if (start == 0) {
+   pr_err("Failed to reserve memory for crashkernel!\n");
+   crashk_res.start = crashk_res.end = 0;
+   return;
+   } else
+   crashk_res.start = start;
 #else
crashk_res.start = KDUMP_KERNELBASE;
 #endif
}
 
-   crash_base = PAGE_ALIGN(crashk_res.start);
-   if (crash_base != crashk_res.start) {
+   if (crashk_res.start != PAGE_ALIGN(crashk_res.start)) {
printk("Crash kernel base must be aligned to 0x%lx\n",
PAGE_SIZE);
-   crashk_res.start = crash_base;
+   crashk_res.start = PAGE_ALIGN(crashk_res.start);
}
 
 #endif
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index cae4a78..8b2ab99 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -688,6 +688,7 @@ static void tm_init(void) { }
 void __init early_init_devtree(void *params)
 {
phys_addr_t limit;
+   bool fadump_enabled = false;
 
DBG(" -> early_init_devtree(%p)\n", params);
 
@@ -737,9 +738,9 @@ void __init early_init_devtree(void *params)
 * If we fail to reserve memory for firmware-assisted dump then
 * fallback to kexec based kdump.
 */
-   if (fadump_reserve_mem() == 0)
+   if (fadump_reserve_mem() == 1)
+   fadump_enabled = true;
 #endif
-   reserve_crashkernel();
early_reserve_mem();
 
/* Ensure that total memory size is page-aligned. */
@@ -761,6 +762,8 @@ void __init early_init_devtree(void *params)
 
dt_cpu_ftrs_scan();
mmu_early_init_devtree();
+   if (!fadump_enabled)
+   reserve_crashkernel();
 
/* Retrieve CPU related informations from the flat tree
 * (altivec support, boot CPU ID, ...)
-- 
2.7.4



[PATCH 1/2] powerpc/prom: move mmu_early_init_devtree() before early_init_dt_scan_cpus()

2018-08-31 Thread Pingfan Liu
early_init_dt_scan_cpus() -> allocate_paca() uses ppc64_bolted_size() to get
the allocation limit. Although MMU_SEGSIZE_256M is enough for the boot cpu's
paca, the bolted segment size may in fact be MMU_SEGSIZE_1T. Hence move
mmu_early_init_devtree() a little earlier, so that any caller of
ppc64_bolted_size() gets the right value.

Signed-off-by: Pingfan Liu 
Cc: Benjamin Herrenschmidt 
Cc: Michael Ellerman 
Cc: Hari Bathini 
Cc: Mahesh Salgaonkar 
Cc: Anton Blanchard 
---
 arch/powerpc/kernel/prom.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index c4d7078..cae4a78 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -760,6 +760,7 @@ void __init early_init_devtree(void *params)
DBG("Scanning CPUs ...\n");
 
dt_cpu_ftrs_scan();
+   mmu_early_init_devtree();
 
/* Retrieve CPU related informations from the flat tree
 * (altivec support, boot CPU ID, ...)
@@ -777,8 +778,6 @@ void __init early_init_devtree(void *params)
spinning_secondaries = boot_cpu_count - 1;
 #endif
 
-   mmu_early_init_devtree();
-
 #ifdef CONFIG_PPC_POWERNV
/* Scan and build the list of machine check recoverable ranges */
of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);
-- 
2.7.4



[PATCH 0/2] powerpc/kexec: automatically allocating mem for crashkernel=Y

2018-08-31 Thread Pingfan Liu
If no start address is specified for crashkernel, the current code hard-codes
it as: crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
This limits the candidate memory region, and may cause a failure even when
there is enough memory for the crashkernel. This series instead finds a
suitable memory chunk with memblock_find_in_range().

Cc: Benjamin Herrenschmidt 
Cc: Michael Ellerman 
Cc: Hari Bathini 
Cc: Mahesh Salgaonkar 
Cc: Anton Blanchard 

Pingfan Liu (2):
  powerpc/prom: move mmu_early_init_devtree() before
early_init_dt_scan_cpus()
  powerpc/kexec: avoid hard coding when automatically allocating mem for
crashkernel

 arch/powerpc/kernel/machine_kexec.c | 24 +++-
 arch/powerpc/kernel/prom.c  | 10 ++
 2 files changed, 21 insertions(+), 13 deletions(-)

-- 
2.7.4



RE: [PATCH 3/5] drivers: clk-qoriq: Add clockgen support for lx2160a

2018-08-31 Thread Andy Tang
Hi Scott,

Please see my reply inline.

> -Original Message-
> From: linux-arm-kernel 
> On Behalf Of Scott Wood
> Sent: 2018年8月31日 1:43
> To: Vabhav Sharma ;
> linux-ker...@vger.kernel.org; devicet...@vger.kernel.org;
> robh...@kernel.org; mark.rutl...@arm.com;
> linuxppc-dev@lists.ozlabs.org; linux-arm-ker...@lists.infradead.org;
> mturque...@baylibre.com; sb...@kernel.org; r...@rjwysocki.net;
> viresh.ku...@linaro.org; linux-...@vger.kernel.org;
> linux...@vger.kernel.org; linux-kernel-ow...@vger.kernel.org;
> catalin.mari...@arm.com; will.dea...@arm.com;
> gre...@linuxfoundation.org; a...@arndb.de;
> kstew...@linuxfoundation.org; yamada.masah...@socionext.com
> Cc: Yogesh Narayan Gaur ; Andy Tang
> ; li...@armlinux.org.uk; Varun Sethi
> ; Udit Kumar 
> Subject: Re: [PATCH 3/5] drivers: clk-qoriq: Add clockgen support for
> lx2160a
> 
> On Thu, 2018-08-30 at 12:39 -0500, Scott Wood wrote:
> > On Thu, 2018-08-30 at 07:36 +, Vabhav Sharma wrote:
> > > > -Original Message-
> > > > From: linux-kernel-ow...@vger.kernel.org  > > > ow...@vger.kernel.org> On Behalf Of Scott Wood
> > > > Sent: Wednesday, August 29, 2018 5:49 AM
> > > > To: Vabhav Sharma ; linux-
> > > > ker...@vger.kernel.org; devicet...@vger.kernel.org;
> > > > robh...@kernel.org; mark.rutl...@arm.com;
> > > > linuxppc-dev@lists.ozlabs.org; linux-arm-
> > > > ker...@lists.infradead.org; mturque...@baylibre.com;
> > > > sb...@kernel.org; r...@rjwysocki.net; viresh.ku...@linaro.org;
> > > > linux-...@vger.kernel.org; linux...@vger.kernel.org;
> > > > linux-kernel-ow...@vger.kernel.org;
> > > > catalin.mari...@arm.com; will.dea...@arm.com;
> > > > gre...@linuxfoundation.org; a...@arndb.de;
> > > > kstew...@linuxfoundation.org; yamada.masah...@socionext.com
> > > > Cc: Yogesh Narayan Gaur ; Andy
> Tang
> > > > ; Udit Kumar ;
> > > > li...@armlinux.org.uk; Varun Sethi 
> > > > Subject: Re: [PATCH 3/5] drivers: clk-qoriq: Add clockgen support
> > > > for lx2160a
> > > >
> > > > On Mon, 2018-08-20 at 12:17 +0530, Vabhav Sharma wrote:
> > > > > From: Yogesh Gaur 
> > > > >
> > > > > Add clockgen support for lx2160a.
> > > > > Added entry for compat 'fsl,lx2160a-clockgen'.
> > > > > As LX2160A is 16 core, so modified value for NUM_CMUX
> > > > >
> > > > > Signed-off-by: Tang Yuantian 
> > > > > Signed-off-by: Yogesh Gaur 
> > > > > Signed-off-by: Vabhav Sharma 
> > > > > ---
> > > > >  drivers/clk/clk-qoriq.c | 14 +-
> > > > >  drivers/cpufreq/qoriq-cpufreq.c |  1 +
> > > > >  2 files changed, 14 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/drivers/clk/clk-qoriq.c b/drivers/clk/clk-qoriq.c
> > > > > index
> > > > > 3a1812f..fc6e308 100644
> > > > > --- a/drivers/clk/clk-qoriq.c
> > > > > +++ b/drivers/clk/clk-qoriq.c
> > > > > @@ -60,7 +60,7 @@ struct clockgen_muxinfo {  };
> > > > >
> > > > >  #define NUM_HWACCEL  5
> > > > > -#define NUM_CMUX 8
> > > > > +#define NUM_CMUX 16
> > > > >
> > > > >  struct clockgen;
> > > > >
> > > > > @@ -570,6 +570,17 @@ static const struct clockgen_chipinfo
> > > > > chipinfo[] = {
> > > > >   .flags = CG_VER3 | CG_LITTLE_ENDIAN,
> > > > >   },
> > > > >   {
> > > > > + .compat = "fsl,lx2160a-clockgen",
> > > > > + .cmux_groups = {
> > > > > + _cmux_cga12, _cmux_cgb
> > > > > + },
> > > > > + .cmux_to_group = {
> > > > > + 0, 0, 0, 0, 1, 1, 1, 1, -1
> > > > > + },
> > > > > + .pll_mask = 0x37,
> > > > > + .flags = CG_VER3 | CG_LITTLE_ENDIAN,
> > > > > + },
> > > >
> > > > Why are you increasing NUM_CMUX beyond 8 for a chip that only
> has
> > > > 8 entries in cmux_to_group?
> > >
> > > Configuration is 16 cores,8 cluster with 2 cores in each cluster
> >
> > So?  This is about cmuxes, not cores.  You're increasing the array
> > without ever using the new size.
> 
> Oh, and you also broke p4080 which has 8 cmuxes but no -1 terminator,
> because the array was of length 8.  Probably the array should be changed
> to NUM_CMUX+1 so every array can be -1 terminated.
> 
[Andy] How about we add a -1 terminator to p4080 and increase NUM_CMUX to 16?
We don't want to increase NUM_CMUX each time a new SoC with more cmuxes is added.

BR,
Andy Tang

> -Scott
> 
> 
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Flist
> s.infradead.org%2Fmailman%2Flistinfo%2Flinux-arm-kerneldata=02
> %7C01%7Candy.tang%40nxp.com%7C963d0cdf49964539ea4408d60ea06
> 63b%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C0%7C63671247
> 9414561101sdata=rZY%2F5JP0TBZRLr%2Bp6qaG4oSQd8fsNviz92AY
> 3IoLTOw%3Dreserved=0


[PATCH kernel] KVM: PPC: Avoid mapping compound pages to TCEs in real mode

2018-08-31 Thread Alexey Kardashevskiy
At the moment the real mode handler of H_PUT_TCE calls iommu_tce_xchg_rm()
which in turn reads the old TCE and if it was a valid entry - marks
the physical page dirty if it was mapped for writing. Since this is
real mode, realmode_pfn_to_page() is used instead of pfn_to_page()
to get the page struct. However, SetPageDirty() itself looks up the compound
page head, which yields a virtual address for the head page struct, and
setting the dirty bit through that address in real mode kills the system.

This moves dirty bit setting before updating the hardware table to make
sure compound pages are never mapped in the real mode so when H_PUT_TCE
or H_STUFF_TCE try clearing a TCE, they won't get a compound page to mark
dirty.

This changes kvmppc_rm_tce_validate() to check if the preregistered
memory is backed with pages bigger than hardware IOMMU pages; if this is
the case, we forward the request to the virtual mode handlers where it
can be safely processed.

The second check makes the first check rather unnecessary but since
the actual crash happened at the SetPageDirty() call site, this marks
the spot with WARN_ON_ONCE.

In order to keep virtual and real mode handlers in sync, this adjusts
iommu_tce_xchg() as well.

Signed-off-by: Alexey Kardashevskiy 
---

This is made on top of:
[PATCH kernel 0/4] KVM: PPC: Some error handling rework
  KVM: PPC: Validate all tces before updating tables
  KVM: PPC: Inform the userspace about TCE update failures
  KVM: PPC: Validate TCEs against preregistered memory page sizes
  KVM: PPC: Propagate errors to the guest when failed instead of
ignoring
---
 arch/powerpc/include/asm/mmu_context.h |  3 ++-
 arch/powerpc/kernel/iommu.c| 19 ---
 arch/powerpc/kvm/book3s_64_vio_hv.c| 15 +++
 arch/powerpc/mm/mmu_context_iommu.c|  4 +++-
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index b2f89b6..073d72f9b 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -31,7 +31,8 @@ extern void mm_iommu_cleanup(struct mm_struct *mm);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
-   struct mm_struct *mm, unsigned long ua, unsigned long size);
+   struct mm_struct *mm, unsigned long ua, unsigned long size,
+   unsigned int *pshift);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index af7a20d..62e014d 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -998,12 +998,11 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned 
long entry,
 {
long ret;
 
-   ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
-
-   if (!ret && ((*direction == DMA_FROM_DEVICE) ||
-   (*direction == DMA_BIDIRECTIONAL)))
+   if (*direction == DMA_FROM_DEVICE || *direction == DMA_BIDIRECTIONAL)
SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
 
+   ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
+
/* if (unlikely(ret))
pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx 
ret=%d\n",
__func__, hwaddr, entry << tbl->it_page_shift,
@@ -1019,20 +1018,18 @@ long iommu_tce_xchg_rm(struct iommu_table *tbl, 
unsigned long entry,
 {
long ret;
 
-   ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
-
-   if (!ret && ((*direction == DMA_FROM_DEVICE) ||
-   (*direction == DMA_BIDIRECTIONAL))) {
+   if (*direction == DMA_FROM_DEVICE || *direction == DMA_BIDIRECTIONAL) {
struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
 
if (likely(pg)) {
+   if (WARN_ON_ONCE(PageCompound(pg)))
+   return -EPERM;
SetPageDirty(pg);
-   } else {
-   tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
-   ret = -EFAULT;
}
}
 
+   ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+
return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 8d82133..ce4d2f4 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -118,11 +118,17 @@ static long kvmppc_rm_tce_validate(struct 
kvmppc_spapr_tce_table *stt,
unsigned long hpa = 0;
struct mm_iommu_table_group_mem_t *mem;
long shift =