Re: [PATCH] powerpc/64s: fix hash page fault interrupt handler

2021-06-28 Thread Sachin Sant



> On 29-Jun-2021, at 7:20 AM, Nicholas Piggin  wrote:
> 
> The early bad fault or key fault test in do_hash_fault() ends up calling
> into ___do_page_fault without having gone through an interrupt handler
> wrapper (except the initial _RAW one). This can end up calling local irq
> functions while the interrupt has not been reconciled, which will likely
> cause crashes and it trips up on a later patch that adds more assertions.
> 
> pkey_exec_prot from selftests causes this path to be executed.
> 
> There is no real reason the in_nmi() test should be performed
> before the key fault check. In fact if a perf interrupt in the hash
> fault code did a stack walk that was made to take a key fault somehow
> then running ___do_page_fault could possibly cause another hash fault
> causing problems. Move the in_nmi() test first, and then do everything
> else inside the regular interrupt handler function.
> 
> Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers")
> Reported-by: Sachin Sant 
> Signed-off-by: Nicholas Piggin 
> ---

Thanks Nick. 
Fixes the reported problem.

Tested-by: Sachin Sant 

-Sachin


Re: [PATCH] Documentation: PCI: pci-error-recovery: rearrange the general sequence

2021-06-28 Thread Wesley Sheng
On Fri, Jun 18, 2021 at 05:21:32PM +1000, Oliver O'Halloran wrote:
> On Fri, Jun 18, 2021 at 4:05 PM Wesley Sheng  wrote:
> >
> > The reset_link() callback function is actually called before mmio_enabled()
> > in the pcie_do_recovery() function, so rearrange the general
> > sequence between step 2 and step 3 accordingly.
> 
> I don't think this is true in all cases. If pcie_do_recovery() is
> called with state==pci_channel_io_normal (i.e. non-fatal AER) the link
> won't be reset. EEH (ppc PCI error recovery thing) also uses
> .mmio_enabled() as described.

Yes, in case of non-fatal AER, the reset_link() callback (aer_root_reset() for
AER and dpc_reset_link() for DPC) will not be invoked. And if
.error_detected() returns PCI_ERS_RESULT_CAN_RECOVER, .mmio_enabled() will be
called afterwards.

But if pcie_do_recovery() is called with state == pci_channel_io_frozen, the
reset_link() callback is called after .error_detected() but before
.mmio_enabled(). So I thought the order of Step 2 (MMIO Enabled) and
Step 3 (Link Reset) should be swapped.
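
To make the ordering being discussed concrete, here is a small, self-contained
sketch of the call order (illustrative only: this is not the code in
drivers/pci/pcie/err.c, result merging, AER/DPC specifics and error handling
are omitted, and every function name below is invented for the sketch):

#include <stdio.h>

enum channel_state { IO_NORMAL, IO_FROZEN };  /* stand-ins for pci_channel_io_* */

static void error_detected(void) { printf("Step 1: .error_detected()\n"); }
static void link_reset(void)     { printf("Link Reset: reset_link() callback\n"); }
static void mmio_enabled(void)   { printf("MMIO Enabled: .mmio_enabled()\n"); }
static void slot_reset(void)     { printf(".slot_reset()\n"); }
static void dev_resume(void)     { printf(".resume()\n"); }

/* Simplified call order for the two cases discussed above. */
static void recovery_order(enum channel_state state)
{
	error_detected();                /* always reported first */
	if (state == IO_FROZEN)
		link_reset();            /* fatal: the link is reset here */
	else
		mmio_enabled();          /* non-fatal: only when drivers return CAN_RECOVER */
	slot_reset();
	dev_resume();
}

int main(void)
{
	recovery_order(IO_FROZEN);       /* reset_link() runs right after .error_detected() */
	recovery_order(IO_NORMAL);       /* no link reset for the non-fatal case */
	return 0;
}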


[PATCH] powerpc/64s: fix hash page fault interrupt handler

2021-06-28 Thread Nicholas Piggin
The early bad fault or key fault test in do_hash_fault() ends up calling
into ___do_page_fault without having gone through an interrupt handler
wrapper (except the initial _RAW one). This can end up calling local irq
functions while the interrupt has not been reconciled, which will likely
cause crashes and it trips up on a later patch that adds more assertions.

pkey_exec_prot from selftests causes this path to be executed.

There is no real reason the in_nmi() test should be performed
before the key fault check. In fact if a perf interrupt in the hash
fault code did a stack walk that was made to take a key fault somehow
then running ___do_page_fault could possibly cause another hash fault
causing problems. Move the in_nmi() test first, and then do everything
else inside the regular interrupt handler function.

Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers")
Reported-by: Sachin Sant 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 24 +++-
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 96d9aa164007..ac5720371c0d 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1522,8 +1522,8 @@ int hash_page(unsigned long ea, unsigned long access, 
unsigned long trap,
 }
 EXPORT_SYMBOL_GPL(hash_page);
 
-DECLARE_INTERRUPT_HANDLER_RET(__do_hash_fault);
-DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
+DECLARE_INTERRUPT_HANDLER(__do_hash_fault);
+DEFINE_INTERRUPT_HANDLER(__do_hash_fault)
 {
unsigned long ea = regs->dar;
unsigned long dsisr = regs->dsisr;
@@ -1533,6 +1533,11 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
unsigned int region_id;
long err;
 
+   if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
+   hash__do_page_fault(regs);
+   return;
+   }
+
region_id = get_region_id(ea);
if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
mm = &init_mm;
@@ -1571,9 +1576,10 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
bad_page_fault(regs, SIGBUS);
}
err = 0;
-   }
 
-   return err;
+   } else if (err) {
+   hash__do_page_fault(regs);
+   }
 }
 
 /*
@@ -1582,13 +1588,6 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
  */
 DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault)
 {
-   unsigned long dsisr = regs->dsisr;
-
-   if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
-   hash__do_page_fault(regs);
-   return 0;
-   }
-
/*
 * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
 * don't call hash_page, just fail the fault. This is required to
@@ -1607,8 +1606,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault)
return 0;
}
 
-   if (__do_hash_fault(regs))
-   hash__do_page_fault(regs);
+   __do_hash_fault(regs);
 
return 0;
 }
-- 
2.23.0



Re: [PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-06-28 Thread kernel test robot
Hi "Aneesh,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on v5.13 next-20210628]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Aneesh-Kumar-K-V/Add-support-for-FORM2-associativity/20210628-231546
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-randconfig-r024-20210628 (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/fcbc8b19e99b1cf44fde904817f19616c6baecdb
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Aneesh-Kumar-K-V/Add-support-for-FORM2-associativity/20210628-231546
git checkout fcbc8b19e99b1cf44fde904817f19616c6baecdb
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   arch/powerpc/mm/numa.c:298:6: warning: no previous prototype for 
'update_numa_distance' [-Wmissing-prototypes]
 298 | void update_numa_distance(struct device_node *node)
 |  ^~~~
   arch/powerpc/mm/numa.c: In function 'parse_numa_properties':
>> arch/powerpc/mm/numa.c:809:7: error: implicit declaration of function 
>> '__vphn_get_associativity'; did you mean 'of_get_associativity'? 
>> [-Werror=implicit-function-declaration]
 809 |   if (__vphn_get_associativity(i, vphn_assoc) == 0) {
 |   ^~~~
 |   of_get_associativity
   cc1: some warnings being treated as errors


vim +809 arch/powerpc/mm/numa.c

   771  
   772  static int __init parse_numa_properties(void)
   773  {
   774  struct device_node *memory;
   775  int default_nid = 0;
   776  unsigned long i;
   777  const __be32 *associativity;
   778  
   779  if (numa_enabled == 0) {
   780  printk(KERN_WARNING "NUMA disabled by user\n");
   781  return -1;
   782  }
   783  
   784  primary_domain_index = find_primary_domain_index();
   785  
   786  if (primary_domain_index < 0) {
   787  /*
   788   * if we fail to parse primary_domain_index from device 
tree
   789   * mark the numa disabled, boot with numa disabled.
   790   */
   791  numa_enabled = false;
   792  return primary_domain_index;
   793  }
   794  
   795  dbg("NUMA associativity depth for CPU/Memory: %d\n", 
primary_domain_index);
   796  
   797  /*
   798   * Even though we connect cpus to numa domains later in SMP
   799   * init, we need to know the node ids now. This is because
   800   * each node to be onlined must have NODE_DATA etc backing it.
   801   */
   802  for_each_present_cpu(i) {
   803  __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
   804  struct device_node *cpu;
   805  int nid = NUMA_NO_NODE;
   806  
   807  memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * 
sizeof(__be32));
   808  
 > 809  if (__vphn_get_associativity(i, vphn_assoc) == 0) {
   810  nid = associativity_to_nid(vphn_assoc);
   811  __initialize_form1_numa_distance(vphn_assoc);
   812  } else {
   813  
   814  /*
   815   * Don't fall back to default_nid yet -- we 
will plug
   816   * cpus into nodes once the memory scan has 
discovered
   817   * the topology.
   818   */
   819  cpu = of_get_cpu_node(i, NULL);
   820  BUG_ON(!cpu);
   821  
   822  associativity = of_get_associativity(cpu);
   823  if (associativity) {
   824  nid = 
associativity_to_nid(associativity);
   825  
__initialize_form1_numa_distance(associativity);
   826  }
   827  of_node_put(cpu);
   828  }
   829  
   830  node_set_online(nid);
   831  }
   832  
   833  get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
   834  
   835   

Re: [PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-06-28 Thread kernel test robot
Hi "Aneesh,

I love your patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on v5.13 next-20210628]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Aneesh-Kumar-K-V/Add-support-for-FORM2-associativity/20210628-231546
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/fcbc8b19e99b1cf44fde904817f19616c6baecdb
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Aneesh-Kumar-K-V/Add-support-for-FORM2-associativity/20210628-231546
git checkout fcbc8b19e99b1cf44fde904817f19616c6baecdb
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

>> arch/powerpc/mm/numa.c:298:6: warning: no previous prototype for 
>> 'update_numa_distance' [-Wmissing-prototypes]
 298 | void update_numa_distance(struct device_node *node)
 |  ^~~~


vim +/update_numa_distance +298 arch/powerpc/mm/numa.c

   294  
   295  /*
   296   * Used to update distance information w.r.t newly added node.
   297   */
 > 298  void update_numa_distance(struct device_node *node)
   299  {
   300  if (affinity_form == FORM0_AFFINITY)
   301  return;
   302  else if (affinity_form == FORM1_AFFINITY) {
   303  initialize_form1_numa_distance(node);
   304  return;
   305  }
   306  }
   307  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Re: [PATCH 5/8] powerpc/64: enable MSR[EE] in irq replay pt_regs

2021-06-28 Thread Nicholas Piggin
Excerpts from Sachin Sant's message of June 29, 2021 12:37 am:
> 
>> On 28-Jun-2021, at 1:19 PM, Nicholas Piggin  wrote:
>> 
>> Similar to 2b48e96be2f9f ("powerpc/64: fix irq replay pt_regs->softe
>> value"), enable MSR_EE in pt_regs->msr, which makes the regs look a
>> bit more normal and allows the extra debug checks to be added to
>> interrupt handler entry.
>> 
>> Signed-off-by: Nicholas Piggin 
>> ---
>> arch/powerpc/include/asm/interrupt.h | 4 
>> arch/powerpc/kernel/irq.c| 1 +
>> 2 files changed, 5 insertions(+)
>> 
>> diff --git a/arch/powerpc/include/asm/interrupt.h 
>> b/arch/powerpc/include/asm/interrupt.h
>> index 789311d1e283..d4bdf7d274ac 100644
>> --- a/arch/powerpc/include/asm/interrupt.h
>> +++ b/arch/powerpc/include/asm/interrupt.h
>> @@ -173,6 +173,8 @@ static inline void interrupt_enter_prepare(struct 
>> pt_regs *regs, struct interrup
>>  BUG_ON(search_kernel_restart_table(regs->nip));
>> #endif
>>  }
>> +if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
>> +BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));
>> #endif
> 
> I think this BUG_ON was triggered while running selftests 
> (powerpc/mm/pkey_exec_prot)
> 
> [ 9741.254969] [ cut here ]
> [ 9741.254978] kernel BUG at arch/powerpc/include/asm/interrupt.h:177!
> [ 9741.254985] Oops: Exception in kernel mode, sig: 5 [#1]
> [ 9741.254990] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> [ 9741.254995] Modules linked in: rpadlpar_io rpaphp uinput sha512_generic 
> vmac n_gsm pps_ldisc pps_core ppp_synctty ppp_async ppp_generic slcan slip 
> slhc snd_hrtimer snd_seq snd_seq_device snd_timer snd soundcore authenc 
> pcrypt crypto_user n_hdlc dummy veth nfsv3 nfs_acl nfs lockd grace fscache 
> netfs tun brd overlay vfat fat btrfs blake2b_generic xor zstd_compress 
> raid6_pq xfs loop sctp ip6_udp_tunnel udp_tunnel dm_mod bonding nft_ct 
> nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set rfkill nf_tables libcrc32c 
> nfnetlink sunrpc pseries_rng xts vmx_crypto uio_pdrv_genirq uio sch_fq_codel 
> ip_tables ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ibmvscsi ibmveth 
> scsi_transport_srp fuse [last unloaded: test_cpuidle_latency]
> [ 9741.255097] CPU: 17 PID: 3278920 Comm: pkey_exec_prot Tainted: GW  
> OE 5.13.0-rc7-next-20210625-dirty #4
> [ 9741.255106] NIP:  c00300d8 LR: c0009604 CTR: 
> c0009330
> [ 9741.255111] REGS: c000347536f0 TRAP: 0700   Tainted: GW  OE
>   (5.13.0-rc7-next-20210625-dirty)
> [ 9741.255117] MSR:  80021033   CR: 22004282  XER: 
> 2004
> [ 9741.255130] CFAR: c003007c IRQMASK: 3 
> [ 9741.255130] GPR00: c0093cd0 c00034753990 c29bbe00 
> c00034753a30 
> [ 9741.255130] GPR04: 7fff9ebb 0020 000a 
> 002d 
> [ 9741.255130] GPR08:  0001  
> 7265677368657265 
> [ 9741.255130] GPR12: 80021033 c0001ec27280  
>  
> [ 9741.255130] GPR16:    
>  
> [ 9741.255130] GPR20:    
> 10003c40 
> [ 9741.255130] GPR24:   0020 
> c0005e89d200 
> [ 9741.255130] GPR28: 0300 7fff9ebb c00034753e80 
> c00034753a30 
> [ 9741.255191] NIP [c00300d8] program_check_exception+0xe8/0x1c0
> [ 9741.255202] LR [c0009604] program_check_common_virt+0x2d4/0x320
> [ 9741.255209] Call Trace:
> [ 9741.255212] [c00034753990] [0008] 0x8 (unreliable)
> [ 9741.255219] [c000347539c0] [c00034753a80] 0xc00034753a80
> [ 9741.255225] --- interrupt: 700 at arch_local_irq_restore+0x1d0/0x200
> [ 9741.255231] NIP:  c0016790 LR: c0093388 CTR: 
> c0008780
> [ 9741.255236] REGS: c00034753a30 TRAP: 0700   Tainted: GW  OE
>   (5.13.0-rc7-next-20210625-dirty)
> [ 9741.255242] MSR:  80021033   CR: 24004288  XER: 
> 2004
> [ 9741.255253] CFAR: c00165ec IRQMASK: 0 
> [ 9741.255253] GPR00: c0093cd0 c00034753cd0 c29bbe00 
>  
> [ 9741.255253] GPR04: 7fff9ebb 0020 000a 
> 002d 
> [ 9741.255253] GPR08:   c000bd77d400 
> 7265677368657265 
> [ 9741.255253] GPR12: 44000282 c0001ec27280  
>  
> [ 9741.255253] GPR16:    
>  
> [ 9741.255253] GPR20:    
> 10003c40 
> [ 9741.255253] GPR24:   0020 
> c0005e89d200 
> [ 9741.255253] GPR28: 0300 7fff9ebb c00034753e80 
> 0001 
> [ 9741.255313] NIP [c0016790] 

Re: [RFC] fpga: dfl: fme: Fix cpu hotplug code

2021-06-28 Thread Moritz Fischer
On Mon, Jun 28, 2021 at 12:45:46PM +0530, Kajol Jain wrote:
> Commit 724142f8c42a ("fpga: dfl: fme: add performance
> reporting support") added performance reporting support
> for FPGA management engine via perf.
> 
> It also added cpu hotplug feature but it didn't add
> pmu migration call in cpu offline function.
> This can create an issue in case the current designated
> cpu being used to collect fme pmu data goes offline,
> as based on the current code we are not migrating the fme pmu
> to a new target cpu. Because of that, perf will still try to
> fetch data from that offline cpu and hence we will not
> get counter data.
> 
> Patch fixes this issue by adding a perf_pmu_migrate_context call
> in the fme_perf_offline_cpu function.
> 
> Fixes: 724142f8c42a ("fpga: dfl: fme: add performance reporting support")
> Signed-off-by: Kajol Jain 

You might want to Cc: sta...@vger.kernel.org if it fixes an actual bug.
> ---
>  drivers/fpga/dfl-fme-perf.c | 4 
>  1 file changed, 4 insertions(+)
> 
> ---
> - This fix patch is not tested (as I don't have the required environment).
>   But the issue mentioned in the commit msg can be re-created by starting any
>   fme_perf event and, while it is still running, offlining the current
>   designated cpu pointed to by the cpumask file. Since the current code does
>   not migrate the pmu, perf will try getting counts from that offlined cpu
>   and hence we will not get event data.
> ---
> diff --git a/drivers/fpga/dfl-fme-perf.c b/drivers/fpga/dfl-fme-perf.c
> index 4299145ef347..b9a54583e505 100644
> --- a/drivers/fpga/dfl-fme-perf.c
> +++ b/drivers/fpga/dfl-fme-perf.c
> @@ -953,6 +953,10 @@ static int fme_perf_offline_cpu(unsigned int cpu, struct 
> hlist_node *node)
>   return 0;
>  
>   priv->cpu = target;
> +
> + /* Migrate fme_perf pmu events to the new target cpu */
> + perf_pmu_migrate_context(&priv->pmu, cpu, target);
> +
>   return 0;
>  }
>  
> -- 
> 2.31.1
> 
- Moritz


Re: [PATCH v4 7/7] powerpc/pseries: Add support for FORM2 associativity

2021-06-28 Thread Aneesh Kumar K.V
David Gibson  writes:

> On Thu, Jun 24, 2021 at 01:50:34PM +0530, Aneesh Kumar K.V wrote:
>> David Gibson  writes:
>> 
>> > On Thu, Jun 17, 2021 at 10:21:05PM +0530, Aneesh Kumar K.V wrote:
>> >> PAPR interface currently supports two different ways of communicating 
>> >> resource
>> >> grouping details to the OS. These are referred to as Form 0 and Form 1
>> >> associativity grouping. Form 0 is the older format and is now considered
>> >> deprecated. This patch adds another resource grouping named FORM2.
>> >> 
>> >> Signed-off-by: Daniel Henrique Barboza 
>> >> Signed-off-by: Aneesh Kumar K.V 
>> >> ---
>> >>  Documentation/powerpc/associativity.rst   | 135 
>> >>  arch/powerpc/include/asm/firmware.h   |   3 +-
>> >>  arch/powerpc/include/asm/prom.h   |   1 +
>> >>  arch/powerpc/kernel/prom_init.c   |   3 +-
>> >>  arch/powerpc/mm/numa.c| 149 +-
>> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
>> >>  6 files changed, 286 insertions(+), 6 deletions(-)
>> >>  create mode 100644 Documentation/powerpc/associativity.rst
>> >> 
>> >> diff --git a/Documentation/powerpc/associativity.rst 
>> >> b/Documentation/powerpc/associativity.rst
>> >> new file mode 100644
>> >> index ..93be604ac54d
>> >> --- /dev/null
>> >> +++ b/Documentation/powerpc/associativity.rst
>> >> @@ -0,0 +1,135 @@
>> >> +
>> >> +NUMA resource associativity
>> >> +=
>> >> +
>> >> +Associativity represents the groupings of the various platform resources 
>> >> into
>> >> +domains of substantially similar mean performance relative to resources 
>> >> outside
>> >> +of that domain. Resources subsets of a given domain that exhibit better
>> >> +performance relative to each other than relative to other resources 
>> >> subsets
>> >> +are represented as being members of a sub-grouping domain. This 
>> >> performance
>> >> +characteristic is presented in terms of NUMA node distance within the 
>> >> Linux kernel.
>> >> +From the platform view, these groups are also referred to as domains.
>> >> +
>> >> +PAPR interface currently supports different ways of communicating these 
>> >> resource
>> >> +grouping details to the OS. These are referred to as Form 0, Form 1 and 
>> >> Form2
>> >> +associativity grouping. Form 0 is the older format and is now considered 
>> >> deprecated.
>> >> +
>> >> +Hypervisor indicates the type/form of associativity used via 
>> >> "ibm,arcitecture-vec-5 property".
>> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage 
>> >> of Form 0 or Form 1.
>> >> +A value of 1 indicates the usage of Form 1 associativity. For Form 2 
>> >> associativity
>> >> +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
>> >> +
>> >> +Form 0
>> >> +-
>> >> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
>> >> +
>> >> +Form 1
>> >> +-
>> >> +With Form 1 a combination of ibm,associativity-reference-points and 
>> >> ibm,associativity
>> >> +device tree properties are used to determine the NUMA distance between 
>> >> resource groups/domains.
>> >> +
>> >> +The “ibm,associativity” property contains one or more lists of numbers 
>> >> (domainID)
>> >> +representing the resource’s platform grouping domains.
>> >> +
>> >> +The “ibm,associativity-reference-points” property contains one or more 
>> >> list of numbers
>> >> +(domainID index) that represents the 1 based ordinal in the 
>> >> associativity lists.
>> >> +The list of domainID index represnets increasing hierachy of
>> >> resource grouping.
>> >
>> > Typo "represnets".  Also s/hierachy/hierarchy/
>> >
>> >> +
>> >> +ex:
>> >> +{ primary domainID index, secondary domainID index, tertiary domainID 
>> >> index.. }
>> >
>> >> +Linux kernel uses the domainID at the primary domainID index as the NUMA 
>> >> node id.
>> >> +Linux kernel computes NUMA distance between two domains by recursively 
>> >> comparing
>> >> +if they belong to the same higher-level domains. For mismatch at every 
>> >> higher
>> >> +level of the resource group, the kernel doubles the NUMA distance 
>> >> between the
>> >> +comparing domains.
>> >
>> > The Form1 description is still kinda confusing, but I don't really
>> > care.  Form1 *is* confusing, it's Form2 that I hope will be clearer.
>> >
>> >> +
>> >> +Form 2
>> >> +---
>> >> +Form 2 associativity format adds separate device tree properties 
>> >> representing NUMA node distance
>> >> +thereby making the node distance computation flexible. Form 2 also 
>> >> allows flexible primary
>> >> +domain numbering. With numa distance computation now detached from the 
>> >> index value of
>> >> +"ibm,associativity" property, Form 2 allows a large number of primary 
>> >> domain ids at the
>> >> +same domainID index representing resource groups of different 
>> >> performance/latency characteristics.
>> >
>> > So, see you've removed the special 

[PATCH v5 6/6] powerpc/pseries: Add support for FORM2 associativity

2021-06-28 Thread Aneesh Kumar K.V
PAPR interface currently supports two different ways of communicating resource
grouping details to the OS. These are referred to as Form 0 and Form 1
associativity grouping. Form 0 is the older format and is now considered
deprecated. This patch adds another resource grouping named FORM2.

Signed-off-by: Daniel Henrique Barboza 
Signed-off-by: Aneesh Kumar K.V 
---
 Documentation/powerpc/associativity.rst   | 103 ++
 arch/powerpc/include/asm/firmware.h   |   3 +-
 arch/powerpc/include/asm/prom.h   |   1 +
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 157 ++
 arch/powerpc/platforms/pseries/firmware.c |   1 +
 6 files changed, 242 insertions(+), 26 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

diff --git a/Documentation/powerpc/associativity.rst 
b/Documentation/powerpc/associativity.rst
new file mode 100644
index ..31cc7da2c7a6
--- /dev/null
+++ b/Documentation/powerpc/associativity.rst
@@ -0,0 +1,103 @@
+
+NUMA resource associativity
+=
+
+Associativity represents the groupings of the various platform resources into
+domains of substantially similar mean performance relative to resources outside
+of that domain. Resource subsets of a given domain that exhibit better
+performance relative to each other than relative to other resource subsets
+are represented as being members of a sub-grouping domain. This performance
+characteristic is presented in terms of NUMA node distance within the Linux 
kernel.
+From the platform view, these groups are also referred to as domains.
+
+PAPR interface currently supports different ways of communicating these 
resource
+grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
+associativity grouping. Form 0 is the older format and is now considered 
deprecated.
+
+Hypervisor indicates the type/form of associativity used via 
"ibm,architecture-vec-5 property".
+Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of 
Form 0 or Form 1.
+A value of 1 indicates the usage of Form 1 associativity. For Form 2 
associativity
+bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
+
+Form 0
+-
+Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
+
+Form 1
+-
+With Form 1 a combination of ibm,associativity-reference-points, and 
ibm,associativity
+device tree properties are used to determine the NUMA distance between 
resource groups/domains.
+
+The “ibm,associativity” property contains a list of one or more numbers 
(domainID)
+representing the resource’s platform grouping domains.
+
+The “ibm,associativity-reference-points” property contains a list of one or 
more numbers
+(domainID index) that represents the 1 based ordinal in the associativity 
lists.
+The list of domainID indexes represents an increasing hierarchy of resource 
grouping.
+
+ex:
+{ primary domainID index, secondary domainID index, tertiary domainID index.. }
+
+Linux kernel uses the domainID at the primary domainID index as the NUMA node 
id.
+Linux kernel computes NUMA distance between two domains by recursively 
comparing
+if they belong to the same higher-level domains. For mismatch at every higher
+level of the resource group, the kernel doubles the NUMA distance between the
+comparing domains.
+
+Form 2
+---
+Form 2 associativity format adds separate device tree properties representing 
NUMA node distance
+thereby making the node distance computation flexible. Form 2 also allows 
flexible primary
+domain numbering. With numa distance computation now detached from the index 
value in
+"ibm,associativity-reference-points" property, Form 2 allows a large number of 
primary domain
+ids at the same domainID index representing resource groups of different 
performance/latency
+characteristics.
+
+Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in 
the
+"ibm,architecture-vec-5" property.
+
+"ibm,numa-lookup-index-table" property contains a list of one or more numbers 
representing
+the domainIDs present in the system. The offset of the domainID in this 
property is
+used as an index while computing numa distance information via 
"ibm,numa-distance-table".
+
+prop-encoded-array: The number N of the domainIDs encoded as with encode-int, 
followed by
+N domainID encoded as with encode-int
+
+For ex:
+"ibm,numa-lookup-index-table" =  {4, 0, 8, 250, 252}. The offset of domainID 8 
(2) is used when
+computing the distance of domain 8 from other domains present in the system. 
For the rest of
+this document, this offset will be referred to as domain distance offset.
+
+"ibm,numa-distance-table" property contains a list of one or more numbers 
representing the NUMA
+distance between resource groups/domains present in the system.
+
+prop-encoded-array: The number N of the distance values encoded as with 
encode-int, 
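
As a rough, self-contained sketch of how the two properties above combine
(this is not the kernel implementation; the helper name and the flat N x N,
row-major layout of the distance table are assumptions made for illustration):

/*
 * index_table holds the domainIDs from "ibm,numa-lookup-index-table"
 * (without the leading count N), and distance_table holds N * N distance
 * values in row-major order.
 */
static int form2_distance(const int *index_table, int n,
			  const int *distance_table, int id_a, int id_b)
{
	int i, off_a = -1, off_b = -1;

	for (i = 0; i < n; i++) {
		if (index_table[i] == id_a)
			off_a = i;
		if (index_table[i] == id_b)
			off_b = i;
	}
	if (off_a < 0 || off_b < 0)
		return -1;	/* domainID not present in the lookup table */

	return distance_table[off_a * n + off_b];
}

With the example above, the domainIDs are 0, 8, 250 and 252 (N = 4), and the
offset at which domainID 8 is found selects the row/column of the distance
table that describes domain 8's distance to every other domain.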

[PATCH v5 5/6] powerpc/pseries: Add a helper for form1 cpu distance

2021-06-28 Thread Aneesh Kumar K.V
This helper is only used with the dispatch trace log collection.
A later patch will add Form2 affinity support and this change helps
in keeping that simpler. Also add a comment explaining we don't expect
the code to be called with FORM0.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/topology.h   |  4 ++--
 arch/powerpc/mm/numa.c| 10 +-
 arch/powerpc/platforms/pseries/lpar.c |  4 ++--
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index e4db64c0e184..ac8b5ed79832 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 cpu_all_mask : \
 cpumask_of_node(pcibus_to_node(bus)))
 
-extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
@@ -83,7 +83,7 @@ static inline void sysfs_remove_device_from_node(struct 
device *dev,
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
 
-static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 {
return 0;
 }
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 7b142f79d600..c6293037a103 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 
*cpu2_assoc)
 {
int dist = 0;
 
@@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
return dist;
 }
 
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+   /* We should not get called with FORM0 */
+   VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+
+   return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
+}
+
 /* must hold reference to node during call */
 static const __be32 *of_get_associativity(struct device_node *dev)
 {
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index dab356e3ff87..afefbdfe768d 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -261,7 +261,7 @@ static int cpu_relative_dispatch_distance(int 
last_disp_cpu, int cur_disp_cpu)
if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
return -EIO;
 
-   return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
+   return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
 }
 
 static int cpu_home_node_dispatch_distance(int disp_cpu)
@@ -281,7 +281,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu)
if (!disp_cpu_assoc || !vcpu_assoc)
return -EIO;
 
-   return cpu_distance(disp_cpu_assoc, vcpu_assoc);
+   return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
 }
 
 static void update_vcpu_disp_stat(int disp_cpu)
-- 
2.31.1



[PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-06-28 Thread Aneesh Kumar K.V
The associativity details of the newly added resources are collected from
the hypervisor via the "ibm,configure-connector" rtas call. Update the numa
distance details of the newly added numa node after the above call.

Instead of updating the NUMA distance every time we look up a node id
from the associativity property, add helpers that can be used
during boot and that do this only once. Also remove the distance
update from the node id lookup helpers.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c| 173 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c|   2 +
 arch/powerpc/platforms/pseries/pseries.h  |   1 +
 4 files changed, 132 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0ec16999beef..7b142f79d600 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -208,22 +208,6 @@ int __node_distance(int a, int b)
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
-   const __be32 *associativity)
-{
-   int i;
-
-   if (affinity_form != FORM1_AFFINITY)
-   return;
-
-   for (i = 0; i < max_associativity_domain_index; i++) {
-   const __be32 *entry;
-
-   entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
-   distance_lookup_table[nid][i] = of_read_number(entry, 1);
-   }
-}
-
 /*
  * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
  * info is found.
@@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 
*associativity)
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= nr_node_ids)
nid = NUMA_NO_NODE;
-
-   if (nid > 0 &&
-   of_read_number(associativity, 1) >= 
max_associativity_domain_index) {
-   /*
-* Skip the length field and send start of associativity array
-*/
-   initialize_distance_lookup_table(nid, associativity + 1);
-   }
-
 out:
return nid;
 }
@@ -287,6 +262,49 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
+static void __initialize_form1_numa_distance(const __be32 *associativity)
+{
+   int i, nid;
+
+   if (affinity_form != FORM1_AFFINITY)
+   return;
+
+   if (of_read_number(associativity, 1) >= primary_domain_index) {
+   nid = of_read_number(&associativity[primary_domain_index], 1);
+
+   for (i = 0; i < max_associativity_domain_index; i++) {
+   const __be32 *entry;
+
+   entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+   distance_lookup_table[nid][i] = of_read_number(entry, 
1);
+   }
+   }
+}
+
+static void initialize_form1_numa_distance(struct device_node *node)
+{
+   const __be32 *associativity;
+
+   associativity = of_get_associativity(node);
+   if (!associativity)
+   return;
+
+   __initialize_form1_numa_distance(associativity);
+}
+
+/*
+ * Used to update distance information w.r.t newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+   if (affinity_form == FORM0_AFFINITY)
+   return;
+   else if (affinity_form == FORM1_AFFINITY) {
+   initialize_form1_numa_distance(node);
+   return;
+   }
+}
+
 static int __init find_primary_domain_index(void)
 {
int index;
@@ -433,6 +451,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
return 0;
 }
 
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+   struct assoc_arrays aa = { .arrays = NULL };
+   int default_nid = NUMA_NO_NODE;
+   int nid = default_nid;
+   int rc, index;
+
+   if ((primary_domain_index < 0) || !numa_enabled)
+   return default_nid;
+
+   rc = of_get_assoc_arrays(&aa);
+   if (rc)
+   return default_nid;
+
+   if (primary_domain_index <= aa.array_sz &&
+   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
+   nid = of_read_number(&aa.arrays[index], 1);
+
+   if (nid == 0xffff || nid >= nr_node_ids)
+   nid = default_nid;
+   if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+   int i;
+   const __be32 *associativity;
+
+   index = lmb->aa_index * aa.array_sz;
+   associativity = &aa.arrays[index];
+   /*
+* lookup array associativity entries have different 
format
+* There is no length of the array as the first element.
+*/
+   for (i = 0; i < max_associativity_domain_index; i++) {
+

[PATCH v5 3/6] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY

2021-06-28 Thread Aneesh Kumar K.V
Also make related code cleanup that will allow adding FORM2_AFFINITY in
later patches. No functional change in this patch.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/firmware.h   |  4 +--
 arch/powerpc/include/asm/prom.h   |  2 +-
 arch/powerpc/kernel/prom_init.c   |  2 +-
 arch/powerpc/mm/numa.c| 35 ++-
 arch/powerpc/platforms/pseries/firmware.c |  2 +-
 5 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index 7604673787d6..60b631161360 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -44,7 +44,7 @@
 #define FW_FEATURE_OPALASM_CONST(0x1000)
 #define FW_FEATURE_SET_MODEASM_CONST(0x4000)
 #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x8000)
-#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0001)
+#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0001)
 #define FW_FEATURE_PRRNASM_CONST(0x0002)
 #define FW_FEATURE_DRMEM_V2ASM_CONST(0x0004)
 #define FW_FEATURE_DRC_INFOASM_CONST(0x0008)
@@ -69,7 +69,7 @@ enum {
FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
-   FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
+   FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 324a13351749..df9fec9d232c 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
 #define OV5_MSI0x0201  /* PCIe/MSI support */
 #define OV5_CMO0x0480  /* Cooperative Memory 
Overcommitment */
 #define OV5_XCMO   0x0440  /* Page Coalescing */
-#define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_FORM1_AFFINITY 0x0580  /* FORM1 NUMA affinity */
 #define OV5_PRRN   0x0540  /* Platform Resource Reassignment */
 #define OV5_HP_EVT 0x0604  /* Hot Plug Event support */
 #define OV5_RESIZE_HPT 0x0601  /* Hash Page Table resizing */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 523b31685c4c..5d9ea059594f 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1069,7 +1069,7 @@ static const struct ibm_arch_vec 
ibm_architecture_vec_template __initconst = {
 #else
0,
 #endif
-   .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
+   .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
.micro_checkpoint = 0,
.reserved0 = 0,
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 132813dd1a6c..0ec16999beef 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
 
 static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+static int affinity_form;
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int max_associativity_domain_index;
@@ -190,7 +193,7 @@ int __node_distance(int a, int b)
int i;
int distance = LOCAL_DISTANCE;
 
-   if (!form1_affinity)
+   if (affinity_form == FORM0_AFFINITY)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
for (i = 0; i < max_associativity_domain_index; i++) {
@@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
 {
int i;
 
-   if (!form1_affinity)
+   if (affinity_form != FORM1_AFFINITY)
return;
 
for (i = 0; i < max_associativity_domain_index; i++) {
@@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
int index;
struct device_node *root;
 
+   /*
+* Check for which form of affinity.
+*/
+   if (firmware_has_feature(FW_FEATURE_OPAL)) {
+   affinity_form = FORM1_AFFINITY;
+   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+   dbg("Using form 1 affinity\n");
+   affinity_form = FORM1_AFFINITY;
+   } else
+   affinity_form = FORM0_AFFINITY;
+
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
else
@@ -318,23 

[PATCH v5 2/6] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-06-28 Thread Aneesh Kumar K.V
No functional change in this patch

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8365b298ec48..132813dd1a6c 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
 #define MAX_DISTANCE_REF_POINTS 4
-static int distance_ref_points_depth;
+static int max_associativity_domain_index;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
@@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 
int i, index;
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
index = be32_to_cpu(distance_ref_points[i]);
if (cpu1_assoc[index] == cpu2_assoc[index])
break;
@@ -193,7 +193,7 @@ int __node_distance(int a, int b)
if (!form1_affinity)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
break;
 
@@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
if (!form1_affinity)
return;
 
-   for (i = 0; i < distance_ref_points_depth; i++) {
+   for (i = 0; i < max_associativity_domain_index; i++) {
const __be32 *entry;
 
entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
@@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 *associativity)
nid = NUMA_NO_NODE;
 
if (nid > 0 &&
-   of_read_number(associativity, 1) >= distance_ref_points_depth) {
+   of_read_number(associativity, 1) >= 
max_associativity_domain_index) {
/*
 * Skip the length field and send start of associativity array
 */
@@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
 */
distance_ref_points = of_get_property(root,
"ibm,associativity-reference-points",
-   &distance_ref_points_depth);
+   &max_associativity_domain_index);
 
if (!distance_ref_points) {
dbg("NUMA: ibm,associativity-reference-points not found.\n");
goto err;
}
 
-   distance_ref_points_depth /= sizeof(int);
+   max_associativity_domain_index /= sizeof(int);
 
if (firmware_has_feature(FW_FEATURE_OPAL) ||
firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
@@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
if (form1_affinity) {
index = of_read_number(distance_ref_points, 1);
} else {
-   if (distance_ref_points_depth < 2) {
+   if (max_associativity_domain_index < 2) {
printk(KERN_WARNING "NUMA: "
"short ibm,associativity-reference-points\n");
goto err;
@@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
 * Warn and cap if the hardware supports more than
 * MAX_DISTANCE_REF_POINTS domains.
 */
-   if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+   if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS) {
printk(KERN_WARNING "NUMA: distance array capped at "
"%d entries\n", MAX_DISTANCE_REF_POINTS);
-   distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+   max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;
}
 
of_node_put(root);
-- 
2.31.1



[PATCH v5 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-06-28 Thread Aneesh Kumar K.V
No functional change in this patch.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..8365b298ec48 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(node_to_cpumask_map);
 EXPORT_SYMBOL(node_data);
 
-static int min_common_depth;
+static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
@@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity)
if (!numa_enabled)
goto out;
 
-   if (of_read_number(associativity, 1) >= min_common_depth)
-   nid = of_read_number(&associativity[min_common_depth], 1);
+   if (of_read_number(associativity, 1) >= primary_domain_index)
+   nid = of_read_number(&associativity[primary_domain_index], 1);
 
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= nr_node_ids)
@@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static int __init find_min_common_depth(void)
+static int __init find_primary_domain_index(void)
 {
-   int depth;
+   int index;
struct device_node *root;
 
if (firmware_has_feature(FW_FEATURE_OPAL))
@@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
}
 
if (form1_affinity) {
-   depth = of_read_number(distance_ref_points, 1);
+   index = of_read_number(distance_ref_points, 1);
} else {
if (distance_ref_points_depth < 2) {
printk(KERN_WARNING "NUMA: "
@@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
goto err;
}
 
-   depth = of_read_number(&distance_ref_points[1], 1);
+   index = of_read_number(&distance_ref_points[1], 1);
}
 
/*
@@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
}
 
of_node_put(root);
-   return depth;
+   return index;
 
 err:
of_node_put(root);
@@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
 
-   if ((min_common_depth < 0) || !numa_enabled)
+   if ((primary_domain_index < 0) || !numa_enabled)
return default_nid;
 
rc = of_get_assoc_arrays(&aa);
if (rc)
return default_nid;
 
-   if (min_common_depth <= aa.array_sz &&
+   if (primary_domain_index <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
-   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
nid = of_read_number(&aa.arrays[index], 1);
 
if (nid == 0xffff || nid >= nr_node_ids)
@@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
return -1;
}
 
-   min_common_depth = find_min_common_depth();
+   primary_domain_index = find_primary_domain_index();
 
-   if (min_common_depth < 0) {
+   if (primary_domain_index < 0) {
/*
-* if we fail to parse min_common_depth from device tree
+* if we fail to parse primary_domain_index from device tree
 * mark the numa disabled, boot with numa disabled.
 */
numa_enabled = false;
-   return min_common_depth;
+   return primary_domain_index;
}
 
-   dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
primary_domain_index);
 
/*
 * Even though we connect cpus to numa domains later in SMP
@@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
goto out;
}
 
-   max_nodes = of_read_number(&domains[min_common_depth], 1);
+   max_nodes = of_read_number(&domains[primary_domain_index], 1);
for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
prop_length /= sizeof(int);
-   if (prop_length > min_common_depth + 2)
+   if (prop_length > primary_domain_index + 2)
coregroup_enabled = 1;
 
 out:
@@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
goto out;
 
index = of_read_number(associativity, 1);
-   if (index > min_common_depth + 1)
+   if (index > primary_domain_index + 1)
return of_read_number(&associativity[index - 1], 1);
 
 out:
-- 
2.31.1



[PATCH v5 0/6] Add support for FORM2 associativity

2021-06-28 Thread Aneesh Kumar K.V
Form2 associativity adds a much more flexible NUMA topology layout
than what is provided by Form1. More details can be found in patch 6.

$ numactl -H
...
node distances:
node   0   1   2   3 
  0:  10  11  222  33 
  1:  44  10  55  66 
  2:  77  88  10  99 
  3:  101  121  132  10 
$

After DAX kmem memory add
# numactl -H
available: 5 nodes (0-4)
...
node distances:
node   0   1   2   3   4 
  0:  10  11  222  33  240 
  1:  44  10  55  66  255 
  2:  77  88  10  99  255 
  3:  101  121  132  10  230 
  4:  255  255  255  230  10 


PAPR SCM now uses the numa distance details to find the numa_node and target_node
for the device.

kvaneesh@ubuntu-guest:~$ ndctl  list -N -v 
[
  {
"dev":"namespace0.0",
"mode":"devdax",
"map":"dev",
"size":1071644672,
"uuid":"d333d867-3f57-44c8-b386-d4d3abdc2bf2",
"raw_uuid":"915361ad-fe6a-42dd-848f-d6dc9f5af362",
"daxregion":{
  "id":0,
  "size":1071644672,
  "devices":[
{
  "chardev":"dax0.0",
  "size":1071644672,
  "target_node":4,
  "mode":"devdax"
}
  ]
},
"align":2097152,
"numa_node":3
  }
]
kvaneesh@ubuntu-guest:~$ 


The above output is with a Qemu command line

-numa node,nodeid=4 \
-numa dist,src=0,dst=1,val=11 -numa dist,src=0,dst=2,val=222 -numa 
dist,src=0,dst=3,val=33 -numa dist,src=0,dst=4,val=240 \
-numa dist,src=1,dst=0,val=44 -numa dist,src=1,dst=2,val=55 -numa 
dist,src=1,dst=3,val=66 -numa dist,src=1,dst=4,val=255 \
-numa dist,src=2,dst=0,val=77 -numa dist,src=2,dst=1,val=88 -numa 
dist,src=2,dst=3,val=99 -numa dist,src=2,dst=4,val=255 \
-numa dist,src=3,dst=0,val=101 -numa dist,src=3,dst=1,val=121 -numa 
dist,src=3,dst=2,val=132 -numa dist,src=3,dst=4,val=230 \
-numa dist,src=4,dst=0,val=255 -numa dist,src=4,dst=1,val=255 -numa 
dist,src=4,dst=2,val=255 -numa dist,src=4,dst=3,val=230 \
-object 
memory-backend-file,id=memnvdimm1,prealloc=yes,mem-path=$PMEM_DISK,share=yes,size=${PMEM_SIZE}
  \
-device 
nvdimm,label-size=128K,memdev=memnvdimm1,id=nvdimm1,slot=4,uuid=72511b67-0b3b-42fd-8d1d-5be3cae8bcaa,node=4

Qemu changes can be found at 
https://lore.kernel.org/qemu-devel/20210616011944.2996399-1-danielhb...@gmail.com/

Changes from v4:
* Drop DLPAR related device tree property for now because both Qemu and PowerVM
  will provide the distance details of all possible NUMA nodes during boot.
* Rework numa distance code based on review feedback.

Changes from v3:
* Drop PAPR SCM specific changes and depend completely on NUMA distance 
information.

Changes from v2:
* Add nvdimm list to Cc:
* update PATCH 8 commit message.

Changes from v1:
* Update FORM2 documentation.
* rename max_domain_index to max_associativity_domain_index


Aneesh Kumar K.V (6):
  powerpc/pseries: rename min_common_depth to primary_domain_index
  powerpc/pseries: rename distance_ref_points_depth to
max_associativity_domain_index
  powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
  powerpc/pseries: Consolidate different NUMA distance update code paths
  powerpc/pseries: Add a helper for form1 cpu distance
  powerpc/pseries: Add support for FORM2 associativity

 Documentation/powerpc/associativity.rst   | 103 +
 arch/powerpc/include/asm/firmware.h   |   7 +-
 arch/powerpc/include/asm/prom.h   |   3 +-
 arch/powerpc/include/asm/topology.h   |   4 +-
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 415 +-
 arch/powerpc/platforms/pseries/firmware.c |   3 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c|   2 +
 arch/powerpc/platforms/pseries/lpar.c |   4 +-
 arch/powerpc/platforms/pseries/pseries.h  |   1 +
 11 files changed, 432 insertions(+), 115 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

-- 
2.31.1



Re: [PATCH] perf script python: Fix buffer size to report iregs in perf script

2021-06-28 Thread Paul A. Clarke
On Mon, Jun 28, 2021 at 11:53:41AM +0530, Kajol Jain wrote:
> Commit 48a1f565261d ("perf script python: Add more PMU fields
> to event handler dict") added functionality to report fields like
> weight, iregs, uregs etc via perf report.
> That commit predefined buffer size to 512 bytes to print those fields.
> 
> But in case of powerpc, since we added extended regs support
> in commits:
> 
> Commit 068aeea3773a ("perf powerpc: Support exposing Performance Monitor
> Counter SPRs as part of extended regs")
> Commit d735599a069f ("powerpc/perf: Add extended regs support for
> power10 platform")
> 
> Now iregs can carry more bytes of data and this predefined buffer size
> can result in data loss in perf script output.
> 
> Patch resolves this issue by making the buffer size dynamic, based on the
> number of registers needed to print. It also changes the return type of function
> "regs_map" from int to void, as the return value is not being used by
> the caller function "set_regs_in_dict".
> 
> Fixes: 068aeea3773a ("perf powerpc: Support exposing Performance Monitor
> Counter SPRs as part of extended regs")
> Signed-off-by: Kajol Jain 
> ---
>  .../util/scripting-engines/trace-event-python.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/tools/perf/util/scripting-engines/trace-event-python.c 
> b/tools/perf/util/scripting-engines/trace-event-python.c
> index 4e4aa4c97ac5..c8c9706b4643 100644
> --- a/tools/perf/util/scripting-engines/trace-event-python.c
> +++ b/tools/perf/util/scripting-engines/trace-event-python.c
[...]
> @@ -713,7 +711,16 @@ static void set_regs_in_dict(PyObject *dict,
>struct evsel *evsel)
>  {
>   struct perf_event_attr *attr = >core.attr;
> - char bf[512];
> +
> + /*
> +  * Here value 28 is a constant size which can be used to print
> +  * one register value and it corresponds to:
> +  * 16 chars is to specify 64 bit register in hexadecimal.
> +  * 2 chars is for appending "0x" to the hexadecimal value and
> +  * 10 chars is for register name.
> +  */
> + int size = __sw_hweight64(attr->sample_regs_intr) * 28;
> + char bf[size];

I propose using a template rather than a magic number here. Something like:
const char reg_name_tmpl[] = "10 chars  ";
const char reg_value_tmpl[] = "0x0123456789abcdef";
const int size = __sw_hweight64(attr->sample_regs_intr) +
 sizeof reg_name_tmpl + sizeof reg_value_tmpl;

Pardon my ignorance, but is there no separation/whitespace between the name
and the value? And is there some significance to 10 characters for the
register name, or is that a magic number?

PC


Re: [PATCH 5/8] powerpc/64: enable MSR[EE] in irq replay pt_regs

2021-06-28 Thread Sachin Sant


> On 28-Jun-2021, at 1:19 PM, Nicholas Piggin  wrote:
> 
> Similar to 2b48e96be2f9f ("powerpc/64: fix irq replay pt_regs->softe
> value"), enable MSR_EE in pt_regs->msr, which makes the regs look a
> bit more normal and allows the extra debug checks to be added to
> interrupt handler entry.
> 
> Signed-off-by: Nicholas Piggin 
> ---
> arch/powerpc/include/asm/interrupt.h | 4 
> arch/powerpc/kernel/irq.c| 1 +
> 2 files changed, 5 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/interrupt.h 
> b/arch/powerpc/include/asm/interrupt.h
> index 789311d1e283..d4bdf7d274ac 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -173,6 +173,8 @@ static inline void interrupt_enter_prepare(struct pt_regs 
> *regs, struct interrup
>   BUG_ON(search_kernel_restart_table(regs->nip));
> #endif
>   }
> + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
> + BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));
> #endif

I think this BUG_ON was triggered while running selftests 
(powerpc/mm/pkey_exec_prot)

[ 9741.254969] [ cut here ]
[ 9741.254978] kernel BUG at arch/powerpc/include/asm/interrupt.h:177!
[ 9741.254985] Oops: Exception in kernel mode, sig: 5 [#1]
[ 9741.254990] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
[ 9741.254995] Modules linked in: rpadlpar_io rpaphp uinput sha512_generic vmac 
n_gsm pps_ldisc pps_core ppp_synctty ppp_async ppp_generic slcan slip slhc 
snd_hrtimer snd_seq snd_seq_device snd_timer snd soundcore authenc pcrypt 
crypto_user n_hdlc dummy veth nfsv3 nfs_acl nfs lockd grace fscache netfs tun 
brd overlay vfat fat btrfs blake2b_generic xor zstd_compress raid6_pq xfs loop 
sctp ip6_udp_tunnel udp_tunnel dm_mod bonding nft_ct nf_conntrack 
nf_defrag_ipv6 nf_defrag_ipv4 ip_set rfkill nf_tables libcrc32c nfnetlink 
sunrpc pseries_rng xts vmx_crypto uio_pdrv_genirq uio sch_fq_codel ip_tables 
ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ibmvscsi ibmveth 
scsi_transport_srp fuse [last unloaded: test_cpuidle_latency]
[ 9741.255097] CPU: 17 PID: 3278920 Comm: pkey_exec_prot Tainted: GW  
OE 5.13.0-rc7-next-20210625-dirty #4
[ 9741.255106] NIP:  c00300d8 LR: c0009604 CTR: c0009330
[ 9741.255111] REGS: c000347536f0 TRAP: 0700   Tainted: GW  OE  
(5.13.0-rc7-next-20210625-dirty)
[ 9741.255117] MSR:  80021033   CR: 22004282  XER: 
2004
[ 9741.255130] CFAR: c003007c IRQMASK: 3 
[ 9741.255130] GPR00: c0093cd0 c00034753990 c29bbe00 
c00034753a30 
[ 9741.255130] GPR04: 7fff9ebb 0020 000a 
002d 
[ 9741.255130] GPR08:  0001  
7265677368657265 
[ 9741.255130] GPR12: 80021033 c0001ec27280  
 
[ 9741.255130] GPR16:    
 
[ 9741.255130] GPR20:    
10003c40 
[ 9741.255130] GPR24:   0020 
c0005e89d200 
[ 9741.255130] GPR28: 0300 7fff9ebb c00034753e80 
c00034753a30 
[ 9741.255191] NIP [c00300d8] program_check_exception+0xe8/0x1c0
[ 9741.255202] LR [c0009604] program_check_common_virt+0x2d4/0x320
[ 9741.255209] Call Trace:
[ 9741.255212] [c00034753990] [0008] 0x8 (unreliable)
[ 9741.255219] [c000347539c0] [c00034753a80] 0xc00034753a80
[ 9741.255225] --- interrupt: 700 at arch_local_irq_restore+0x1d0/0x200
[ 9741.255231] NIP:  c0016790 LR: c0093388 CTR: c0008780
[ 9741.255236] REGS: c00034753a30 TRAP: 0700   Tainted: GW  OE  
(5.13.0-rc7-next-20210625-dirty)
[ 9741.255242] MSR:  80021033   CR: 24004288  XER: 
2004
[ 9741.255253] CFAR: c00165ec IRQMASK: 0 
[ 9741.255253] GPR00: c0093cd0 c00034753cd0 c29bbe00 
 
[ 9741.255253] GPR04: 7fff9ebb 0020 000a 
002d 
[ 9741.255253] GPR08:   c000bd77d400 
7265677368657265 
[ 9741.255253] GPR12: 44000282 c0001ec27280  
 
[ 9741.255253] GPR16:    
 
[ 9741.255253] GPR20:    
10003c40 
[ 9741.255253] GPR24:   0020 
c0005e89d200 
[ 9741.255253] GPR28: 0300 7fff9ebb c00034753e80 
0001 
[ 9741.255313] NIP [c0016790] arch_local_irq_restore+0x1d0/0x200
[ 9741.255319] LR [c0093388] ___do_page_fault+0x438/0xb80
[ 9741.255325] --- interrupt: 700
[ 9741.255328] [c00034753cd0] [c009be74] hash_page_mm+0x5e4/0x800 
(unreliable)
[ 

Re: [PATCH] perf vendor events power10: Adds 24x7 nest metric events for power10 platform

2021-06-28 Thread Paul A. Clarke
On Mon, Jun 28, 2021 at 11:58:54AM +0530, kajoljain wrote:
> 
> 
> On 6/25/21 6:51 PM, Paul A. Clarke wrote:
> > On Fri, Jun 25, 2021 at 05:29:48PM +0530, Kajol Jain wrote:
> >> Patch adds 24x7 nest metric events for POWER10.
> >>
> >> Signed-off-by: Kajol Jain 
> >> ---
> >>  .../arch/powerpc/power10/nest_metrics.json| 491 ++
> >>  1 file changed, 491 insertions(+)
> >>  create mode 100644 
> >> tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
> >>
> >> diff --git a/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json 
> >> b/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
> >> new file mode 100644
> >> index ..b79046cd8b09
> >> --- /dev/null
> >> +++ b/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
> >> @@ -0,0 +1,491 @@
> >> +[
> >> +{
> >> +  "MetricName": "VEC_GROUP_PUMP_RETRY_RATIO_P01",
> >> +  "BriefDescription": "VEC_GROUP_PUMP_RETRY_RATIO_P01",
> > 
> > Is it possible to get better descriptions than just a restatement of the
> > name, or no description at all?
> > 
> > This comment obviously applies to almost all of the metrics herein.
> 
> Hi Paul,
>Thanks for reviewing the patch. Sure I will remove description part for 
> now.

My sentence didn't parse well, sorry...

What I really meant was more like "Is it possible to get better descriptions?
Having just a restatement of the name (or no description at all in some cases)
is not helpful."

So, can we provide better descriptions of the metrics?

PC


Re: [PATCH 3/8] powerpc/64s: add a table of implicit soft-masked addresses

2021-06-28 Thread Sachin Sant



> On 28-Jun-2021, at 1:19 PM, Nicholas Piggin  wrote:
> 
> Commit 9d1988ca87dd ("powerpc/64: treat low kernel text as irqs
> soft-masked") ends up catching too much code, including ret_from_fork,
> and parts of interrupt and syscall return that do not expect to be
> interrupts to be soft-masked. If an interrupt gets marked pending,
> and then the code proceeds out of the implicit soft-masked region it
> will fail to deal with the pending interrupt.
> 
> Fix this by adding a new table of addresses which explicitly marks
> the regions of code that are soft masked. This table is only checked
> for interrupts that below __end_soft_masked, so most kernel interrupts
> will not have the overhead of the table search.
> 
> Fixes: 9d1988ca87dd ("powerpc/64: treat low kernel text as irqs soft-masked")
> Reported-by: Sachin Sant 
> Signed-off-by: Nicholas Piggin 

Thanks Nick for the fix.

I was able to verify this patch. 
Both kernel boot and test ran to completion without the reported warning.

Tested-by: Sachin Sant 

-Sachin



Re: [RFC] fpga: dfl: fme: Fix cpu hotplug code

2021-06-28 Thread Xu Yilun
It's a good fix, you can drop the RFC in commit title. :)

The title could be more specific, like:

fpga: dfl: fme: Fix cpu hotplug issue in performance reporting

So we know it is for performance reporting feature at first glance.

On Mon, Jun 28, 2021 at 12:45:46PM +0530, Kajol Jain wrote:

> Commit 724142f8c42a ("fpga: dfl: fme: add performance
> reporting support") added performance reporting support
> for FPGA management engine via perf.

May drop this section, it is indicated in the Fixes tag.

> 
> It also added cpu hotplug feature but it didn't add

The performance reporting driver added cpu hotplug ...

> pmu migration call in cpu offline function.
> This can create an issue in case the current designated
> cpu being used to collect fme pmu data got offline,
> as based on current code we are not migrating fme pmu to
> new target cpu. Because of that perf will still try to
> fetch data from that offline cpu and hence we will not
> get counter data.
> 
> Patch fixed this issue by adding pmu_migrate_context call
> in fme_perf_offline_cpu function.
> 
> Fixes: 724142f8c42a ("fpga: dfl: fme: add performance reporting support")
> Signed-off-by: Kajol Jain 

Tested-by: Xu Yilun 

Thanks,
Yilun

> ---
>  drivers/fpga/dfl-fme-perf.c | 4 
>  1 file changed, 4 insertions(+)
> 
> ---
> - This fix patch is not tested (as I don't have required environment).
>   But issue mentioned in the commit msg can be re-created, by starting any
>   fme_perf event and while its still running, offline current designated
>   cpu pointed by cpumask file. Since current code didn't migrating pmu,
>   perf gonna try getting counts from that offlined cpu and hence we will
>   not get event data.
> ---
> diff --git a/drivers/fpga/dfl-fme-perf.c b/drivers/fpga/dfl-fme-perf.c
> index 4299145ef347..b9a54583e505 100644
> --- a/drivers/fpga/dfl-fme-perf.c
> +++ b/drivers/fpga/dfl-fme-perf.c
> @@ -953,6 +953,10 @@ static int fme_perf_offline_cpu(unsigned int cpu, struct 
> hlist_node *node)
>   return 0;
>  
>   priv->cpu = target;
> +
> + /* Migrate fme_perf pmu events to the new target cpu */
> + perf_pmu_migrate_context(&priv->pmu, cpu, target);
> +
>   return 0;
>  }
>  
> -- 
> 2.31.1


[PATCH] fpga: dfl: fme: Fix cpu hotplug issue in performance reporting

2021-06-28 Thread Kajol Jain
The performance reporting driver added a cpu hotplug
feature but didn't add a pmu migration call in the cpu
offline function.
This can create an issue in case the current designated
cpu being used to collect fme pmu data goes offline:
since the current code does not migrate the fme pmu to a
new target cpu, perf will still try to fetch data from
the offline cpu and hence we will not get counter data.

Patch fixes this issue by adding a perf_pmu_migrate_context()
call in fme_perf_offline_cpu().

Fixes: 724142f8c42a ("fpga: dfl: fme: add performance reporting support")
Tested-by: Xu Yilun 
Signed-off-by: Kajol Jain 
---
 drivers/fpga/dfl-fme-perf.c | 4 
 1 file changed, 4 insertions(+)

---
Changelog:
- Remove RFC tag
- Made minor (nit) changes to the subject and commit message as suggested by Xu Yilun
- Added Tested-by tag
- Link to rfc patch: https://lkml.org/lkml/2021/6/28/112
---
diff --git a/drivers/fpga/dfl-fme-perf.c b/drivers/fpga/dfl-fme-perf.c
index 4299145ef347..b9a54583e505 100644
--- a/drivers/fpga/dfl-fme-perf.c
+++ b/drivers/fpga/dfl-fme-perf.c
@@ -953,6 +953,10 @@ static int fme_perf_offline_cpu(unsigned int cpu, struct 
hlist_node *node)
return 0;
 
priv->cpu = target;
+
+   /* Migrate fme_perf pmu events to the new target cpu */
+   perf_pmu_migrate_context(&priv->pmu, cpu, target);
+
return 0;
 }
 
-- 
2.31.1



Re: [RFC] fpga: dfl: fme: Fix cpu hotplug code

2021-06-28 Thread kajoljain



On 6/28/21 2:31 PM, Xu Yilun wrote:
> It's a good fix, you can drop the RFC in commit title. :)
> 
> The title could be more specific, like:
> 
> fpga: dfl: fme: Fix cpu hotplug issue in performance reporting
> 
> So we know it is for performance reporting feature at first glance.
> 
> On Mon, Jun 28, 2021 at 12:45:46PM +0530, Kajol Jain wrote:
> 
>> Commit 724142f8c42a ("fpga: dfl: fme: add performance
>> reporting support") added performance reporting support
>> for FPGA management engine via perf.
> 
> May drop this section, it is indicated in the Fixes tag.
> 

Hi Yilun,
Thanks for testing the patch. I will make mentioned changes and send
new patch.

Thanks,
Kajol Jain
>>
>> It also added cpu hotplug feature but it didn't add
> 
> The performance reporting driver added cpu hotplug ...
> 
>> pmu migration call in cpu offline function.
>> This can create an issue in case the current designated
>> cpu being used to collect fme pmu data got offline,
>> as based on current code we are not migrating fme pmu to
>> new target cpu. Because of that perf will still try to
>> fetch data from that offline cpu and hence we will not
>> get counter data.
>>
>> Patch fixed this issue by adding pmu_migrate_context call
>> in fme_perf_offline_cpu function.
>>
>> Fixes: 724142f8c42a ("fpga: dfl: fme: add performance reporting support")
>> Signed-off-by: Kajol Jain 
> 
> Tested-by: Xu Yilun 
> 
> Thanks,
> Yilun
> 
>> ---
>>  drivers/fpga/dfl-fme-perf.c | 4 
>>  1 file changed, 4 insertions(+)
>>
>> ---
>> - This fix patch is not tested (as I don't have required environment).
>>   But issue mentioned in the commit msg can be re-created, by starting any
>>   fme_perf event and while its still running, offline current designated
>>   cpu pointed by cpumask file. Since current code didn't migrating pmu,
>>   perf gonna try getting counts from that offlined cpu and hence we will
>>   not get event data.
>> ---
>> diff --git a/drivers/fpga/dfl-fme-perf.c b/drivers/fpga/dfl-fme-perf.c
>> index 4299145ef347..b9a54583e505 100644
>> --- a/drivers/fpga/dfl-fme-perf.c
>> +++ b/drivers/fpga/dfl-fme-perf.c
>> @@ -953,6 +953,10 @@ static int fme_perf_offline_cpu(unsigned int cpu, 
>> struct hlist_node *node)
>>  return 0;
>>  
>>  priv->cpu = target;
>> +
>> +/* Migrate fme_perf pmu events to the new target cpu */
>> +perf_pmu_migrate_context(&priv->pmu, cpu, target);
>> +
>>  return 0;
>>  }
>>  
>> -- 
>> 2.31.1


linux-next: manual merge of the akpm tree with the powerpc tree

2021-06-28 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the akpm tree got a conflict in:

  arch/powerpc/kernel/setup-common.c

between commit:

  56afad885228 ("powerpc: Remove klimit")

from the powerpc tree and commit:

  6e6e0df2a484 ("powerpc: convert to setup_initial_init_mm()")

from the akpm tree.

I fixed it up (I just used the latter since it had also decided to use
_end directly) and can carry the fix as necessary. This is now fixed as
far as linux-next is concerned, but any non trivial conflicts should be
mentioned to your upstream maintainer when your tree is submitted for
merging.  You may also want to consider cooperating with the maintainer
of the conflicting tree to minimise any particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell




[PATCH 8/8] powerpc/64s: move ret_from_fork etc above __end_soft_masked

2021-06-28 Thread Nicholas Piggin
Code which runs with interrupts enabled should be moved above
__end_soft_masked where possible, because maskable interrupts that hit
below there need to consult the soft mask table.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt_64.S | 52 +++---
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt_64.S 
b/arch/powerpc/kernel/interrupt_64.S
index 795c105850e4..3ca3576690ce 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -451,32 +451,6 @@ _ASM_NOKPROBE_SYMBOL(tabort_syscall)
b   .   /* prevent speculative execution */
 #endif
 
-#ifdef CONFIG_PPC_BOOK3S
-_GLOBAL(ret_from_fork_scv)
-   bl  schedule_tail
-   REST_NVGPRS(r1)
-   li  r3,0/* fork() return value */
-   b   .Lsyscall_vectored_common_exit
-#endif
-
-_GLOBAL(ret_from_fork)
-   bl  schedule_tail
-   REST_NVGPRS(r1)
-   li  r3,0/* fork() return value */
-   b   .Lsyscall_exit
-
-_GLOBAL(ret_from_kernel_thread)
-   bl  schedule_tail
-   REST_NVGPRS(r1)
-   mtctr   r14
-   mr  r3,r15
-#ifdef PPC64_ELF_ABI_v2
-   mr  r12,r14
-#endif
-   bctrl
-   li  r3,0
-   b   .Lsyscall_exit
-
/*
 * If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not
 * touched, no exit work created, then this can be used.
@@ -770,3 +744,29 @@ interrupt_return_macro hsrr
 __end_soft_masked:
 DEFINE_FIXED_SYMBOL(__end_soft_masked)
 #endif /* CONFIG_PPC_BOOK3S */
+
+#ifdef CONFIG_PPC_BOOK3S
+_GLOBAL(ret_from_fork_scv)
+   bl  schedule_tail
+   REST_NVGPRS(r1)
+   li  r3,0/* fork() return value */
+   b   .Lsyscall_vectored_common_exit
+#endif
+
+_GLOBAL(ret_from_fork)
+   bl  schedule_tail
+   REST_NVGPRS(r1)
+   li  r3,0/* fork() return value */
+   b   .Lsyscall_exit
+
+_GLOBAL(ret_from_kernel_thread)
+   bl  schedule_tail
+   REST_NVGPRS(r1)
+   mtctr   r14
+   mr  r3,r15
+#ifdef PPC64_ELF_ABI_v2
+   mr  r12,r14
+#endif
+   bctrl
+   li  r3,0
+   b   .Lsyscall_exit
-- 
2.23.0



[PATCH 7/8] powerpc/64s/interrupt: clean up interrupt return labels

2021-06-28 Thread Nicholas Piggin
Normal kernel-interrupt exits can get interrupt_return_srr_user_restart
in their backtrace, which is an unusual and notable function, and it is
part of the user-interrupt exit path, which is doubly confusing.

Add symmetric non-local labels for user and kernel interrupt exit cases
to address this. Also get rid of an unused label.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt_64.S | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt_64.S 
b/arch/powerpc/kernel/interrupt_64.S
index 06244b4df719..795c105850e4 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -511,7 +511,9 @@ interrupt_return_\srr\():
 _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\())
ld  r4,_MSR(r1)
andi.   r0,r4,MSR_PR
-   beq .Lkernel_interrupt_return_\srr
+   beq interrupt_return_\srr\()_kernel
+interrupt_return_\srr\()_user: /* make backtraces match the _kernel variant */
+_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user)
addir3,r1,STACK_FRAME_OVERHEAD
bl  interrupt_exit_user_prepare
cmpdi   r3,0
@@ -625,8 +627,8 @@ RESTART_TABLE(.Linterrupt_return_\srr\()_user_rst_start, 
.Linterrupt_return_\srr
 #endif
 
.balign IFETCH_ALIGN_BYTES
-.Lkernel_interrupt_return_\srr\():
-.Linterrupt_return_\srr\()_kernel:
+interrupt_return_\srr\()_kernel:
+_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel)
addir3,r1,STACK_FRAME_OVERHEAD
bl  interrupt_exit_kernel_prepare
 
-- 
2.23.0



[PATCH 6/8] powerpc/64/interrupts: add missing kprobe annotations on interrupt exit symbols

2021-06-28 Thread Nicholas Piggin
If one interrupt exit symbol must not be kprobed, none of them can be,
really.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt_64.S | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kernel/interrupt_64.S 
b/arch/powerpc/kernel/interrupt_64.S
index c06ed64541e1..06244b4df719 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -198,6 +198,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 #ifdef CONFIG_PPC_BOOK3S
 syscall_vectored_\name\()_restart:
+_ASM_NOKPROBE_SYMBOL(syscall_vectored_\name\()_restart)
GET_PACA(r13)
ld  r1,PACA_EXIT_SAVE_R1(r13)
ld  r2,PACATOC(r13)
@@ -240,6 +241,7 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_emulate)
.balign IFETCH_ALIGN_BYTES
.globl system_call_common_real
 system_call_common_real:
+_ASM_NOKPROBE_SYMBOL(system_call_common_real)
ld  r10,PACAKMSR(r13)   /* get MSR value for kernel */
mtmsrd  r10
 
@@ -404,6 +406,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 #ifdef CONFIG_PPC_BOOK3S
 syscall_restart:
+_ASM_NOKPROBE_SYMBOL(syscall_restart)
GET_PACA(r13)
ld  r1,PACA_EXIT_SAVE_R1(r13)
ld  r2,PACATOC(r13)
@@ -422,6 +425,7 @@ RESTART_TABLE(.Lsyscall_rst_start, .Lsyscall_rst_end, 
syscall_restart)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 tabort_syscall:
+_ASM_NOKPROBE_SYMBOL(tabort_syscall)
/* Firstly we need to enable TM in the kernel */
mfmsr   r10
li  r9, 1
@@ -604,6 +608,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
 
 #ifdef CONFIG_PPC_BOOK3S
 interrupt_return_\srr\()_user_restart:
+_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user_restart)
GET_PACA(r13)
ld  r1,PACA_EXIT_SAVE_R1(r13)
ld  r2,PACATOC(r13)
@@ -737,6 +742,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
 
 #ifdef CONFIG_PPC_BOOK3S
 interrupt_return_\srr\()_kernel_restart:
+_ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel_restart)
GET_PACA(r13)
ld  r1,PACA_EXIT_SAVE_R1(r13)
ld  r2,PACATOC(r13)
-- 
2.23.0



[PATCH 5/8] powerpc/64: enable MSR[EE] in irq replay pt_regs

2021-06-28 Thread Nicholas Piggin
Similar to 2b48e96be2f9f ("powerpc/64: fix irq replay pt_regs->softe
value"), enable MSR_EE in pt_regs->msr, which makes the regs look a
bit more normal and allows the extra debug checks to be added to
interrupt handler entry.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h | 4 
 arch/powerpc/kernel/irq.c| 1 +
 2 files changed, 5 insertions(+)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 789311d1e283..d4bdf7d274ac 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -173,6 +173,8 @@ static inline void interrupt_enter_prepare(struct pt_regs 
*regs, struct interrup
BUG_ON(search_kernel_restart_table(regs->nip));
 #endif
}
+   if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+   BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));
 #endif
 
booke_restore_dbcr0();
@@ -268,6 +270,8 @@ static inline void interrupt_nmi_enter_prepare(struct 
pt_regs *regs, struct inte
// arch_irq_disabled_regs(regs) behaves as expected.
regs->softe = IRQS_ALL_DISABLED;
}
+   if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+   BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));
 
/* Don't do any per-CPU operations until interrupt state is fixed */
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 8428caf3194e..91e63eac4e8f 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -121,6 +121,7 @@ void replay_soft_interrupts(void)
 
ppc_save_regs();
regs.softe = IRQS_ENABLED;
+   regs.msr |= MSR_EE;
 
 again:
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
-- 
2.23.0



[PATCH 4/8] powerpc/64s/interrupt: preserve regs->softe for NMI interrupts

2021-06-28 Thread Nicholas Piggin
If an NMI interrupt hits in an implicit soft-masked region, regs->softe
is modified to reflect that. This may not be necessary for correctness
at the moment, but it is less surprising and it's unhelpful when
debugging or adding checks.

Make sure this is changed back to how it was found before returning.

Fixes: 4ec5feec1ad0 ("powerpc/64s: Make NMI record implicitly soft-masked code 
as irqs disabled")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index d7df247a149c..789311d1e283 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -227,6 +227,7 @@ struct interrupt_nmi_state {
u8 irq_soft_mask;
u8 irq_happened;
u8 ftrace_enabled;
+   u64 softe;
 #endif
 };
 
@@ -252,6 +253,7 @@ static inline void interrupt_nmi_enter_prepare(struct 
pt_regs *regs, struct inte
 #ifdef CONFIG_PPC64
state->irq_soft_mask = local_paca->irq_soft_mask;
state->irq_happened = local_paca->irq_happened;
+   state->softe = regs->softe;
 
/*
 * Set IRQS_ALL_DISABLED unconditionally so irqs_disabled() does
@@ -311,6 +313,7 @@ static inline void interrupt_nmi_exit_prepare(struct 
pt_regs *regs, struct inter
 
/* Check we didn't change the pending interrupt mask. */
WARN_ON_ONCE((state->irq_happened | PACA_IRQ_HARD_DIS) != 
local_paca->irq_happened);
+   regs->softe = state->softe;
local_paca->irq_happened = state->irq_happened;
local_paca->irq_soft_mask = state->irq_soft_mask;
 #endif
-- 
2.23.0



[PATCH 3/8] powerpc/64s: add a table of implicit soft-masked addresses

2021-06-28 Thread Nicholas Piggin
Commit 9d1988ca87dd ("powerpc/64: treat low kernel text as irqs
soft-masked") ends up catching too much code, including ret_from_fork,
and parts of interrupt and syscall return that do not expect
interrupts to be soft-masked. If an interrupt gets marked pending,
and then the code proceeds out of the implicit soft-masked region,
it will fail to deal with the pending interrupt.

Fix this by adding a new table of addresses which explicitly marks
the regions of code that are soft masked. This table is only checked
for interrupts that occur below __end_soft_masked, so most kernel interrupts
will not have the overhead of the table search.

Fixes: 9d1988ca87dd ("powerpc/64: treat low kernel text as irqs soft-masked")
Reported-by: Sachin Sant 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h |  5 ++-
 arch/powerpc/include/asm/ppc_asm.h   |  7 
 arch/powerpc/kernel/exceptions-64s.S | 55 
 arch/powerpc/kernel/interrupt_64.S   |  8 
 arch/powerpc/kernel/vmlinux.lds.S|  9 +
 arch/powerpc/lib/restart_table.c | 26 +
 6 files changed, 100 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index f2481fac7f7f..d7df247a149c 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -75,11 +75,12 @@
 
 #ifdef CONFIG_PPC_BOOK3S_64
 extern char __end_soft_masked[];
+bool search_kernel_soft_mask_table(unsigned long addr);
 unsigned long search_kernel_restart_table(unsigned long addr);
 
 DECLARE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant);
 
-bool is_implicit_soft_masked(struct pt_regs *regs)
+static inline bool is_implicit_soft_masked(struct pt_regs *regs)
 {
if (regs->msr & MSR_PR)
return false;
@@ -87,7 +88,7 @@ bool is_implicit_soft_masked(struct pt_regs *regs)
if (regs->nip >= (unsigned long)__end_soft_masked)
return false;
 
-   return true;
+   return search_kernel_soft_mask_table(regs->nip);
 }
 
 static inline void srr_regs_clobbered(void)
diff --git a/arch/powerpc/include/asm/ppc_asm.h 
b/arch/powerpc/include/asm/ppc_asm.h
index c9c2c36c1f8f..116c1519728a 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -762,6 +762,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, 
CPU_FTR_CELL_TB_BUG, 96)
stringify_in_c(.long (_target) - . ;)   \
stringify_in_c(.previous)
 
+#define SOFT_MASK_TABLE(_start, _end)  \
+   stringify_in_c(.section __soft_mask_table,"a";)\
+   stringify_in_c(.balign 8;)  \
+   stringify_in_c(.llong (_start);)\
+   stringify_in_c(.llong (_end);)  \
+   stringify_in_c(.previous)
+
 #define RESTART_TABLE(_start, _end, _target)   \
stringify_in_c(.section __restart_table,"a";)\
stringify_in_c(.balign 8;)  \
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index ecd07bf604c5..3a58c3fd6de4 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -428,21 +428,30 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
 
/* If coming from user, skip soft-mask tests. */
andi.   r10,r12,MSR_PR
-   bne 2f
+   bne 3f
 
/*
-* Kernel code running below __end_soft_masked is implicitly
-* soft-masked
+* Kernel code running below __end_soft_masked may be
+* implicitly soft-masked if it is within the regions
+* in the soft mask table.
 */
LOAD_HANDLER(r10, __end_soft_masked)
cmpld   r11,r10
-
+   bge+1f
+
+   /* SEARCH_SOFT_MASK_TABLE clobbers r9,r10,r12 */
+   stw r9,PACA_EXGEN+EX_CCR(r13)
+   SEARCH_SOFT_MASK_TABLE
+   lwz r9,PACA_EXGEN+EX_CCR(r13)
+   cmpdi   r12,0
+   mfspr   r12,SPRN_SRR1   /* Restore r12 to SRR1 */
+   beq 1f  /* Not in soft-mask table */
li  r10,IMASK
-   blt-1f
+   b   2f  /* In soft-mask table, always mask */
 
/* Test the soft mask state against our interrupt's bit */
-   lbz r10,PACAIRQSOFTMASK(r13)
-1: andi.   r10,r10,IMASK
+1: lbz r10,PACAIRQSOFTMASK(r13)
+2: andi.   r10,r10,IMASK
/* Associate vector numbers with bits in paca->irq_happened */
.if IVEC == 0x500 || IVEC == 0xea0
li  r10,PACA_IRQ_EE
@@ -473,7 +482,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
 
.if ISTACK
andi.   r10,r12,MSR_PR  /* See if coming from user  */
-2: mr  r10,r1  /* Save r1  */
+3:   

[PATCH 2/8] powerpc/64e: remove implicit soft-masking and interrupt exit restart logic

2021-06-28 Thread Nicholas Piggin
The implicit soft-masking to speed up interrupt return was going to be
used by 64e as well, but it was not ready in time. 64e always disables
MSR[EE] when exiting from interrupt and syscall.

Disable it for now.

Fixes: 9d1988ca87dd ("powerpc/64: treat low kernel text as irqs soft-masked")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h | 33 
 arch/powerpc/kernel/exceptions-64e.S | 12 +-
 arch/powerpc/kernel/interrupt_64.S   | 16 +-
 3 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 8b4b1e84e110..f2481fac7f7f 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -73,20 +73,34 @@
 #include 
 #include 
 
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S_64
 extern char __end_soft_masked[];
 unsigned long search_kernel_restart_table(unsigned long addr);
-#endif
 
-#ifdef CONFIG_PPC_BOOK3S_64
 DECLARE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant);
 
+bool is_implicit_soft_masked(struct pt_regs *regs)
+{
+   if (regs->msr & MSR_PR)
+   return false;
+
+   if (regs->nip >= (unsigned long)__end_soft_masked)
+   return false;
+
+   return true;
+}
+
 static inline void srr_regs_clobbered(void)
 {
local_paca->srr_valid = 0;
local_paca->hsrr_valid = 0;
 }
 #else
+static inline bool is_implicit_soft_masked(struct pt_regs *regs)
+{
+   return false;
+}
+
 static inline void srr_regs_clobbered(void)
 {
 }
@@ -150,11 +164,13 @@ static inline void interrupt_enter_prepare(struct pt_regs 
*regs, struct interrup
 */
if (TRAP(regs) != INTERRUPT_PROGRAM) {
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
-   BUG_ON(regs->nip < (unsigned long)__end_soft_masked);
+   BUG_ON(is_implicit_soft_masked(regs));
}
+#ifdef CONFIG_PPC_BOOK3S
/* Move this under a debugging check */
if (arch_irq_disabled_regs(regs))
BUG_ON(search_kernel_restart_table(regs->nip));
+#endif
}
 #endif
 
@@ -244,10 +260,9 @@ static inline void interrupt_nmi_enter_prepare(struct 
pt_regs *regs, struct inte
local_paca->irq_soft_mask = IRQS_ALL_DISABLED;
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
 
-   if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !(regs->msr & MSR_PR) &&
-   regs->nip < (unsigned long)__end_soft_masked) {
-   // Kernel code running below __end_soft_masked is
-   // implicitly soft-masked.
+   if (is_implicit_soft_masked(regs)) {
+   // Adjust regs->softe soft implicit soft-mask, so
+   // arch_irq_disabled_regs(regs) behaves as expected.
regs->softe = IRQS_ALL_DISABLED;
}
 
@@ -282,6 +297,7 @@ static inline void interrupt_nmi_exit_prepare(struct 
pt_regs *regs, struct inter
 */
 
 #ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
if (arch_irq_disabled_regs(regs)) {
unsigned long rst = search_kernel_restart_table(regs->nip);
if (rst)
@@ -289,7 +305,6 @@ static inline void interrupt_nmi_exit_prepare(struct 
pt_regs *regs, struct inter
}
 #endif
 
-#ifdef CONFIG_PPC64
if (nmi_disables_ftrace(regs))
this_cpu_set_ftrace_enabled(state->ftrace_enabled);
 
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index d634bfceed2c..1401787b0b93 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -342,17 +342,7 @@ ret_from_mc_except:
 #define PROLOG_ADDITION_MASKABLE_GEN(n)
\
lbz r10,PACAIRQSOFTMASK(r13);   /* are irqs soft-masked? */ \
andi.   r10,r10,IRQS_DISABLED;  /* yes -> go out of line */ \
-   bne masked_interrupt_book3e_##n;\
-   /* Kernel code below __end_soft_masked is implicitly masked */  \
-   andi.   r10,r11,MSR_PR; \
-   bne 1f; /* user -> not masked */\
-   std r14,PACA_EXGEN+EX_R14(r13); \
-   LOAD_REG_IMMEDIATE_SYM(r14, r10, __end_soft_masked);\
-   mfspr   r10,SPRN_SRR0;  \
-   cmpld   r10,r14;\
-   ld  r14,PACA_EXGEN+EX_R14(r13); \
-   blt masked_interrupt_book3e_##n;\
-1:
+   bne masked_interrupt_book3e_##n
 
 /*
  * Additional regs must be re-loaded from paca before EXCEPTION_COMMON* is
diff --git a/arch/powerpc/kernel/interrupt_64.S 
b/arch/powerpc/kernel/interrupt_64.S
index 

[PATCH 1/8] powerpc/64e: fix CONFIG_RELOCATABLE build

2021-06-28 Thread Nicholas Piggin
Commit 24d33ac5b8ff ("powerpc/64s: Make prom_init require RELOCATABLE")
also made my 64e config require RELOCATABLE, which results in compile
failures.

Whether or not that's the right thing to do for prom_init for 64e, this
fixes CONFIG_RELOCATABLE=y compile errors. That commit is marked as
being fixed, but only because that's what caused the compile error to
show up for a given config.

This passes basic qemu testing.

Fixes: 24d33ac5b8ff ("powerpc/64s: Make prom_init require RELOCATABLE")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64e.S | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 22fcd95dd8dc..d634bfceed2c 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -912,8 +912,14 @@ kernel_dbg_exc:
b   interrupt_return
 
 .macro SEARCH_RESTART_TABLE
+#ifdef CONFIG_RELOCATABLE
+   ld  r11,PACATOC(r13)
+   ld  r14,__start___restart_table@got(r11)
+   ld  r15,__stop___restart_table@got(r11)
+#else
LOAD_REG_IMMEDIATE_SYM(r14, r11, __start___restart_table)
LOAD_REG_IMMEDIATE_SYM(r15, r11, __stop___restart_table)
+#endif
 300:
cmpdr14,r15
beq 302f
@@ -1329,7 +1335,12 @@ a2_tlbinit_code_start:
 a2_tlbinit_after_linear_map:
 
/* Now we branch the new virtual address mapped by this entry */
+#ifdef CONFIG_RELOCATABLE
+   ld  r5,PACATOC(r13)
+   ld  r3,1f@got(r5)
+#else
LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f)
+#endif
mtctr   r3
bctr
 
-- 
2.23.0



[PATCH 0/8] powerpc: fast interrupt exit bug and misc fixes

2021-06-28 Thread Nicholas Piggin
This is a bunch of fixes for powerpc next, mostly a nasty hole in fast
interrupt exit code found by Sachin and some other bits along the way
while looking at it.

So far this survives about 5 hours of stress testing with a workload
that would trigger it in a few seconds (guest 128 vcpus running kernel
compile loops with perf record -ag running in the background).

Thanks,
Nick

Nicholas Piggin (8):
  powerpc/64e: fix CONFIG_RELOCATABLE build
  powerpc/64e: remove implicit soft-masking and interrupt exit restart
logic
  powerpc/64s: add a table of implicit soft-masked addresses
  powerpc/64s/interrupt: preserve regs->softe for NMI interrupts
  powerpc/64: enable MSR[EE] in irq replay pt_regs
  powerpc/64/interrupts: add missing kprobe annotations on interrupt
exit symbols
  powerpc/64s/interrupt: clean up interrupt return labels
  powerpc/64s: move ret_from_fork etc above __end_soft_masked

 arch/powerpc/include/asm/interrupt.h | 41 ++---
 arch/powerpc/include/asm/ppc_asm.h   |  7 +++
 arch/powerpc/kernel/exceptions-64e.S | 23 +++
 arch/powerpc/kernel/exceptions-64s.S | 55 ++---
 arch/powerpc/kernel/interrupt_64.S   | 90 ++--
 arch/powerpc/kernel/irq.c|  1 +
 arch/powerpc/kernel/vmlinux.lds.S|  9 +++
 arch/powerpc/lib/restart_table.c | 26 
 8 files changed, 194 insertions(+), 58 deletions(-)

-- 
2.23.0



Re: [PATCH v4 7/7] powerpc/pseries: Add support for FORM2 associativity

2021-06-28 Thread David Gibson
On Thu, Jun 24, 2021 at 01:50:34PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Thu, Jun 17, 2021 at 10:21:05PM +0530, Aneesh Kumar K.V wrote:
> >> PAPR interface currently supports two different ways of communicating 
> >> resource
> >> grouping details to the OS. These are referred to as Form 0 and Form 1
> >> associativity grouping. Form 0 is the older format and is now considered
> >> deprecated. This patch adds another resource grouping named FORM2.
> >> 
> >> Signed-off-by: Daniel Henrique Barboza 
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  Documentation/powerpc/associativity.rst   | 135 
> >>  arch/powerpc/include/asm/firmware.h   |   3 +-
> >>  arch/powerpc/include/asm/prom.h   |   1 +
> >>  arch/powerpc/kernel/prom_init.c   |   3 +-
> >>  arch/powerpc/mm/numa.c| 149 +-
> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >>  6 files changed, 286 insertions(+), 6 deletions(-)
> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> 
> >> diff --git a/Documentation/powerpc/associativity.rst 
> >> b/Documentation/powerpc/associativity.rst
> >> new file mode 100644
> >> index ..93be604ac54d
> >> --- /dev/null
> >> +++ b/Documentation/powerpc/associativity.rst
> >> @@ -0,0 +1,135 @@
> >> +
> >> +NUMA resource associativity
> >> +=
> >> +
> >> +Associativity represents the groupings of the various platform resources 
> >> into
> >> +domains of substantially similar mean performance relative to resources 
> >> outside
> >> +of that domain. Resources subsets of a given domain that exhibit better
> >> +performance relative to each other than relative to other resources 
> >> subsets
> >> +are represented as being members of a sub-grouping domain. This 
> >> performance
> >> +characteristic is presented in terms of NUMA node distance within the 
> >> Linux kernel.
> >> +From the platform view, these groups are also referred to as domains.
> >> +
> >> +PAPR interface currently supports different ways of communicating these 
> >> resource
> >> +grouping details to the OS. These are referred to as Form 0, Form 1 and 
> >> Form2
> >> +associativity grouping. Form 0 is the older format and is now considered 
> >> deprecated.
> >> +
> >> +Hypervisor indicates the type/form of associativity used via 
> >> "ibm,arcitecture-vec-5 property".
> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage 
> >> of Form 0 or Form 1.
> >> +A value of 1 indicates the usage of Form 1 associativity. For Form 2 
> >> associativity
> >> +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
> >> +
> >> +Form 0
> >> +-
> >> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
> >> +
> >> +Form 1
> >> +-
> >> +With Form 1 a combination of ibm,associativity-reference-points and 
> >> ibm,associativity
> >> +device tree properties are used to determine the NUMA distance between 
> >> resource groups/domains.
> >> +
> >> +The “ibm,associativity” property contains one or more lists of numbers 
> >> (domainID)
> >> +representing the resource’s platform grouping domains.
> >> +
> >> +The “ibm,associativity-reference-points” property contains one or more 
> >> list of numbers
> >> +(domainID index) that represents the 1 based ordinal in the associativity 
> >> lists.
> >> +The list of domainID index represnets increasing hierachy of
> >> resource grouping.
> >
> > Typo "represnets".  Also s/hierachy/hierarchy/
> >
> >> +
> >> +ex:
> >> +{ primary domainID index, secondary domainID index, tertiary domainID 
> >> index.. }
> >
> >> +Linux kernel uses the domainID at the primary domainID index as the NUMA 
> >> node id.
> >> +Linux kernel computes NUMA distance between two domains by recursively 
> >> comparing
> >> +if they belong to the same higher-level domains. For mismatch at every 
> >> higher
> >> +level of the resource group, the kernel doubles the NUMA distance between 
> >> the
> >> +comparing domains.
> >
> > The Form1 description is still kinda confusing, but I don't really
> > care.  Form1 *is* confusing, it's Form2 that I hope will be clearer.
> >
> >> +
> >> +Form 2
> >> +---
> >> +Form 2 associativity format adds separate device tree properties 
> >> representing NUMA node distance
> >> +thereby making the node distance computation flexible. Form 2 also allows 
> >> flexible primary
> >> +domain numbering. With numa distance computation now detached from the 
> >> index value of
> >> +"ibm,associativity" property, Form 2 allows a large number of primary 
> >> domain ids at the
> >> +same domainID index representing resource groups of different 
> >> performance/latency characteristics.
> >
> > So, see you've removed the special handling of secondary IDs for pmem
> > - big improvement, thanks.  IIUC, in this revised version, for Form2
> > there's really no reason for 

[RFC] fpga: dfl: fme: Fix cpu hotplug code

2021-06-28 Thread Kajol Jain
Commit 724142f8c42a ("fpga: dfl: fme: add performance
reporting support") added performance reporting support
for the FPGA management engine via perf.

It also added a cpu hotplug feature but didn't add a
pmu migration call in the cpu offline function.
This can create an issue in case the current designated
cpu being used to collect fme pmu data goes offline:
since the current code does not migrate the fme pmu to a
new target cpu, perf will still try to fetch data from
the offline cpu and hence we will not get counter data.

Patch fixes this issue by adding a perf_pmu_migrate_context()
call in fme_perf_offline_cpu().

Fixes: 724142f8c42a ("fpga: dfl: fme: add performance reporting support")
Signed-off-by: Kajol Jain 
---
 drivers/fpga/dfl-fme-perf.c | 4 
 1 file changed, 4 insertions(+)

---
- This fix patch is not tested (as I don't have the required environment).
  But the issue mentioned in the commit msg can be re-created by starting any
  fme_perf event and, while it is still running, offlining the current designated
  cpu pointed to by the cpumask file. Since the current code doesn't migrate the
  pmu, perf will keep trying to get counts from that offlined cpu and hence we
  will not get event data.
---
diff --git a/drivers/fpga/dfl-fme-perf.c b/drivers/fpga/dfl-fme-perf.c
index 4299145ef347..b9a54583e505 100644
--- a/drivers/fpga/dfl-fme-perf.c
+++ b/drivers/fpga/dfl-fme-perf.c
@@ -953,6 +953,10 @@ static int fme_perf_offline_cpu(unsigned int cpu, struct 
hlist_node *node)
return 0;
 
priv->cpu = target;
+
+   /* Migrate fme_perf pmu events to the new target cpu */
+   perf_pmu_migrate_context(&priv->pmu, cpu, target);
+
return 0;
 }
 
-- 
2.31.1
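
For reference, a rough sketch of those reproduction steps (the PMU instance and
event names below are placeholders, not taken from the driver; check
/sys/bus/event_source/devices/ for the actual dfl_fme device on the system):

  # start any fme_perf event system-wide and leave it running
  perf stat -e dfl_fmeX/fab_mmio_read/ -a sleep 60 &

  # find the designated cpu currently used to collect the counts
  cat /sys/bus/event_source/devices/dfl_fmeX/cpumask    # e.g. prints 4

  # offline that cpu; without the fix the running counts stop updating
  echo 0 > /sys/devices/system/cpu/cpu4/online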



Re: [PATCH] perf script python: Fix buffer size to report iregs in perf script

2021-06-28 Thread Nageswara Sastry

Tested by creating perf-script.py using perf script
and printing the iregs. More register values are seen with this patch.


Tested-by: Nageswara R Sastry 

On 28/06/21 11:53 am, Kajol Jain wrote:

Commit 48a1f565261d ("perf script python: Add more PMU fields
to event handler dict") added functionality to report fields like
weight, iregs, uregs etc via perf report.
That commit predefined a 512-byte buffer to print those fields.

But in case of powerpc, since we added extended regs support
in commits:

Commit 068aeea3773a ("perf powerpc: Support exposing Performance Monitor
Counter SPRs as part of extended regs")
Commit d735599a069f ("powerpc/perf: Add extended regs support for
power10 platform")

Now iregs can carry more bytes of data, and this predefined buffer size
can result in data loss in the perf script output.

Patch resolves this issue by making the buffer size dynamic, based on the
number of registers that need to be printed. It also changes the return type
of the function "regs_map" from int to void, as the return value is not being
used by the caller function "set_regs_in_dict".

Fixes: 068aeea3773a ("perf powerpc: Support exposing Performance Monitor
Counter SPRs as part of extended regs")
Signed-off-by: Kajol Jain 
---
  .../util/scripting-engines/trace-event-python.c | 17 -
  1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/scripting-engines/trace-event-python.c 
b/tools/perf/util/scripting-engines/trace-event-python.c
index 4e4aa4c97ac5..c8c9706b4643 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -687,7 +687,7 @@ static void set_sample_datasrc_in_dict(PyObject *dict,
_PyUnicode_FromString(decode));
  }
  
-static int regs_map(struct regs_dump *regs, uint64_t mask, char *bf, int size)

+static void regs_map(struct regs_dump *regs, uint64_t mask, char *bf, int size)
  {
unsigned int i = 0, r;
int printed = 0;
@@ -695,7 +695,7 @@ static int regs_map(struct regs_dump *regs, uint64_t mask, 
char *bf, int size)
bf[0] = 0;
  
  	if (!regs || !regs->regs)

-   return 0;
+   return;
  
	for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {

u64 val = regs->regs[i++];
@@ -704,8 +704,6 @@ static int regs_map(struct regs_dump *regs, uint64_t mask, 
char *bf, int size)
 "%5s:0x%" PRIx64 " ",
 perf_reg_name(r), val);
}
-
-   return printed;
  }
  
  static void set_regs_in_dict(PyObject *dict,

@@ -713,7 +711,16 @@ static void set_regs_in_dict(PyObject *dict,
 struct evsel *evsel)
  {
	struct perf_event_attr *attr = &evsel->core.attr;
-   char bf[512];
+
+   /*
+* Here value 28 is a constant size which can be used to print
+* one register value and its corresponds to:
+* 16 chars is to specify 64 bit register in hexadecimal.
+* 2 chars is for appending "0x" to the hexadecimal value and
+* 10 chars is for register name.
+*/
+   int size = __sw_hweight64(attr->sample_regs_intr) * 28;
+   char bf[size];
  
  	regs_map(&sample->intr_regs, attr->sample_regs_intr, bf, sizeof(bf));
  



--
Thanks and Regards
R.Nageswara Sastry


[PATCH] powerpc/4xx: Fix setup_kuep() on SMP

2021-06-28 Thread Christophe Leroy
On SMP, setup_kuep() is also called from start_secondary() since
commit 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").

start_secondary() is not an __init function.

Remove the __init marker from setup_kuep() and bail out when it is
not called on the first CPU, as the work is already done.

Reported-by: kernel test robot 
Fixes: 10248dcba120 ("powerpc/44x: Implement Kernel Userspace Exec Protection 
(KUEP)")
Fixes: 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/nohash/44x.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
index 7da6d1e9fc9b..20c18bd5b9a0 100644
--- a/arch/powerpc/mm/nohash/44x.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -241,8 +241,11 @@ void __init mmu_init_secondary(int cpu)
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
+   if (smp_processor_id() != boot_cpuid)
+   return;
+
if (disabled)
patch_instruction_site(__tlb_44x_kuep, 
ppc_inst(PPC_RAW_NOP()));
else
-- 
2.25.0



[PATCH] powerpc/32s: Fix setup_{kuap/kuep}() on SMP

2021-06-28 Thread Christophe Leroy
On SMP, setup_kup() is also called from start_secondary().

start_secondary() is not an __init function.

Remove the __init marker from setup_kuep() and setup_kuap().

Reported-by: kernel test robot 
Fixes: 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/book3s32/kuap.c | 2 +-
 arch/powerpc/mm/book3s32/kuep.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c
index 9df6911b8fde..0f920f09af57 100644
--- a/arch/powerpc/mm/book3s32/kuap.c
+++ b/arch/powerpc/mm/book3s32/kuap.c
@@ -18,7 +18,7 @@ void kuap_unlock_all_ool(void)
 }
 EXPORT_SYMBOL(kuap_unlock_all_ool);
 
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
if (!disabled)
kuap_lock_all_ool();
diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c
index 3f6eb6e23fca..c20733d6e02c 100644
--- a/arch/powerpc/mm/book3s32/kuep.c
+++ b/arch/powerpc/mm/book3s32/kuep.c
@@ -5,7 +5,7 @@
 
 struct static_key_false disable_kuep_key;
 
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
if (!disabled)
kuep_lock();
-- 
2.25.0



[PATCH v2] perf vendor events power10: Adds 24x7 nest metric events for power10 platform

2021-06-28 Thread Kajol Jain
Patch adds 24x7 nest metric events for POWER10.

Tested-by: Nageswara R Sastry 
Signed-off-by: Kajol Jain 
---
 .../arch/powerpc/power10/nest_metrics.json| 424 ++
 1 file changed, 424 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json

---
Changelog:
v1 -> v2
- Removed "BriefDescription" field as its value was same as "MetricName"
  field as suggested by Paul A. Clarke
- Added Tested-by tag.
---
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json 
b/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
new file mode 100644
index ..8ba3e81c9808
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
@@ -0,0 +1,424 @@
+[
+{
+  "MetricName": "VEC_GROUP_PUMP_RETRY_RATIO_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PB_VG_PUMP01\\,chip\\=?@) * 100",
+  "ScaleUnit": "1%",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "VEC_GROUP_PUMP_RETRY_RATIO_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PB_VG_PUMP23\\,chip\\=?@) * 100",
+  "ScaleUnit": "1%",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "LOCAL_NODE_PUMP_RETRY_RATIO_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_LNS_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PB_LNS_PUMP01\\,chip\\=?@) * 100",
+  "ScaleUnit": "1%",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "LOCAL_NODE_PUMP_RETRY_RATIO_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_LNS_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PB_LNS_PUMP23\\,chip\\=?@) * 100",
+  "ScaleUnit": "1%",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "GROUP_PUMP_RETRY_RATIO_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_GROUP_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PB_GROUP_PUMP01\\,chip\\=?@) * 100",
+  "ScaleUnit": "1%",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "GROUP_PUMP_RETRY_RATIO_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_GROUP_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PB_GROUP_PUMP23\\,chip\\=?@) * 100",
+  "ScaleUnit": "1%",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_GROUP_PUMPS_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_GROUP_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_GROUP_PUMPS_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_GROUP_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_GROUP_PUMPS_RETRIES_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_GROUP_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_GROUP_PUMPS_RETRIES_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_GROUP_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "REMOTE_NODE_PUMPS_RETRIES_RATIO_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_RNS_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PB_RNS_PUMP01\\,chip\\=?@) * 100",
+  "ScaleUnit": "1%",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "REMOTE_NODE_PUMPS_RETRIES_RATIO_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_RNS_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PB_RNS_PUMP23\\,chip\\=?@) * 100",
+  "ScaleUnit": "1%",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_VECTOR_GROUP_PUMPS_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_VG_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_VECTOR_GROUP_PUMPS_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_VG_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_LOCAL_NODE_PUMPS_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_LNS_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_LOCAL_NODE_PUMPS_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_LNS_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_VECTOR_GROUP_PUMPS_RETRIES_P01",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP01\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+},
+{
+  "MetricName": "TOTAL_VECTOR_GROUP_PUMPS_RETRIES_P23",
+  "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP23\\,chip\\=?@ / 
hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
+  "ScaleUnit": "4",
+  "AggregationMode": "PerChip"
+ 
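
As a usage sketch once this JSON is built into perf (this assumes a PowerVM
LPAR where the hv_24x7 PMU is available; the exact invocation may differ):

  # check that the new metrics are visible
  perf list metric 2>/dev/null | grep GROUP_PUMP_RETRY_RATIO

  # request one of them system-wide for a few seconds
  perf stat -M GROUP_PUMP_RETRY_RATIO_P01 -a sleep 10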

Re: [PATCH] perf vendor events power10: Adds 24x7 nest metric events for power10 platform

2021-06-28 Thread kajoljain



On 6/25/21 6:51 PM, Paul A. Clarke wrote:
> On Fri, Jun 25, 2021 at 05:29:48PM +0530, Kajol Jain wrote:
>> Patch adds 24x7 nest metric events for POWER10.
>>
>> Signed-off-by: Kajol Jain 
>> ---
>>  .../arch/powerpc/power10/nest_metrics.json| 491 ++
>>  1 file changed, 491 insertions(+)
>>  create mode 100644 
>> tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
>>
>> diff --git a/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json 
>> b/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
>> new file mode 100644
>> index ..b79046cd8b09
>> --- /dev/null
>> +++ b/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
>> @@ -0,0 +1,491 @@
>> +[
>> +{
>> +  "MetricName": "VEC_GROUP_PUMP_RETRY_RATIO_P01",
>> +  "BriefDescription": "VEC_GROUP_PUMP_RETRY_RATIO_P01",
> 
> Is it possible to get better descriptions than just a restatement of the
> name, or no description at all?
> 
> This comment obviously applies to almost all of the metrics herein.

Hi Paul,
   Thanks for reviewing the patch. Sure I will remove description part for now.


> 
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP01\\,chip\\=?@ / 
>> hv_24x7@PM_PB_VG_PUMP01\\,chip\\=?@) * 100",
>> +  "ScaleUnit": "1%",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "VEC_GROUP_PUMP_RETRY_RATIO_P23",
>> +  "BriefDescription": "VEC_GROUP_PUMP_RETRY_RATIO_P23",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP23\\,chip\\=?@ / 
>> hv_24x7@PM_PB_VG_PUMP23\\,chip\\=?@) * 100",
>> +  "ScaleUnit": "1%",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "LOCAL_NODE_PUMP_RETRY_RATIO_P01",
>> +  "BriefDescription": "LOCAL_NODE_PUMP_RETRY_RATIO_P01",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_LNS_PUMP01\\,chip\\=?@ / 
>> hv_24x7@PM_PB_LNS_PUMP01\\,chip\\=?@) * 100",
>> +  "ScaleUnit": "1%",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "LOCAL_NODE_PUMP_RETRY_RATIO_P23",
>> +  "BriefDescription": "LOCAL_NODE_PUMP_RETRY_RATIO_P23",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_LNS_PUMP23\\,chip\\=?@ / 
>> hv_24x7@PM_PB_LNS_PUMP23\\,chip\\=?@) * 100",
>> +  "ScaleUnit": "1%",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "GROUP_PUMP_RETRY_RATIO_P01",
>> +  "BriefDescription": "GROUP_PUMP_RETRY_RATIO_P01",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_GROUP_PUMP01\\,chip\\=?@ / 
>> hv_24x7@PM_PB_GROUP_PUMP01\\,chip\\=?@) * 100",
>> +  "ScaleUnit": "1%",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "GROUP_PUMP_RETRY_RATIO_P23",
>> +  "BriefDescription": "GROUP_PUMP_RETRY_RATIO_P23",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_GROUP_PUMP23\\,chip\\=?@ / 
>> hv_24x7@PM_PB_GROUP_PUMP23\\,chip\\=?@) * 100",
>> +  "ScaleUnit": "1%",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "TOTAL_GROUP_PUMPS_P01",
>> +  "BriefDescription": "TOTAL_GROUP_PUMPS_P01(PER-CYC)",
>> +  "MetricExpr": "(hv_24x7@PM_PB_GROUP_PUMP01\\,chip\\=?@ / 
>> hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
>> +  "ScaleUnit": "4",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "TOTAL_GROUP_PUMPS_P23",
>> +  "BriefDescription": "TOTAL_GROUP_PUMPS_P23(PER-CYC)",
>> +  "MetricExpr": "(hv_24x7@PM_PB_GROUP_PUMP23\\,chip\\=?@ / 
>> hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
>> +  "ScaleUnit": "4",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "TOTAL_GROUP_PUMPS_RETRIES_P01",
>> +  "BriefDescription": "TOTAL_GROUP_PUMPS_RETRIES_P01(PER-CYC)",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_GROUP_PUMP01\\,chip\\=?@ / 
>> hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
>> +  "ScaleUnit": "4",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "TOTAL_GROUP_PUMPS_RETRIES_P23",
>> +  "BriefDescription": "TOTAL_GROUP_PUMPS_RETRIES_P23(PER-CYC)",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_GROUP_PUMP23\\,chip\\=?@ / 
>> hv_24x7@PM_PAU_CYC\\,chip\\=?@)",
>> +  "ScaleUnit": "4",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "REMOTE_NODE_PUMPS_RETRIES_RATIO_P01",
>> +  "BriefDescription": "REMOTE_NODE_PUMPS_RETRIES_RATIO_P01",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_RNS_PUMP01\\,chip\\=?@ / 
>> hv_24x7@PM_PB_RNS_PUMP01\\,chip\\=?@) * 100",
>> +  "ScaleUnit": "1%",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "REMOTE_NODE_PUMPS_RETRIES_RATIO_P23",
>> +  "BriefDescription": "REMOTE_NODE_PUMPS_RETRIES_RATIO_P23",
>> +  "MetricExpr": "(hv_24x7@PM_PB_RTY_RNS_PUMP23\\,chip\\=?@ / 
>> hv_24x7@PM_PB_RNS_PUMP23\\,chip\\=?@) * 100",
>> +  "ScaleUnit": "1%",
>> +  "AggregationMode": "PerChip"
>> +},
>> +{
>> +  "MetricName": "TOTAL_VECTOR_GROUP_PUMPS_P01",
>> +  

[PATCH] perf script python: Fix buffer size to report iregs in perf script

2021-06-28 Thread Kajol Jain
Commit 48a1f565261d ("perf script python: Add more PMU fields
to event handler dict") added functionality to report fields like
weight, iregs, uregs etc via perf report.
That commit predefined a 512-byte buffer to print those fields.

But in case of powerpc, since we added extended regs support
in commits:

Commit 068aeea3773a ("perf powerpc: Support exposing Performance Monitor
Counter SPRs as part of extended regs")
Commit d735599a069f ("powerpc/perf: Add extended regs support for
power10 platform")

Now iregs can carry more bytes of data, and this predefined buffer size
can result in data loss in the perf script output.

Patch resolves this issue by making the buffer size dynamic, based on the
number of registers that need to be printed. It also changes the return type
of the function "regs_map" from int to void, as the return value is not being
used by the caller function "set_regs_in_dict".

Fixes: 068aeea3773a ("perf powerpc: Support exposing Performance Monitor
Counter SPRs as part of extended regs")
Signed-off-by: Kajol Jain 
---
 .../util/scripting-engines/trace-event-python.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/scripting-engines/trace-event-python.c 
b/tools/perf/util/scripting-engines/trace-event-python.c
index 4e4aa4c97ac5..c8c9706b4643 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -687,7 +687,7 @@ static void set_sample_datasrc_in_dict(PyObject *dict,
_PyUnicode_FromString(decode));
 }
 
-static int regs_map(struct regs_dump *regs, uint64_t mask, char *bf, int size)
+static void regs_map(struct regs_dump *regs, uint64_t mask, char *bf, int size)
 {
unsigned int i = 0, r;
int printed = 0;
@@ -695,7 +695,7 @@ static int regs_map(struct regs_dump *regs, uint64_t mask, 
char *bf, int size)
bf[0] = 0;
 
if (!regs || !regs->regs)
-   return 0;
+   return;
 
	for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
u64 val = regs->regs[i++];
@@ -704,8 +704,6 @@ static int regs_map(struct regs_dump *regs, uint64_t mask, 
char *bf, int size)
 "%5s:0x%" PRIx64 " ",
 perf_reg_name(r), val);
}
-
-   return printed;
 }
 
 static void set_regs_in_dict(PyObject *dict,
@@ -713,7 +711,16 @@ static void set_regs_in_dict(PyObject *dict,
 struct evsel *evsel)
 {
	struct perf_event_attr *attr = &evsel->core.attr;
-   char bf[512];
+
+   /*
+* Here value 28 is a constant size which can be used to print
+* one register value and its corresponds to:
+* 16 chars is to specify 64 bit register in hexadecimal.
+* 2 chars is for appending "0x" to the hexadecimal value and
+* 10 chars is for register name.
+*/
+   int size = __sw_hweight64(attr->sample_regs_intr) * 28;
+   char bf[size];
 
	regs_map(&sample->intr_regs, attr->sample_regs_intr, bf, sizeof(bf));
 
-- 
2.31.1
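
To put rough numbers on the truncation described above: at 28 bytes per
register, the old fixed buffer covers only about 18 registers

  512 / 28 ≈ 18.3 registers' worth of output

so a sample_regs_intr mask with more bits set than that, which the commit
message says is the case once the powerpc extended regs are exposed, no longer
fits and the tail of the register dump is lost.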



Re: [PATCH v3] mm: pagewalk: Fix walk for hugepage tables

2021-06-28 Thread Christophe Leroy




On 28/06/2021 at 08:03, Aneesh Kumar K.V wrote:

Christophe Leroy  writes:


Pagewalk ignores hugepd entries and walks down the tables
as if they were traditional entries, leading to crazy results.


But we do handle hugetlb separately

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

Are we using hugepd format for non hugetlb entries?


Yes, on the 8xx we use hugepd for 8M pages for linear mapping and for kasan shadow mapping (see
commit bb5f33c06940 ("Merge "Use hugepages to map kernel mem on 8xx" into next")).


And I'm working on implementing huge VMAP with 8M pages, that will also make 
use of hugepd.





Add walk_hugepd_range() and use it to walk hugepage tables.

Signed-off-by: Christophe Leroy 
Reviewed-by: Steven Price 
---
v3:
- Rebased on next-20210624 (no change since v2)
- Added Steven's Reviewed-by
- Sent as standalone for merge via mm

v2:
- Add a guard for NULL ops->pte_entry
- Take mm->page_table_lock when walking hugepage table, as suggested by 
follow_huge_pd()
---
  mm/pagewalk.c | 58 ++-
  1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..9b3db11a4d1d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,45 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, 
unsigned long end,
return err;
  }
  
+#ifdef CONFIG_ARCH_HAS_HUGEPD

+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   int err = 0;
+   const struct mm_walk_ops *ops = walk->ops;
+   int shift = hugepd_shift(*phpd);
+   int page_size = 1 << shift;
+
+   if (!ops->pte_entry)
+   return 0;
+
+   if (addr & (page_size - 1))
+   return 0;
+
+   for (;;) {
+   pte_t *pte;
+
+   spin_lock(>mm->page_table_lock);
+   pte = hugepte_offset(*phpd, addr, pdshift);
+   err = ops->pte_entry(pte, addr, addr + page_size, walk);
+   spin_unlock(>mm->page_table_lock);
+
+   if (err)
+   break;
+   if (addr >= end - page_size)
+   break;
+   addr += page_size;
+   }
+   return err;
+}
+#else
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   return 0;
+}
+#endif
+
  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  struct mm_walk *walk)
  {
@@ -108,7 +147,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, 
unsigned long end,
goto again;
}
  
-		err = walk_pte_range(pmd, addr, next, walk);

+   if (is_hugepd(__hugepd(pmd_val(*pmd
+   err = walk_hugepd_range((hugepd_t *)pmd, addr, next, 
walk, PMD_SHIFT);
+   else
+   err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -157,7 +199,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, 
unsigned long end,
if (pud_none(*pud))
goto again;
  
-		err = walk_pmd_range(pud, addr, next, walk);

+   if (is_hugepd(__hugepd(pud_val(*pud
+   err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
walk, PUD_SHIFT);
+   else
+   err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -189,7 +234,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, 
unsigned long end,
if (err)
break;
}
-   if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+   if (is_hugepd(__hugepd(p4d_val(*p4d
+   err = walk_hugepd_range((hugepd_t *)p4d, addr, next, 
walk, P4D_SHIFT);
+   else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -224,8 +271,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long 
end,
if (err)
break;
}
-   if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
-   ops->pte_entry)
+   if (is_hugepd(__hugepd(pgd_val(*pgd
+   err = walk_hugepd_range((hugepd_t *)pgd, addr, next, 
walk, 

Re: [PATCH v3] mm: pagewalk: Fix walk for hugepage tables

2021-06-28 Thread Aneesh Kumar K.V
Christophe Leroy  writes:

> Pagewalk ignores hugepd entries and walks down the tables
> as if they were traditional entries, leading to crazy results.

But we do handle hugetlb separately

if (vma && is_vm_hugetlb_page(vma)) {
if (ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);

Are we using hugepd format for non hugetlb entries?

>
> Add walk_hugepd_range() and use it to walk hugepage tables.
>
> Signed-off-by: Christophe Leroy 
> Reviewed-by: Steven Price 
> ---
> v3:
> - Rebased on next-20210624 (no change since v2)
> - Added Steven's Reviewed-by
> - Sent as standalone for merge via mm
>
> v2:
> - Add a guard for NULL ops->pte_entry
> - Take mm->page_table_lock when walking hugepage table, as suggested by 
> follow_huge_pd()
> ---
>  mm/pagewalk.c | 58 ++-
>  1 file changed, 53 insertions(+), 5 deletions(-)
>
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index e81640d9f177..9b3db11a4d1d 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -58,6 +58,45 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, 
> unsigned long end,
>   return err;
>  }
>  
> +#ifdef CONFIG_ARCH_HAS_HUGEPD
> +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
> +  unsigned long end, struct mm_walk *walk, int 
> pdshift)
> +{
> + int err = 0;
> + const struct mm_walk_ops *ops = walk->ops;
> + int shift = hugepd_shift(*phpd);
> + int page_size = 1 << shift;
> +
> + if (!ops->pte_entry)
> + return 0;
> +
> + if (addr & (page_size - 1))
> + return 0;
> +
> + for (;;) {
> + pte_t *pte;
> +
> + spin_lock(>mm->page_table_lock);
> + pte = hugepte_offset(*phpd, addr, pdshift);
> + err = ops->pte_entry(pte, addr, addr + page_size, walk);
> + spin_unlock(>mm->page_table_lock);
> +
> + if (err)
> + break;
> + if (addr >= end - page_size)
> + break;
> + addr += page_size;
> + }
> + return err;
> +}
> +#else
> +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
> +  unsigned long end, struct mm_walk *walk, int 
> pdshift)
> +{
> + return 0;
> +}
> +#endif
> +
>  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
> struct mm_walk *walk)
>  {
> @@ -108,7 +147,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long 
> addr, unsigned long end,
>   goto again;
>   }
>  
> - err = walk_pte_range(pmd, addr, next, walk);
> + if (is_hugepd(__hugepd(pmd_val(*pmd
> + err = walk_hugepd_range((hugepd_t *)pmd, addr, next, 
> walk, PMD_SHIFT);
> + else
> + err = walk_pte_range(pmd, addr, next, walk);
>   if (err)
>   break;
>   } while (pmd++, addr = next, addr != end);
> @@ -157,7 +199,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long 
> addr, unsigned long end,
>   if (pud_none(*pud))
>   goto again;
>  
> - err = walk_pmd_range(pud, addr, next, walk);
> + if (is_hugepd(__hugepd(pud_val(*pud
> + err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
> walk, PUD_SHIFT);
> + else
> + err = walk_pmd_range(pud, addr, next, walk);
>   if (err)
>   break;
>   } while (pud++, addr = next, addr != end);
> @@ -189,7 +234,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, 
> unsigned long end,
>   if (err)
>   break;
>   }
> - if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
> + if (is_hugepd(__hugepd(p4d_val(*p4d
> + err = walk_hugepd_range((hugepd_t *)p4d, addr, next, 
> walk, P4D_SHIFT);
> + else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
>   err = walk_pud_range(p4d, addr, next, walk);
>   if (err)
>   break;
> @@ -224,8 +271,9 @@ static int walk_pgd_range(unsigned long addr, unsigned 
> long end,
>   if (err)
>   break;
>   }
> - if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
> - ops->pte_entry)
> + if (is_hugepd(__hugepd(pgd_val(*pgd
> + err = walk_hugepd_range((hugepd_t *)pgd, addr, next, 
> walk, PGDIR_SHIFT);
> + else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || 
> ops->pte_entry)
>   err = walk_p4d_range(pgd, addr, next, walk);
>   if (err)
>