Re: [PATCH kernel] KVM: PPC: Fix TCE handling for VFIO

2022-04-13 Thread David Gibson
On Wed, Apr 06, 2022 at 02:04:16PM +1000, Alexey Kardashevskiy wrote:
> At the moment the IOMMU page size in a pseries VM is 16MB (the biggest
> allowed by LoPAPR), this page size is used for an emulated TCE table.
> If there is a passed through PCI device, then there are hardware IOMMU
> tables with equal or smaller IOMMU page sizes so one emulated IOMMU page
> is backed by power-of-two hardware pages.
> 
> The code wrongly uses the emulated TCE index instead of hardware TCE
> index in error handling. The problem is easier to see on POWER8 with
> multi-level TCE tables (when only the first level is preallocated)
> as hash mode uses real mode TCE hypercall handlers.
> The kernel starts using indirect tables when VMs get bigger than 128GB
> (depends on the max page order).
> The very first real mode hcall is going to fail with H_TOO_HARD as
> in the real mode we cannot allocate memory for TCEs (we can in the virtual
> mode) but on the way out the code attempts to clear hardware TCEs using
> emulated TCE indexes which corrupts random kernel memory because
> it_offset==1<<59 is subtracted from those indexes and the resulting index
> is out of the TCE table bounds.
> 
> This fixes kvmppc_clear_tce() to use the correct TCE indexes.
> 
> While at it, this fixes TCE cache invalidation which uses emulated TCE
> indexes instead of the hardware ones. This went unnoticed as 64bit DMA
> is used these days and VMs map all RAM in one go and only then do DMA
> and this is when the TCE cache gets populated.
> 
> Potentially this could slow down mapping, however normally 16MB
> emulated pages are backed by 64K hardware pages so it is one write to
> the "TCE Kill" per 256 updates which is not that bad considering the size
> of the cache (1024 TCEs or so).
> 
> Fixes: ca1fc489cfa0 ("KVM: PPC: Book3S: Allow backing bigger guest IOMMU 
> pages with smaller physical pages")
> Signed-off-by: Alexey Kardashevskiy 

Reviewed-by: David Gibson 

In addition, I have confirmations from both our QE team and a customer
that they can no longer reproduce the problem with this patch.  So,

Tested-by: David Gibson 

If we can merge this ASAP, that would be great.
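For anyone trying to follow the index arithmetic, here is a minimal
standalone sketch (mine, not part of the patch) of how one emulated TCE
index maps onto hardware TCE indexes when a 16MB guest IOMMU page is
backed by 64K hardware pages:

#include <stdio.h>

int main(void)
{
	unsigned int stt_page_shift = 24;	/* 16MB emulated IOMMU page */
	unsigned int tbl_page_shift = 16;	/* 64K hardware IOMMU page */
	unsigned long entry = 5;		/* example guest TCE index */

	unsigned long subpages = 1UL << (stt_page_shift - tbl_page_shift);
	unsigned long io_entry = entry << (stt_page_shift - tbl_page_shift);

	/* One guest TCE covers 256 hardware TCEs; clearing must walk
	 * io_entry..io_entry + subpages - 1, not 'entry' itself. */
	printf("subpages = %lu, first hw index = %lu\n", subpages, io_entry);
	return 0;
}

With these sizes every guest update touches 256 hardware TCEs, which is
why one "TCE Kill" per emulated TCE is cheap, as the commit message notes.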

> ---
>  arch/powerpc/kvm/book3s_64_vio.c| 45 +++--
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 44 ++--
>  2 files changed, 45 insertions(+), 44 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
> b/arch/powerpc/kvm/book3s_64_vio.c
> index d42b4b6d4a79..85cfa6328222 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -420,13 +420,19 @@ static void kvmppc_tce_put(struct 
> kvmppc_spapr_tce_table *stt,
>   tbl[idx % TCES_PER_PAGE] = tce;
>  }
>  
> -static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
> - unsigned long entry)
> +static void kvmppc_clear_tce(struct mm_struct *mm, struct 
> kvmppc_spapr_tce_table *stt,
> + struct iommu_table *tbl, unsigned long entry)
>  {
> - unsigned long hpa = 0;
> - enum dma_data_direction dir = DMA_NONE;
> + unsigned long i;
> + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
> + unsigned long io_entry = entry << (stt->page_shift - 
> tbl->it_page_shift);
>  
> - iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir);
> + for (i = 0; i < subpages; ++i) {
> + unsigned long hpa = 0;
> + enum dma_data_direction dir = DMA_NONE;
> +
> + iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir);
> + }
>  }
>  
>  static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
> @@ -485,6 +491,8 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
>   break;
>   }
>  
> + iommu_tce_kill(tbl, io_entry, subpages);
> +
>   return ret;
>  }
>  
> @@ -544,6 +552,8 @@ static long kvmppc_tce_iommu_map(struct kvm *kvm,
>   break;
>   }
>  
> + iommu_tce_kill(tbl, io_entry, subpages);
> +
>   return ret;
>  }
>  
> @@ -590,10 +600,9 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned 
> long liobn,
>   ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
>   entry, ua, dir);
>  
> - iommu_tce_kill(stit->tbl, entry, 1);
>  
>   if (ret != H_SUCCESS) {
> - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
> + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry);
>   goto unlock_exit;
>   }
>   }
> @@ -669,13 +678,13 @@ long k

Re: [PATCH 0/5] KVM: PPC: Book3S: Modules cleanup and unification

2021-09-02 Thread David Gibson
On Thu, Sep 02, 2021 at 11:32:41AM -0300, Fabiano Rosas wrote:
> David Gibson  writes:
> 
> > On Wed, Sep 01, 2021 at 02:33:52PM -0300, Fabiano Rosas wrote:
> >> This series merges our three kvm modules kvm.ko, kvm-hv.ko and
> >> kvm-pr.ko into one kvm.ko module.
> >
> > That doesn't sound like a good idea to me.  People who aren't on BookS
> > servers don't want - and can't use - kvm-hv.  Almost nobody wants
> > kvm-pr.  It's also kind of inconsistent with x86, which has the
> > separate AMD and Intel modules.
> 
> But this is not altering the ability of having only kvm-hv or only
> kvm-pr. I'm taking the Kconfig options that used to produce separate
> modules and using them to select which code gets built into the one
> kvm.ko module.

> 
> Currently:
> 
> CONFIG_KVM_BOOK3S_64=m <-- produces kvm.ko
> CONFIG_KVM_BOOK3S_64_HV=m  <-- produces kvm-hv.ko
> CONFIG_KVM_BOOK3S_64_PR=m  <-- produces kvm-pr.ko
> 
> I'm making it so we now have one kvm.ko everywhere, but there is still:
> 
> CONFIG_KVM_BOOK3S_64=m   <-- produces kvm.ko
> CONFIG_KVM_BOOK3S_HV_POSSIBLE=y  <-- includes HV in kvm.ko
> CONFIG_KVM_BOOK3S_PR_POSSIBLE=y  <-- includes PR in kvm.ko
> 
> In other words, if you are going to have at least two modules loaded at
> all times (kvm + kvm-hv or kvm + kvm-pr), why not put all that into one
> module? No one needs to build code they are not going to use, this is
> not changing.

Ah.. I see, you're removing the runtime switch from one to the other
at the same time as having just a single one loaded, but leaving the
ability to switch at compile time.  And compile time is arguably good
enough for the cases I've described.

Ok, I see your point.

I still think it's conceptually not ideal, but the practical benefit
is more important.  Objection withdrawn.
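For what it's worth, a purely hypothetical userspace sketch (mine, not
taken from Fabiano's series) of the shape being described: one object with
both implementations compiled in behind the *_POSSIBLE options, picking a
default at init time.

#include <stdio.h>

#define CONFIG_KVM_BOOK3S_HV_POSSIBLE 1	/* assumed build config for the example */
#define CONFIG_KVM_BOOK3S_PR_POSSIBLE 1

struct kvm_ops { const char *name; };	/* stand-in, not the kernel's struct */

#if CONFIG_KVM_BOOK3S_PR_POSSIBLE
static struct kvm_ops kvm_ops_pr = { .name = "pr" };
#endif
#if CONFIG_KVM_BOOK3S_HV_POSSIBLE
static struct kvm_ops kvm_ops_hv = { .name = "hv" };
#endif

static struct kvm_ops *default_ops;

static int kvm_init(void)
{
#if CONFIG_KVM_BOOK3S_PR_POSSIBLE
	default_ops = &kvm_ops_pr;
#endif
#if CONFIG_KVM_BOOK3S_HV_POSSIBLE
	default_ops = &kvm_ops_hv;	/* prefer HV when both are built in */
#endif
	return default_ops ? 0 : -1;	/* nothing selected at build time */
}

int main(void)
{
	if (kvm_init() == 0)
		printf("kvm.ko would default to %s\n", default_ops->name);
	return 0;
}

The point is only that the choice moves from "which module is loaded" to
"which options were enabled at build time".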


> About consistency with x86, this situation is not analogous because we
> need to be able to load both modules at the same time, which means
> kvm.ko needs to stick around when one module goes away in case we want
> to load the other module. The KVM common code states that it expects to
> have at most one implementation:
> 
> /*
>  * kvm_arch_init makes sure there's at most one caller
>  * for architectures that support multiple implementations,
>  * like intel and amd on x86.
>  (...)
> 
> which is not true in our case due to this requirement of having two
> separate modules loading independently.
> 
> (tangent) We are already quite different from other architectures since
> we're not making use of kvm_arch_init and some other KVM hooks, such as
> kvm_arch_check_processor_compat. So while other archs have their init
> dispatched by kvm common code, our init and cleanup happens
> independently in the ppc-specific modules, which obviously works but is
> needlessly different and has subtleties in the ordering of operations
> wrt. the kvm common code. (tangent)
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 0/5] KVM: PPC: Book3S: Modules cleanup and unification

2021-09-01 Thread David Gibson
On Wed, Sep 01, 2021 at 02:33:52PM -0300, Fabiano Rosas wrote:
> This series merges our three kvm modules kvm.ko, kvm-hv.ko and
> kvm-pr.ko into one kvm.ko module.

That doesn't sound like a good idea to me.  People who aren't on BookS
servers don't want - and can't use - kvm-hv.  Almost nobody wants
kvm-pr.  It's also kind of inconsistent with x86, which has the
separate AMD and Intel modules.

> The main reason for this is to deal with the issue that kvm.ko can be
> loaded on its own without any of the other modules present. This can
> happen if one or both of the modules fail to init or if the user loads
> kvm.ko only.
> 
> With only kvm.ko loaded, the userspace can call any of the KVM ioctls
> which will fail more or less gracefully depending on what kind of
> verification we do in powerpc.c.

I see that that's awkward, but I'm not sure it justifies compromising
the actual natural structure of the dependencies.

> Instead of adding a check to every entry point or finding a hack to
> link the modules so that when one fails (hv/pr), the other (kvm)
> exits, I think it is cleaner to just make them all a single module.
> 
> The two KVM implementations are already selected by Kconfig options,
> so the only thing that changes is that they are now in the same
> module. I also kept kvm-hv and kvm-pr as aliases to kvm, so that
> people don't get too surprised with the change.
> 
> There is a possible issue with the larger module size for kernel
> builds that should support both HV-only and PR-only environments, but
> PR is usually not used in production so I'm not sure if that is a real
> issue.
> 
> Patches 1,2,3 are standalone cleanups.
> Patches 4,5 are the unification work.
> 
> Fabiano Rosas (5):
>   KVM: PPC: Book3S HV: Check return value of kvmppc_radix_init
>   KVM: PPC: Book3S HV: Delay setting of kvm ops
>   KVM: PPC: Book3S HV: Free allocated memory if module init fails
>   KVM: PPC: Book3S: Unify kvm-hv and kvm-pr modules
>   KVM: PPC: Book3S: Stop exporting non-builtin symbols
> 
>  arch/powerpc/configs/powernv_defconfig |  2 +-
>  arch/powerpc/configs/ppc64_defconfig   |  2 +-
>  arch/powerpc/configs/pseries_defconfig |  2 +-
>  arch/powerpc/kvm/Kconfig   | 72 --
>  arch/powerpc/kvm/Makefile  | 11 ++--
>  arch/powerpc/kvm/book3s.c  | 61 ++
>  arch/powerpc/kvm/book3s.h  | 19 +++
>  arch/powerpc/kvm/book3s_64_mmu_radix.c |  3 --
>  arch/powerpc/kvm/book3s_64_vio.c   |  3 --
>  arch/powerpc/kvm/book3s_hv.c   | 38 --
>  arch/powerpc/kvm/book3s_pr.c   | 13 -
>  arch/powerpc/kvm/book3s_rtas.c |  1 -
>  arch/powerpc/kvm/book3s_xics.c |  4 --
>  arch/powerpc/kvm/book3s_xive.c |  6 ---
>  arch/powerpc/kvm/emulate.c |  1 -
>  arch/powerpc/kvm/powerpc.c | 14 -
>  kernel/irq/irqdesc.c   |  2 +-
>  17 files changed, 125 insertions(+), 129 deletions(-)
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v8 5/5] powerpc/pseries: Add support for FORM2 associativity

2021-08-16 Thread David Gibson
On Thu, Aug 12, 2021 at 06:52:23PM +0530, Aneesh Kumar K.V wrote:
> PAPR interface currently supports two different ways of communicating resource
> grouping details to the OS. These are referred to as Form 0 and Form 1
> associativity grouping. Form 0 is the older format and is now considered
> deprecated. This patch adds another resource grouping named FORM2.
> 
> Signed-off-by: Daniel Henrique Barboza 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

Though there are a couple of cosmetic issues and one bad memory access
issue (though only in the case of buggy firmware).

[snip]
> +Form 2
> +---
> +Form 2 associativity format adds separate device tree properties 
> representing NUMA node distance
> +thereby making the node distance computation flexible. Form 2 also allows 
> flexible primary
> +domain numbering. With numa distance computation now detached from the index 
> value in
> +"ibm,associativity-reference-points" property, Form 2 allows a large number 
> of primary domain
> +ids at the same domainID index representing resource groups of different 
> performance/latency
> +characteristics.
> +
> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 
> in the
> +"ibm,architecture-vec-5" property.
> +
> +"ibm,numa-lookup-index-table" property contains a list of one or more 
> numbers representing
> +the domainIDs present in the system. The offset of the domainID in this 
> property is
> +used as an index while computing numa distance information via 
> "ibm,numa-distance-table".
> +
> +prop-encoded-array: The number N of the domainIDs encoded as with 
> encode-int, followed by
> +N domainID encoded as with encode-int
> +
> +For ex:
> +"ibm,numa-lookup-index-table" =  {4, 0, 8, 250, 252}. The offset of domainID 
> 8 (2) is used when

Since you're using dts syntax below, it probably makes sense to use it
here as well.

> +computing the distance of domain 8 from other domains present in the system. 
> For the rest of
> +this document, this offset will be referred to as domain distance offset.
> +
> +"ibm,numa-distance-table" property contains a list of one or more numbers 
> representing the NUMA
> +distance between resource groups/domains present in the system.
> +
> +prop-encoded-array: The number N of the distance values encoded as with 
> encode-int, followed by
> +N distance values encoded as with encode-bytes. The max distance value we 
> could encode is 255.
> +The number N must be equal to the square of m where m is the number of 
> domainIDs in the
> +numa-lookup-index-table.
> +
> +For ex:
> +ibm,numa-lookup-index-table = <3 0 8 40>;
> +ibm,numa-distance-table = <9>, /bits/ 8 < 10  20  80
> +  20  10 160
> +  80 160  10>;
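As an aside, a tiny standalone sketch (mine, not from the patch) of how
the flattened distance table in the example above is indexed: each
domainID's offset in ibm,numa-lookup-index-table selects a row/column of
the N x N matrix.

#include <stdio.h>

int main(void)
{
	int lookup_index[] = { 0, 8, 40 };	/* N = 3 domainIDs */
	unsigned char dist[] = { 10, 20, 80,
				 20, 10, 160,
				 80, 160, 10 };	/* N * N distance values */
	int n = 3;
	int a = 1, b = 2;	/* offsets of domainID 8 and domainID 40 */

	/* Row-major lookup: distance(8, 40) comes from cell [a * n + b]. */
	printf("distance(%d, %d) = %d\n",
	       lookup_index[a], lookup_index[b], dist[a * n + b]);
	return 0;
}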

[snip]
> +
> + /* FORM2 affinity  */
> + nid = of_node_to_nid_single(node);
> + if (nid == NUMA_NO_NODE)
> + return;
> +
> + /*
> +  * With FORM2 we expect NUMA distance of all possible NUMA
> +  * nodes to be provided during boot.
> +  */
> + WARN(numa_distance_table[nid][nid] == -1,
> +  "NUMA distance details for node %d not provided\n", nid);
> +}
> +
> +/*
> + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, . domainidN}
> + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6,  N elements}

.. and here too.

> + */
> +static void initialize_form2_numa_distance_lookup_table(void)
> +{
> + int i, j;
> + struct device_node *root;
> + const __u8 *numa_dist_table;
> + const __be32 *numa_lookup_index;
> + int numa_dist_table_length;
> + int max_numa_index, distance_index;
> +
> + if (firmware_has_feature(FW_FEATURE_OPAL))
> + root = of_find_node_by_path("/ibm,opal");
> + else
> + root = of_find_node_by_path("/rtas");
> + if (!root)
> + root = of_find_node_by_path("/");
> +
> + numa_lookup_index = of_get_property(root, 
> "ibm,numa-lookup-index-table", NULL);
> + max_numa_index = of_read_number(&numa_lookup_index[0], 1);
> +
> + /* first element of the array is the size and is encode-int */
> + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", 
> NULL);
> + numa_dist_table_length = of_read_number((const __be32 
> *)&numa_dist_table[0], 1);
> + /* Skip the size which is encoded int */
> + numa_dist_table += sizeof(__be32);
> +
> + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n",
> +  num

Re: [PATCH v8 3/5] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-08-16 Thread David Gibson
On Thu, Aug 12, 2021 at 06:52:21PM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resources are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call.
> 
> Instead of updating NUMA distance every time we lookup a node id
> from the associativity property, add helpers that can be used
> during boot which does this only once. Also remove the distance
> update from node id lookup helpers.
> 
> Currently, we duplicate parsing code for ibm,associativity and
> ibm,associativity-lookup-arrays in the kernel. The associativity array 
> provided
> by these device tree properties are very similar and hence can use
> a helper to parse the node id and numa distance details.
> 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

There are a handful of nits it would be nice to clean up as followups, though:

[snip]
> +static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
> +{
> + struct assoc_arrays aa = { .arrays = NULL };
> + int default_nid = NUMA_NO_NODE;

I don't think there's any point to the 'default_nid' variable.

> + int nid = default_nid;
> + int rc, index;
> +
> + if ((primary_domain_index < 0) || !numa_enabled)
> + return default_nid;
> +
> + rc = of_get_assoc_arrays(&aa);
> + if (rc)
> + return default_nid;
> +
> + if (primary_domain_index <= aa.array_sz &&

You don't need this test any more - it's included in __associativity_to_nid().

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v7 5/6] powerpc/pseries: Add support for FORM2 associativity

2021-08-11 Thread David Gibson
On Wed, Aug 11, 2021 at 09:39:32AM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Mon, Aug 09, 2021 at 10:54:33AM +0530, Aneesh Kumar K.V wrote:
> >> PAPR interface currently supports two different ways of communicating 
> >> resource
> >> grouping details to the OS. These are referred to as Form 0 and Form 1
> >> associativity grouping. Form 0 is the older format and is now considered
> >> deprecated. This patch adds another resource grouping named FORM2.
> >> 
> >> Signed-off-by: Daniel Henrique Barboza 
> >> Signed-off-by: Aneesh Kumar K.V 
> >
> > LGTM, with the exception of some minor nits noted below.
> ...
> 
> > +
> >> +  for (i = 0; i < max_numa_index; i++)
> >> +  /* +1 skip the max_numa_index in the property */
> >> +  numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 
> >> 1], 1);
> >> +
> >> +
> >> +  if (numa_dist_table_length != max_numa_index * max_numa_index) {
> >> +
> >
> > Stray extra whitespace line here.
> >
> >> +  WARN(1, "Wrong NUMA distance information\n");
> >> +  /* consider everybody else just remote. */
> >> +  for (i = 0;  i < max_numa_index; i++) {
> >> +  for (j = 0; j < max_numa_index; j++) {
> >> +  int nodeA = numa_id_index_table[i];
> >> +  int nodeB = numa_id_index_table[j];
> >> +
> >> +  if (nodeA == nodeB)
> >> +  numa_distance_table[nodeA][nodeB] = 
> >> LOCAL_DISTANCE;
> >> +  else
> >> +  numa_distance_table[nodeA][nodeB] = 
> >> REMOTE_DISTANCE;
> >> +  }
> >> +  }
> >
> > I don't think it's necessarily a problem, but something to consider is
> > that this fallback will initialize distance for *all* node IDs,
> > whereas the normal path will only initialize it for nodes that are in
> > the index table.  Since some later error checks key off whether
> > certain fields in the distance table are initialized, is that the
> > outcome you want?
> >
> 
> With the device tree details not correct, one of the possible way to
> make progress is to consider everybody remote. With new node hotplug
> support we used to check whether the distance table entry is
> initialized. With the updated spec, we expect all possible numa node
> distance to be available during boot.

Sure.  But my main point here is that the fallback behaviour in this
clause is different from the fallback behaviour if the table is there
and parseable, but incomplete - which is also not expected.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v7 3/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-08-09 Thread David Gibson
void)
>   struct device_node *memory;
>   int default_nid = 0;
>   unsigned long i;
> + const __be32 *associativity;
>  
>   if (numa_enabled == 0) {
>   printk(KERN_WARNING "NUMA disabled by user\n");
> @@ -734,18 +805,30 @@ static int __init parse_numa_properties(void)
>* each node to be onlined must have NODE_DATA etc backing it.
>*/
>   for_each_present_cpu(i) {
> + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
>   struct device_node *cpu;
> - int nid = vphn_get_nid(i);
> + int nid = NUMA_NO_NODE;
>  
> - /*
> -  * Don't fall back to default_nid yet -- we will plug
> -  * cpus into nodes once the memory scan has discovered
> -  * the topology.
> -  */
> - if (nid == NUMA_NO_NODE) {
> + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
> +
> + if (__vphn_get_associativity(i, vphn_assoc) == 0) {
> + nid = associativity_to_nid(vphn_assoc);
> + __initialize_form1_numa_distance(vphn_assoc);
> + } else {
> +
> + /*
> +  * Don't fall back to default_nid yet -- we will plug
> +  * cpus into nodes once the memory scan has discovered
> +  * the topology.
> +  */
>   cpu = of_get_cpu_node(i, NULL);
>   BUG_ON(!cpu);
> - nid = of_node_to_nid_single(cpu);
> +
> + associativity = of_get_associativity(cpu);
> + if (associativity) {
> + nid = associativity_to_nid(associativity);
> + __initialize_form1_numa_distance(associativity);
> + }
>   of_node_put(cpu);
>   }
>  
> @@ -781,8 +864,11 @@ static int __init parse_numa_properties(void)
>* have associativity properties.  If none, then
>* everything goes to default_nid.
>*/
> - nid = of_node_to_nid_single(memory);
> - if (nid < 0)
> + associativity = of_get_associativity(memory);
> + if (associativity) {
> + nid = associativity_to_nid(associativity);
> + __initialize_form1_numa_distance(associativity);
> + } else
>   nid = default_nid;
>  
>   fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>   return saved_rc;
>   }
>  
> + update_numa_distance(dn);
> +
>   rc = dlpar_online_cpu(dn);
>   if (rc) {
>   saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 377d852f5a9a..ee1d81d7e54a 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct 
> drmem_lmb *lmb)
>   return -ENODEV;
>   }
>  
> + update_numa_distance(lmb_node);
> +
>   dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>   if (!dr_node) {
>   dlpar_free_cc_nodes(lmb_node);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v7 5/6] powerpc/pseries: Add support for FORM2 associativity

2021-08-09 Thread David Gibson
ty  */
> + nid = of_node_to_nid_single(node);
> + if (nid == NUMA_NO_NODE)
> + return;
> +
> + /*
> +  * With FORM2 we expect NUMA distance of all possible NUMA
> +  * nodes to be provided during boot.
> +  */
> + WARN(numa_distance_table[nid][nid] == -1,
> +  "NUMA distance details for node %d not provided\n", nid);
> +}
> +
> +/*
> + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, . domainidN}
> + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6,  N elements}
> + */
> +static void initialize_form2_numa_distance_lookup_table(void)
> +{
> + int i, j;
> + struct device_node *root;
> + const __u8 *numa_dist_table;
> + const __be32 *numa_lookup_index;
> + int numa_dist_table_length;
> + int max_numa_index, distance_index;
> +
> + if (firmware_has_feature(FW_FEATURE_OPAL))
> + root = of_find_node_by_path("/ibm,opal");
> + else
> + root = of_find_node_by_path("/rtas");
> + if (!root)
> + root = of_find_node_by_path("/");
> +
> + numa_lookup_index = of_get_property(root, 
> "ibm,numa-lookup-index-table", NULL);
> + max_numa_index = of_read_number(&numa_lookup_index[0], 1);
> +
> + /* first element of the array is the size and is encode-int */
> + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", 
> NULL);
> + numa_dist_table_length = of_read_number((const __be32 
> *)&numa_dist_table[0], 1);
> + /* Skip the size which is encoded int */
> + numa_dist_table += sizeof(__be32);
> +
> + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n",
> +  numa_dist_table_length, max_numa_index);
> +
> + for (i = 0; i < max_numa_index; i++)
> + /* +1 skip the max_numa_index in the property */
> + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 
> 1], 1);
> +
> +
> + if (numa_dist_table_length != max_numa_index * max_numa_index) {
> +

Stray extra whitespace line here.

> + WARN(1, "Wrong NUMA distance information\n");
> + /* consider everybody else just remote. */
> + for (i = 0;  i < max_numa_index; i++) {
> + for (j = 0; j < max_numa_index; j++) {
> + int nodeA = numa_id_index_table[i];
> + int nodeB = numa_id_index_table[j];
> +
> + if (nodeA == nodeB)
> + numa_distance_table[nodeA][nodeB] = 
> LOCAL_DISTANCE;
> + else
> + numa_distance_table[nodeA][nodeB] = 
> REMOTE_DISTANCE;
> + }
> + }

I don't think it's necessarily a problem, but something to consider is
that this fallback will initialize distance for *all* node IDs,
whereas the normal path will only initialize it for nodes that are in
the index table.  Since some later error checks key off whether
certain fields in the distance table are initialized, is that the
outcome you want?

> + }
> +
> + distance_index = 0;
> + for (i = 0;  i < max_numa_index; i++) {
> + for (j = 0; j < max_numa_index; j++) {
> + int nodeA = numa_id_index_table[i];
> + int nodeB = numa_id_index_table[j];
> +
> + numa_distance_table[nodeA][nodeB] = 
> numa_dist_table[distance_index++];
> + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, 
> numa_distance_table[nodeA][nodeB]);
> + }
> + }
> + of_node_put(root);
>  }
>  
>  static int __init find_primary_domain_index(void)
> @@ -322,6 +428,9 @@ static int __init find_primary_domain_index(void)
>*/
>   if (firmware_has_feature(FW_FEATURE_OPAL)) {
>   affinity_form = FORM1_AFFINITY;
> + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
> + dbg("Using form 2 affinity\n");
> + affinity_form = FORM2_AFFINITY;
>   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
>   dbg("Using form 1 affinity\n");
>   affinity_form = FORM1_AFFINITY;
> @@ -366,9 +475,12 @@ static int __init find_primary_domain_index(void)
>  
>   index = of_read_number(&distance_ref_points[1], 1);
>   } else {
> + /*
> +  * Both FORM1 and FORM2 affinity find the primary domain details
> +  * at the same offset.
> +  */
>   index = of_read_number(distance_ref_points, 1);
>   }
> -
>   /*
>* Warn and cap if the hardware supports more than
>* MAX_DISTANCE_REF_POINTS domains.
> @@ -807,6 +919,12 @@ static int __init parse_numa_properties(void)
>  
>   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
> primary_domain_index);
>  
> + /*
> +  * If it is FORM2 initialize the distance table here.
> +  */
> + if (affinity_form == FORM2_AFFINITY)
> + initialize_form2_numa_distance_lookup_table();
> +
>   /*
>* Even though we connect cpus to numa domains later in SMP
>* init, we need to know the node ids now. This is because
> diff --git a/arch/powerpc/platforms/pseries/firmware.c 
> b/arch/powerpc/platforms/pseries/firmware.c
> index 5d4c2bc20bba..f162156b7b68 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -123,6 +123,7 @@ vec5_fw_features_table[] = {
>   {FW_FEATURE_PRRN,   OV5_PRRN},
>   {FW_FEATURE_DRMEM_V2,   OV5_DRMEM_V2},
>   {FW_FEATURE_DRC_INFO,   OV5_DRC_INFO},
> + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY},
>  };
>  
>  static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v7 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper

2021-08-09 Thread David Gibson
On Mon, Aug 09, 2021 at 10:54:34AM +0530, Aneesh Kumar K.V wrote:
> Currently, we duplicate parsing code for ibm,associativity and
> ibm,associativity-lookup-arrays in the kernel. The associativity array 
> provided
> by these device tree properties are very similar and hence can use
> a helper to parse the node id and numa distance details.
> 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

Though I'd prefer to see these fixes folded in with the earlier patch.

> ---
>  arch/powerpc/mm/numa.c | 104 +++--
>  1 file changed, 58 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index fffb3c40f595..e6d47fcba335 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -171,26 +171,36 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -/*
> - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> - * info is found.
> - */
> -static int associativity_to_nid(const __be32 *associativity)
> +static int __associativity_to_nid(const __be32 *associativity,
> +   int max_array_sz)
>  {
> - int nid = NUMA_NO_NODE;
> + int nid;
> + /*
> +  * primary_domain_index is 1 based array index.
> +  */
> + int index = primary_domain_index  - 1;
>  
> - if (!numa_enabled)
> - goto out;
> + if (!numa_enabled || index >= max_array_sz)
> + return NUMA_NO_NODE;
>  
> - if (of_read_number(associativity, 1) >= primary_domain_index)
> - nid = of_read_number(&associativity[primary_domain_index], 1);
> + nid = of_read_number(&associativity[index], 1);
>  
>   /* POWER4 LPAR uses 0xffff as invalid node */
>   if (nid == 0xffff || nid >= nr_node_ids)
>   nid = NUMA_NO_NODE;
> -out:
>   return nid;
>  }
> +/*
> + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> + * info is found.
> + */
> +static int associativity_to_nid(const __be32 *associativity)
> +{
> + int array_sz = of_read_number(associativity, 1);
> +
> + /* Skip the first element in the associativity array */
> + return __associativity_to_nid((associativity + 1), array_sz);
> +}
>  
>  static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 
> *cpu2_assoc)
>  {
> @@ -295,33 +305,39 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static void __initialize_form1_numa_distance(const __be32 *associativity)
> +static void __initialize_form1_numa_distance(const __be32 *associativity,
> +  int max_array_sz)
>  {
>   int i, nid;
>  
>   if (affinity_form != FORM1_AFFINITY)
>   return;
>  
> - nid = associativity_to_nid(associativity);
> + nid = __associativity_to_nid(associativity, max_array_sz);
>   if (nid != NUMA_NO_NODE) {
>   for (i = 0; i < distance_ref_points_depth; i++) {
>   const __be32 *entry;
> + int index = be32_to_cpu(distance_ref_points[i]) - 1;
> +
> + /*
> +  * broken hierarchy, return with broken distance table
> +  */
> + if (WARN(index >= max_array_sz, "Broken 
> ibm,associativity property"))
> + return;
>  
> - entry = 
> &associativity[be32_to_cpu(distance_ref_points[i])];
> + entry = &associativity[index];
>   distance_lookup_table[nid][i] = of_read_number(entry, 
> 1);
>   }
>   }
>  }
>  
> -static void initialize_form1_numa_distance(struct device_node *node)
> +static void initialize_form1_numa_distance(const __be32 *associativity)
>  {
> - const __be32 *associativity;
> -
> - associativity = of_get_associativity(node);
> - if (!associativity)
> - return;
> + int array_sz;
>  
> - __initialize_form1_numa_distance(associativity);
> + array_sz = of_read_number(associativity, 1);
> + /* Skip the first element in the associativity array */
> + __initialize_form1_numa_distance(associativity + 1, array_sz);
>  }
>  
>  /*
> @@ -334,7 +350,13 @@ void update_numa_distance(struct device_node *node)
>   if (affinity_form == FORM0_AFFINITY)
>   return;
>   else if (affinity_form == FORM1_AFFINITY) {
> - initialize_form1_numa_distance(node);
> + const __be32 *associativity;
> +
> + associativity = of_get_as

Re: [PATCH v6 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper

2021-08-08 Thread David Gibson
On Fri, Aug 06, 2021 at 09:53:59PM +0530, Aneesh Kumar K.V wrote:
> On 8/6/21 12:17 PM, David Gibson wrote:
> > On Tue, Jul 27, 2021 at 03:33:11PM +0530, Aneesh Kumar K.V wrote:
> > > Currently, we duplicate parsing code for ibm,associativity and
> > > ibm,associativity-lookup-arrays in the kernel. The associativity array 
> > > provided
> > > by these device tree properties are very similar and hence can use
> > > a helper to parse the node id and numa distance details.
> > 
> > Oh... sorry.. comments on the earlier patch were from before I read
> > and saw you adjusted things here.
> > 
> > > 
> > > Signed-off-by: Aneesh Kumar K.V 
> > > ---
> > >   arch/powerpc/mm/numa.c | 83 ++
> > >   1 file changed, 51 insertions(+), 32 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> > > index fffb3c40f595..7506251e17f2 100644
> > > --- a/arch/powerpc/mm/numa.c
> > > +++ b/arch/powerpc/mm/numa.c
> > > @@ -171,19 +171,19 @@ static void unmap_cpu_from_node(unsigned long cpu)
> > >   }
> > >   #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
> > > -/*
> > > - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> > > - * info is found.
> > > - */
> > > -static int associativity_to_nid(const __be32 *associativity)
> > > +static int __associativity_to_nid(const __be32 *associativity,
> > > +   int max_array_sz)
> > >   {
> > >   int nid = NUMA_NO_NODE;
> > > + /*
> > > +  * primary_domain_index is 1 based array index.
> > > +  */
> > > + int index = primary_domain_index  - 1;
> > > - if (!numa_enabled)
> > > + if (!numa_enabled || index >= max_array_sz)
> > >   goto out;
> > 
> > You don't need a goto, you can just return NUMA_NO_NODE.
> 
> updated
> 
> > 
> > > - if (of_read_number(associativity, 1) >= primary_domain_index)
> > > - nid = of_read_number(&associativity[primary_domain_index], 1);
> > > + nid = of_read_number(&associativity[index], 1);
> > >   /* POWER4 LPAR uses 0xffff as invalid node */
> > >   if (nid == 0xffff || nid >= nr_node_ids)
> > > @@ -191,6 +191,17 @@ static int associativity_to_nid(const __be32 
> > > *associativity)
> > >   out:
> > >   return nid;
> > >   }
> > > +/*
> > > + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> > > + * info is found.
> > > + */
> > > +static int associativity_to_nid(const __be32 *associativity)
> > > +{
> > > + int array_sz = of_read_number(associativity, 1);
> > > +
> > > + /* Skip the first element in the associativity array */
> > > + return __associativity_to_nid((associativity + 1), array_sz);
> > > +}
> > >   static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 
> > > *cpu2_assoc)
> > >   {
> > > @@ -295,24 +306,41 @@ int of_node_to_nid(struct device_node *device)
> > >   }
> > >   EXPORT_SYMBOL(of_node_to_nid);
> > > -static void __initialize_form1_numa_distance(const __be32 *associativity)
> > > +static void ___initialize_form1_numa_distance(const __be32 
> > > *associativity,
> > > +  int max_array_sz)
> > >   {
> > >   int i, nid;
> > >   if (affinity_form != FORM1_AFFINITY)
> > >   return;
> > > - nid = associativity_to_nid(associativity);
> > > + nid = __associativity_to_nid(associativity, max_array_sz);
> > >   if (nid != NUMA_NO_NODE) {
> > >   for (i = 0; i < distance_ref_points_depth; i++) {
> > >   const __be32 *entry;
> > > + int index = be32_to_cpu(distance_ref_points[i]) - 1;
> > > +
> > > + /*
> > > +  * broken hierarchy, return with broken distance table
> > 
> > WARN_ON, maybe?
> 
> 
> updated
> 
> > 
> > > +  */
> > > + if (index >= max_array_sz)
> > > + return;
> > > - entry = 
> > > &associativity[be32_to_cpu(distance_ref_points[i])];
> > > + entry = &associativity[index];
> > >   distance_lookup_table[nid][i] = 
> > > of_read_number(entry, 1)

Re: [PATCH v6 3/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-08-06 Thread David Gibson
 user\n");
> @@ -734,18 +805,30 @@ static int __init parse_numa_properties(void)
>* each node to be onlined must have NODE_DATA etc backing it.
>*/
>   for_each_present_cpu(i) {
> + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
>   struct device_node *cpu;
> - int nid = vphn_get_nid(i);
> + int nid = NUMA_NO_NODE;
>  
> - /*
> -  * Don't fall back to default_nid yet -- we will plug
> -  * cpus into nodes once the memory scan has discovered
> -  * the topology.
> -  */
> - if (nid == NUMA_NO_NODE) {
> + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
> +
> + if (__vphn_get_associativity(i, vphn_assoc) == 0) {
> + nid = associativity_to_nid(vphn_assoc);
> + __initialize_form1_numa_distance(vphn_assoc);
> + } else {
> +
> + /*
> +  * Don't fall back to default_nid yet -- we will plug
> +  * cpus into nodes once the memory scan has discovered
> +  * the topology.
> +  */
>   cpu = of_get_cpu_node(i, NULL);
>   BUG_ON(!cpu);
> - nid = of_node_to_nid_single(cpu);
> +
> + associativity = of_get_associativity(cpu);
> + if (associativity) {
> + nid = associativity_to_nid(associativity);
> + __initialize_form1_numa_distance(associativity);
> + }
>   of_node_put(cpu);
>   }
>  
> @@ -781,8 +864,11 @@ static int __init parse_numa_properties(void)
>* have associativity properties.  If none, then
>* everything goes to default_nid.
>*/
> - nid = of_node_to_nid_single(memory);
> - if (nid < 0)
> + associativity = of_get_associativity(memory);
> + if (associativity) {
> + nid = associativity_to_nid(associativity);
> + __initialize_form1_numa_distance(associativity);
> + } else
>   nid = default_nid;
>  
>   fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>   return saved_rc;
>   }
>  
> + update_numa_distance(dn);
> +
>   rc = dlpar_online_cpu(dn);
>   if (rc) {
>   saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 377d852f5a9a..ee1d81d7e54a 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct 
> drmem_lmb *lmb)
>   return -ENODEV;
>   }
>  
> + update_numa_distance(lmb_node);
> +
>   dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>   if (!dr_node) {
>   dlpar_free_cc_nodes(lmb_node);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v6 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper

2021-08-06 Thread David Gibson
 if (nid == 0xffff || nid >= nr_node_ids)
> - nid = default_nid;
> + index = lmb->aa_index * aa.array_sz;
> + associativity = &aa.arrays[index];
> + nid = __associativity_to_nid(associativity, aa.array_sz);
>   if (nid > 0 && affinity_form == FORM1_AFFINITY) {
> - int i;
> - const __be32 *associativity;
> -
> - index = lmb->aa_index * aa.array_sz;
> - associativity = &aa.arrays[index];
>   /*
> -  * lookup array associativity entries have different 
> format
> -  * There is no length of the array as the first element.
> +  * lookup array associativity entries have
> +  * no length of the array as the first element.
>*/
> - for (i = 0; i < distance_ref_points_depth; i++) {
> - const __be32 *entry;
> -
> - entry = 
> &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> - distance_lookup_table[nid][i] = 
> of_read_number(entry, 1);
> - }
> + ___initialize_form1_numa_distance(associativity,
> +   aa.array_sz);

Better, thanks.

>   }
>   }
>   return nid;
> @@ -632,11 +651,11 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  
>   if (primary_domain_index <= aa.array_sz &&
>   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> aa.n_arrays) {
> - index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> - nid = of_read_number(&aa.arrays[index], 1);
> + const __be32 *associativity;
>  
> - if (nid == 0xffff || nid >= nr_node_ids)
> - nid = default_nid;
> + index = lmb->aa_index * aa.array_sz;
> + associativity = &aa.arrays[index];
> + nid = __associativity_to_nid(associativity, aa.array_sz);
>   }
>   return nid;
>  }

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-07-26 Thread David Gibson
On Tue, Jul 27, 2021 at 09:02:33AM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Thu, Jul 22, 2021 at 12:37:46PM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson  writes:
> >> 
> >> > On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
> 
> 
> 
> > 
> >> >
> >> >> +   nid = of_read_number(&aa.arrays[index], 1);
> >> >> +
> >> >> +   if (nid == 0xffff || nid >= nr_node_ids)
> >> >> +   nid = default_nid;
> >> >> +   if (nid > 0 && affinity_form == FORM1_AFFINITY) {
> >> >> +   int i;
> >> >> +   const __be32 *associativity;
> >> >> +
> >> >> +   index = lmb->aa_index * aa.array_sz;
> >> >> +   associativity = &aa.arrays[index];
> >> >> +   /*
> >> >> +* lookup array associativity entries have 
> >> >> different format
> >> >> +* There is no length of the array as the first 
> >> >> element.
> >> >
> >> > The difference is very small, and this is not a hot path.  Couldn't
> >> > you reduce a chunk of code by prepending aa.array_sz, then re-using
> >> > __initialize_form1_numa_distance.  Or even making
> >> > __initialize_form1_numa_distance() take the length as a parameter.
> >> 
> >> The changes are small but confusing w.r.t how we look at the
> >> associativity-lookup-arrays. The way we interpret associativity array
> >> and associativity lookup array using primary_domain_index is different.
> >> Hence the '-1' in the node lookup here.
> >
> > They're really not, though.  It's exactly the same interpretation of
> > the associativity array itself - it's just that one of them has the
> > array prepended with a (redundant) length.  So you can make
> > __initialize_form1_numa_distance() work on the "bare" associativity
> > array, with a given length.  Here you call it with aa.array_sz as the
> > length, and in the other place you call it with prop[0] as the length.
> >
> >> 
> >>index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> >>    nid = of_read_number(&aa.arrays[index], 1);
> >> 
> >> 
> >> >
> >> >> +*/
> >> >> +   for (i = 0; i < max_associativity_domain_index; 
> >> >> i++) {
> >> >> +   const __be32 *entry;
> >> >> +
> >> >> +   entry = 
> >> >> &aa.arrays[be32_to_cpu(distance_ref_points[i]) - 1];
> >> >
> >> > Does anywhere verify that distance_ref_points[i] <= aa.array_size for
> >> > every i?
> >> 
> >> We do check for 
> >> 
> >>if (primary_domain_index <= aa.array_sz &&
> >
> > Right, but that doesn't check the other distance_ref_points entries.
> > Not that there's any reason to have extra entries with Form2, but we
> > still don't want stray array accesses.
> 
> This is how the change looks. I am not convinced this makes it simpler.

It's not, but that's because the lookup_array_assoc flag is not needed...

> I will add that as the last patch and we can drop that if we find that
> not helpful? 
> 
> modified   arch/powerpc/mm/numa.c
> @@ -171,20 +171,31 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -/*
> - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> - * info is found.
> - */
> -static int associativity_to_nid(const __be32 *associativity)
> +static int __associativity_to_nid(const __be32 *associativity,
> +   bool lookup_array_assoc,
> +   int max_array_index)
>  {
>   int nid = NUMA_NO_NODE;
> + int index;
>  
>   if (!numa_enabled)
>   goto out;
> + /*
> +  * ibm,associativity-lookup-array doesn't have element
> +  * count at the start of the associativity. Hence
> +  * decrement the primary_domain_index when used with
> +  * lookup-array associativity.
> +  */
> + if (lookup_array_assoc)
> + index = primary_domain_index - 1;
> + else {
> + index = primary_domain_index;
> + max_arr

Re: [PATCH v5 6/6] powerpc/pseries: Add support for FORM2 associativity

2021-07-25 Thread David Gibson
On Thu, Jul 22, 2021 at 01:04:42PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Mon, Jun 28, 2021 at 08:41:17PM +0530, Aneesh Kumar K.V wrote:
> >> PAPR interface currently supports two different ways of communicating 
> >> resource
> >> grouping details to the OS. These are referred to as Form 0 and Form 1
> >> associativity grouping. Form 0 is the older format and is now considered
> >> deprecated. This patch adds another resource grouping named FORM2.
> >> 
> >> Signed-off-by: Daniel Henrique Barboza 
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  Documentation/powerpc/associativity.rst   | 103 ++
> >>  arch/powerpc/include/asm/firmware.h   |   3 +-
> >>  arch/powerpc/include/asm/prom.h   |   1 +
> >>  arch/powerpc/kernel/prom_init.c   |   3 +-
> >>  arch/powerpc/mm/numa.c| 157 ++
> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >>  6 files changed, 242 insertions(+), 26 deletions(-)
> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> 
> >> diff --git a/Documentation/powerpc/associativity.rst 
> >> b/Documentation/powerpc/associativity.rst
> >> new file mode 100644
> >> index ..31cc7da2c7a6
> >> --- /dev/null
> >> +++ b/Documentation/powerpc/associativity.rst
> >> @@ -0,0 +1,103 @@
> >> +
> >> +NUMA resource associativity
> >> +=
> >> +
> >> +Associativity represents the groupings of the various platform resources 
> >> into
> >> +domains of substantially similar mean performance relative to resources 
> >> outside
> >> +of that domain. Resources subsets of a given domain that exhibit better
> >> +performance relative to each other than relative to other resources 
> >> subsets
> >> +are represented as being members of a sub-grouping domain. This 
> >> performance
> >> +characteristic is presented in terms of NUMA node distance within the 
> >> Linux kernel.
> >> +From the platform view, these groups are also referred to as domains.
> >
> > Pretty hard to decipher, but that's typical for PAPR.
> >
> >> +PAPR interface currently supports different ways of communicating these 
> >> resource
> >> +grouping details to the OS. These are referred to as Form 0, Form 1 and 
> >> Form2
> >> +associativity grouping. Form 0 is the older format and is now considered 
> >> deprecated.
> >
> > Nit: s/older/oldest/ since there are now >2 forms.
> 
> updated.
> 
> >
> >> +Hypervisor indicates the type/form of associativity used via 
> >> "ibm,architecture-vec-5 property".
> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage 
> >> of Form 0 or Form 1.
> >> +A value of 1 indicates the usage of Form 1 associativity. For Form 2 
> >> associativity
> >> +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
> >> +
> >> +Form 0
> >> +-
> >> +Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
> >> +
> >> +Form 1
> >> +-
> >> +With Form 1 a combination of ibm,associativity-reference-points, and 
> >> ibm,associativity
> >> +device tree properties are used to determine the NUMA distance between 
> >> resource groups/domains.
> >> +
> >> +The “ibm,associativity” property contains a list of one or more numbers 
> >> (domainID)
> >> +representing the resource’s platform grouping domains.
> >> +
> >> +The “ibm,associativity-reference-points” property contains a list of one 
> >> or more numbers
> >> +(domainID index) that represents the 1 based ordinal in the associativity 
> >> lists.
> >> +The list of domainID indexes represents an increasing hierarchy of 
> >> resource grouping.
> >> +
> >> +ex:
> >> +{ primary domainID index, secondary domainID index, tertiary domainID 
> >> index.. }
> >> +
> >> +Linux kernel uses the domainID at the primary domainID index as the NUMA 
> >> node id.
> >> +Linux kernel computes NUMA distance between two domains by recursively 
> >> comparing
> >> +if they belong to the same higher-level domains. For mismatch at every 
> >> higher
> >> +level of the resource group, t

Re: [PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-07-25 Thread David Gibson
On Thu, Jul 22, 2021 at 12:37:46PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
> >> The associativity details of the newly added resourced are collected from
> >> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> >> distance details of the newly added numa node after the above call.
> >> 
> >> Instead of updating NUMA distance every time we lookup a node id
> >> from the associativity property, add helpers that can be used
> >> during boot which does this only once. Also remove the distance
> >> update from node id lookup helpers.
> >> 
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  arch/powerpc/mm/numa.c| 173 +-
> >>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
> >>  .../platforms/pseries/hotplug-memory.c|   2 +
> >>  arch/powerpc/platforms/pseries/pseries.h  |   1 +
> >>  4 files changed, 132 insertions(+), 46 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index 0ec16999beef..7b142f79d600 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
> >>  }
> >>  EXPORT_SYMBOL(__node_distance);
> >>  
> >> -static void initialize_distance_lookup_table(int nid,
> >> -  const __be32 *associativity)
> >> -{
> >> -  int i;
> >> -
> >> -  if (affinity_form != FORM1_AFFINITY)
> >> -  return;
> >> -
> >> -  for (i = 0; i < max_associativity_domain_index; i++) {
> >> -  const __be32 *entry;
> >> -
> >> -  entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> >> -  distance_lookup_table[nid][i] = of_read_number(entry, 1);
> >> -  }
> >> -}
> >> -
> >>  /*
> >>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> >>   * info is found.
> >> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 
> >> *associativity)
> >>    /* POWER4 LPAR uses 0xffff as invalid node */
> >>    if (nid == 0xffff || nid >= nr_node_ids)
> >>nid = NUMA_NO_NODE;
> >> -
> >> -  if (nid > 0 &&
> >> -  of_read_number(associativity, 1) >= 
> >> max_associativity_domain_index) {
> >> -  /*
> >> -   * Skip the length field and send start of associativity array
> >> -   */
> >> -  initialize_distance_lookup_table(nid, associativity + 1);
> >> -  }
> >> -
> >>  out:
> >>return nid;
> >>  }
> >> @@ -287,6 +262,49 @@ int of_node_to_nid(struct device_node *device)
> >>  }
> >>  EXPORT_SYMBOL(of_node_to_nid);
> >>  
> >> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> >> +{
> >> +  int i, nid;
> >> +
> >> +  if (affinity_form != FORM1_AFFINITY)
> >
> > Since this shouldn't be called on a !form1 system, this could be a 
> > WARN_ON().
> 
> The way we call functions currently, instead of doing
> 
> if (affinity_form == FORM1_AFFINITY)
> __initialize_form1_numa_distance()
> 
> We avoid doing the if check in multiple places. For example
> parse_numa_properties will fetch the associativity array to find the
> details of online node and set it online. We use the same code path to
> initialize distance.
> 
>   if (__vphn_get_associativity(i, vphn_assoc) == 0) {
>   nid = associativity_to_nid(vphn_assoc);
>   __initialize_form1_numa_distance(vphn_assoc);
>   } else {
> 
>   cpu = of_get_cpu_node(i, NULL);
>   BUG_ON(!cpu);
> 
>   associativity = of_get_associativity(cpu);
>   if (associativity) {
>   nid = associativity_to_nid(associativity);
>   __initialize_form1_numa_distance(associativity);
>   }
> 
> We avoid the the if (affinity_form == FORM1_AFFINITY) check there by
> moving the check inside __initialize_form1_numa_distance().

Oh.. ok.  The only caller I spotted was already doing a test against
affinity_form.

> >> +  return;
> >> +
> >> +  if (of_read_number(associativity, 1) &

Re: [PATCH v5 5/6] powerpc/pseries: Add a helper for form1 cpu distance

2021-07-25 Thread David Gibson
On Thu, Jul 22, 2021 at 12:39:27PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Mon, Jun 28, 2021 at 08:41:16PM +0530, Aneesh Kumar K.V wrote:
> >> This helper is only used with the dispatch trace log collection.
> >> A later patch will add Form2 affinity support and this change helps
> >> in keeping that simpler. Also add a comment explaining we don't expect
> >> the code to be called with FORM0
> >> 
> >> Reviewed-by: David Gibson 
> >> Signed-off-by: Aneesh Kumar K.V 
> >
> > What makes it a "relative_distance" rather than just a "distance"?
> 
> I added that to indicate that the function is not returning the actual
> distance but a number indicative of 'near', 'far' etc. (it actually returns
> 1, 2 etc).

Hm... ok.  To me at least it doesn't really convey that meaning, but
then I'm not sure what would.  To me, "relative distance" means the
distance relative to some other object, but then all the NUMA
distances are that - the distance of one node relative to another.
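For context, here is a small standalone sketch (mine, not lifted from the
series) of what the Form 1 helper computes: a count of how many
reference-point levels two associativity lists disagree on, starting from
the finest grouping, rather than an absolute SLIT-style distance.

#include <stdio.h>

static int form1_relative_distance(const int *a1, const int *a2,
				   const int *ref_points, int depth)
{
	int dist = 0;
	int i;

	for (i = 0; i < depth; i++) {
		int idx = ref_points[i];	/* 1-based index into the lists */

		if (a1[idx] == a2[idx])
			break;			/* share this level of grouping */
		dist++;
	}
	return dist;
}

int main(void)
{
	/* Hypothetical associativity lists: { length, domainIDs... } */
	int cpu1[] = { 4, 0, 10, 20, 1 };
	int cpu2[] = { 4, 0, 10, 21, 2 };
	int ref_points[] = { 4, 2 };	/* primary, secondary domainID indexes */

	/* Different node (level 4) but same level-2 grouping => distance 1 */
	printf("relative distance = %d\n",
	       form1_relative_distance(cpu1, cpu2, ref_points, 2));
	return 0;
}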

> >> ---
> >>  arch/powerpc/include/asm/topology.h   |  4 ++--
> >>  arch/powerpc/mm/numa.c| 10 +-
> >>  arch/powerpc/platforms/pseries/lpar.c |  4 ++--
> >>  3 files changed, 13 insertions(+), 5 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/include/asm/topology.h 
> >> b/arch/powerpc/include/asm/topology.h
> >> index e4db64c0e184..ac8b5ed79832 100644
> >> --- a/arch/powerpc/include/asm/topology.h
> >> +++ b/arch/powerpc/include/asm/topology.h
> >> @@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
> >> cpu_all_mask : \
> >> cpumask_of_node(pcibus_to_node(bus)))
> >>  
> >> -extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
> >> +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
> >>  extern int __node_distance(int, int);
> >>  #define node_distance(a, b) __node_distance(a, b)
> >>  
> >> @@ -83,7 +83,7 @@ static inline void sysfs_remove_device_from_node(struct 
> >> device *dev,
> >>  
> >>  static inline void update_numa_cpu_lookup_table(unsigned int cpu, int 
> >> node) {}
> >>  
> >> -static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> >> +static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 
> >> *cpu2_assoc)
> >>  {
> >>return 0;
> >>  }
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index 7b142f79d600..c6293037a103 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
> >>  }
> >>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
> >>  
> >> -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> >> +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 
> >> *cpu2_assoc)
> >>  {
> >>int dist = 0;
> >>  
> >> @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 
> >> *cpu2_assoc)
> >>return dist;
> >>  }
> >>  
> >> +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> >> +{
> >> +  /* We should not get called with FORM0 */
> >> +  VM_WARN_ON(affinity_form == FORM0_AFFINITY);
> >> +
> >> +  return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
> >> +}
> >> +
> >>  /* must hold reference to node during call */
> >>  static const __be32 *of_get_associativity(struct device_node *dev)
> >>  {
> >> diff --git a/arch/powerpc/platforms/pseries/lpar.c 
> >> b/arch/powerpc/platforms/pseries/lpar.c
> >> index dab356e3ff87..afefbdfe768d 100644
> >> --- a/arch/powerpc/platforms/pseries/lpar.c
> >> +++ b/arch/powerpc/platforms/pseries/lpar.c
> >> @@ -261,7 +261,7 @@ static int cpu_relative_dispatch_distance(int 
> >> last_disp_cpu, int cur_disp_cpu)
> >>if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
> >>return -EIO;
> >>  
> >> -  return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
> >> +  return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
> >>  }
> >>  
> >>  static int cpu_home_node_dispatch_distance(int disp_cpu)
> >> @@ -281,7 +281,7 @@ static int cpu_home_node_dispatch_distance(int 
> >> disp_cpu)
> >>if (!disp_cpu_assoc || !vcpu_assoc)
> >>return -EIO;
> >>  
> >> -  return cpu_distance(disp_cpu_assoc, vcpu_assoc);
> >> +  return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
> >>  }
> >>  
> >>  static void update_vcpu_disp_stat(int disp_cpu)
> >
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-07-25 Thread David Gibson
On Thu, Jul 22, 2021 at 10:47:49AM +0530, Aneesh Kumar K.V wrote:
> On 7/22/21 8:06 AM, David Gibson wrote:
> > On Thu, Jul 22, 2021 at 11:59:15AM +1000, David Gibson wrote:
> > > On Mon, Jun 28, 2021 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:
> > > > No functional change in this patch.
> > > 
> > > The new name does not match how you describe "primary domain index" in
> > > the documentation from patch 6/6.  There it comes from the values in
> > > associativity-reference-points, but here it simply comes from the
> > > lengths of all the associativity properties.
> > 
> > No, sorry, I misread this code... misled by the old name, so it's a
> > good thing you're changing it.
> > 
> > But.. I'm still not sure the new name is accurate, either...
> > 
> > [snip]
> > > > if (form1_affinity) {
> > > > -   depth = of_read_number(distance_ref_points, 1);
> > > > +   index = of_read_number(distance_ref_points, 1);
> > 
> > AFAICT distance_ref_points hasn't been altered from the
> > of_get_property() at this point, so isn't this setting depth / index
> > to the number of entries in ref-points, rather than the value of the
> > first entry (which is what primary domain index is supposed to be)?
> > 
> 
> ibm,associativity-reference-points property format is as below.
> 
> # lsprop  ibm,associativity-reference-points
> ibm,associativity-reference-points
>  00000004 00000002
> 
> it doesn't have the number of elements as the first item.
> 
> For FORM1 the 1st element is the NUMA boundary index/primary_domain_index.
> For FORM0 the 2nd element is the NUMA boundary index/primary_domain_index.

Sorry, my bad.  I foolishly expected consistency from PAPR.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-07-21 Thread David Gibson
On Thu, Jul 22, 2021 at 11:59:15AM +1000, David Gibson wrote:
> On Mon, Jun 28, 2021 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:
> > No functional change in this patch.
> 
> The new name does not match how you describe "primary domain index" in
> the documentation from patch 6/6.  There it comes from the values in
> associativity-reference-points, but here it simply comes from the
> lengths of all the associativity properties.

No, sorry, I misread this code... misled by the old name, so it's a
good thing you're changing it.

But.. I'm still not sure the new name is accurate, either...

[snip]
> > if (form1_affinity) {
> > -   depth = of_read_number(distance_ref_points, 1);
> > +   index = of_read_number(distance_ref_points, 1);

AFAICT distance_ref_points hasn't been altered from the
of_get_property() at this point, so isn't this setting depth / index
to the number of entries in ref-points, rather than the value of the
first entry (which is what primary domain index is supposed to be)?

> > } else {
> > if (distance_ref_points_depth < 2) {
> > printk(KERN_WARNING "NUMA: "
> > @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
> > goto err;
> > }
> >  
> > -   depth = of_read_number(&distance_ref_points[1], 1);
> > +   index = of_read_number(&distance_ref_points[1], 1);
> > }
> >  
> > /*
> > @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
> > }
> >  
> > of_node_put(root);
> > -   return depth;
> > +   return index;
> >  
> >  err:
> > of_node_put(root);
> > @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
> > int nid = default_nid;
> > int rc, index;
> >  
> > -   if ((min_common_depth < 0) || !numa_enabled)
> > +   if ((primary_domain_index < 0) || !numa_enabled)
> > return default_nid;
> >  
> > rc = of_get_assoc_arrays(&aa);
> > if (rc)
> > return default_nid;
> >  
> > -   if (min_common_depth <= aa.array_sz &&
> > +   if (primary_domain_index <= aa.array_sz &&
> > !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> > aa.n_arrays) {
> > -   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> > +   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> > nid = of_read_number(&aa.arrays[index], 1);
> >  
> > if (nid == 0xffff || nid >= nr_node_ids)
> > @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
> > return -1;
> > }
> >  
> > -   min_common_depth = find_min_common_depth();
> > +   primary_domain_index = find_primary_domain_index();
> >  
> > -   if (min_common_depth < 0) {
> > +   if (primary_domain_index < 0) {
> > /*
> > -* if we fail to parse min_common_depth from device tree
> > +* if we fail to parse primary_domain_index from device tree
> >  * mark the numa disabled, boot with numa disabled.
> >  */
> > numa_enabled = false;
> > -   return min_common_depth;
> > +   return primary_domain_index;
> > }
> >  
> > -   dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
> > +   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
> > primary_domain_index);
> >  
> > /*
> >  * Even though we connect cpus to numa domains later in SMP
> > @@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
> > goto out;
> > }
> >  
> > -   max_nodes = of_read_number(&domains[min_common_depth], 1);
> > +   max_nodes = of_read_number(&domains[primary_domain_index], 1);
> > for (i = 0; i < max_nodes; i++) {
> >     if (!node_possible(i))
> > node_set(i, node_possible_map);
> > }
> >  
> > prop_length /= sizeof(int);
> > -   if (prop_length > min_common_depth + 2)
> > +   if (prop_length > primary_domain_index + 2)
> > coregroup_enabled = 1;
> >  
> >  out:
> > @@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
> > goto out;
> >  
> > index = of_read_number(associativity, 1);
> > -   if (index > min_common_depth + 1)
> > +   if (index > primary_domain_index + 1)
> > return of_read_number(&associativity[index - 1], 1);
> >  
> >  out:
> 



-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 2/6] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-07-21 Thread David Gibson
On Mon, Jun 28, 2021 at 08:41:13PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch
> 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/mm/numa.c | 20 ++--
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 8365b298ec48..132813dd1a6c 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
> -static int distance_ref_points_depth;
> +static int max_associativity_domain_index;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
>  
> @@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  
>   int i, index;
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   index = be32_to_cpu(distance_ref_points[i]);
>   if (cpu1_assoc[index] == cpu2_assoc[index])
>   break;
> @@ -193,7 +193,7 @@ int __node_distance(int a, int b)
>   if (!form1_affinity)
>   return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
>   break;
>  
> @@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>   if (!form1_affinity)
>   return;
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   const __be32 *entry;
>  
>   entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> @@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   nid = NUMA_NO_NODE;
>  
>   if (nid > 0 &&
> - of_read_number(associativity, 1) >= distance_ref_points_depth) {
> + of_read_number(associativity, 1) >= 
> max_associativity_domain_index) {
>   /*
>* Skip the length field and send start of associativity array
>*/
> @@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
>*/
>   distance_ref_points = of_get_property(root,
>   "ibm,associativity-reference-points",
> - &distance_ref_points_depth);
> + &max_associativity_domain_index);
>  
>   if (!distance_ref_points) {
>   dbg("NUMA: ibm,associativity-reference-points not found.\n");
>   goto err;
>   }
>  
> - distance_ref_points_depth /= sizeof(int);
> + max_associativity_domain_index /= sizeof(int);
>  
>   if (firmware_has_feature(FW_FEATURE_OPAL) ||
>   firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> @@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
>   if (form1_affinity) {
>   index = of_read_number(distance_ref_points, 1);
>   } else {
> - if (distance_ref_points_depth < 2) {
> + if (max_associativity_domain_index < 2) {
>   printk(KERN_WARNING "NUMA: "
>   "short ibm,associativity-reference-points\n");
>   goto err;
> @@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
>* Warn and cap if the hardware supports more than
>* MAX_DISTANCE_REF_POINTS domains.
>*/
> - if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
> + if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS) {
>   printk(KERN_WARNING "NUMA: distance array capped at "
>   "%d entries\n", MAX_DISTANCE_REF_POINTS);
> - distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
> + max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;
>   }
>  
>   of_node_put(root);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-07-21 Thread David Gibson
On Mon, Jun 28, 2021 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch.

The new name does not match how you describe "primary domain index" in
the documentation from patch 6/6.  There it comes from the values in
associativity-reference-points, but here it simply comes from the
lengths of all the associativity properties.

> 
> Reviewed-by: David Gibson 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/numa.c | 38 +++---
>  1 file changed, 19 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index f2bf98bdcea2..8365b298ec48 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
>  EXPORT_SYMBOL(node_to_cpumask_map);
>  EXPORT_SYMBOL(node_data);
>  
> -static int min_common_depth;
> +static int primary_domain_index;
>  static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
> @@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   if (!numa_enabled)
>   goto out;
>  
> - if (of_read_number(associativity, 1) >= min_common_depth)
> - nid = of_read_number(&associativity[min_common_depth], 1);
> + if (of_read_number(associativity, 1) >= primary_domain_index)
> + nid = of_read_number(&associativity[primary_domain_index], 1);
>  
>   /* POWER4 LPAR uses 0xffff as invalid node */
>   if (nid == 0xffff || nid >= nr_node_ids)
> @@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static int __init find_min_common_depth(void)
> +static int __init find_primary_domain_index(void)
>  {
> - int depth;
> + int index;
>   struct device_node *root;
>  
>   if (firmware_has_feature(FW_FEATURE_OPAL))
> @@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
>   }
>  
>   if (form1_affinity) {
> - depth = of_read_number(distance_ref_points, 1);
> + index = of_read_number(distance_ref_points, 1);
>   } else {
>   if (distance_ref_points_depth < 2) {
>   printk(KERN_WARNING "NUMA: "
> @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
>   goto err;
>   }
>  
> - depth = of_read_number(&distance_ref_points[1], 1);
> + index = of_read_number(&distance_ref_points[1], 1);
>   }
>  
>   /*
> @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
>   }
>  
>   of_node_put(root);
> - return depth;
> + return index;
>  
>  err:
>   of_node_put(root);
> @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>   int nid = default_nid;
>   int rc, index;
>  
> - if ((min_common_depth < 0) || !numa_enabled)
> + if ((primary_domain_index < 0) || !numa_enabled)
>   return default_nid;
>  
>   rc = of_get_assoc_arrays(&aa);
>   if (rc)
>   return default_nid;
>  
> - if (min_common_depth <= aa.array_sz &&
> + if (primary_domain_index <= aa.array_sz &&
>   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> aa.n_arrays) {
> - index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> + index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>   nid = of_read_number(&aa.arrays[index], 1);
>  
>   if (nid == 0xffff || nid >= nr_node_ids)
> @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
>   return -1;
>   }
>  
> - min_common_depth = find_min_common_depth();
> + primary_domain_index = find_primary_domain_index();
>  
> - if (min_common_depth < 0) {
> + if (primary_domain_index < 0) {
>   /*
> -  * if we fail to parse min_common_depth from device tree
> +  * if we fail to parse primary_domain_index from device tree
>* mark the numa disabled, boot with numa disabled.
>*/
>   numa_enabled = false;
> - return min_common_depth;
> + return primary_domain_index;
>   }
>  
> - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
> + dbg("NUMA associativity depth for CPU/Memory: %d\n", 
> primary_domain_index);
>  
>   /*
>* Even though we connect cpus to numa domains later in SMP
> @@ -919,14 +919,14 @@ static void __init find_pos

Re: [PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-07-21 Thread David Gibson
if (nid == NUMA_NO_NODE) {
> + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));

What's the memset() for?  AFAICT you only look at vphn_assoc in the
branch where __vphn_get_associativity() succeeds.

> +
> + if (__vphn_get_associativity(i, vphn_assoc) == 0) {
> + nid = associativity_to_nid(vphn_assoc);
> + __initialize_form1_numa_distance(vphn_assoc);
> + } else {
> +
> + /*
> +  * Don't fall back to default_nid yet -- we will plug
> +  * cpus into nodes once the memory scan has discovered
> +  * the topology.
> +  */
>   cpu = of_get_cpu_node(i, NULL);
>   BUG_ON(!cpu);
> - nid = of_node_to_nid_single(cpu);
> +
> + associativity = of_get_associativity(cpu);
> + if (associativity) {
> + nid = associativity_to_nid(associativity);
> + __initialize_form1_numa_distance(associativity);
> + }
>   of_node_put(cpu);
>   }
>  
> @@ -781,8 +859,11 @@ static int __init parse_numa_properties(void)
>* have associativity properties.  If none, then
>* everything goes to default_nid.
>*/
> - nid = of_node_to_nid_single(memory);
> - if (nid < 0)
> + associativity = of_get_associativity(memory);
> + if (associativity) {
> + nid = associativity_to_nid(associativity);
> + __initialize_form1_numa_distance(associativity);
> + } else
>   nid = default_nid;
>  
> fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>   return saved_rc;
>   }
>  
> + update_numa_distance(dn);
> +
>   rc = dlpar_online_cpu(dn);
>   if (rc) {
>   saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 36f66556a7c6..40d350f31a34 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct 
> drmem_lmb *lmb)
>   return -ENODEV;
>   }
>  
> + update_numa_distance(lmb_node);
> +
>   dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>   if (!dr_node) {
>   dlpar_free_cc_nodes(lmb_node);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> b/arch/powerpc/platforms/pseries/pseries.h
> index 1f051a786fb3..663a0859cf13 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
>  void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
> +void update_numa_distance(struct device_node *node);
>  #endif /* _PSERIES_PSERIES_H */

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 5/6] powerpc/pseries: Add a helper for form1 cpu distance

2021-07-21 Thread David Gibson
On Mon, Jun 28, 2021 at 08:41:16PM +0530, Aneesh Kumar K.V wrote:
> This helper is only used with the dispatch trace log collection.
> A later patch will add Form2 affinity support and this change helps
> in keeping that simpler. Also add a comment explaining we don't expect
> the code to be called with FORM0
> 
> Reviewed-by: David Gibson 
> Signed-off-by: Aneesh Kumar K.V 

What makes it a "relative_distance" rather than just a "distance"?

> ---
>  arch/powerpc/include/asm/topology.h   |  4 ++--
>  arch/powerpc/mm/numa.c| 10 +-
>  arch/powerpc/platforms/pseries/lpar.c |  4 ++--
>  3 files changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index e4db64c0e184..ac8b5ed79832 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
>cpu_all_mask : \
>cpumask_of_node(pcibus_to_node(bus)))
>  
> -extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
> +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
>  extern int __node_distance(int, int);
>  #define node_distance(a, b) __node_distance(a, b)
>  
> @@ -83,7 +83,7 @@ static inline void sysfs_remove_device_from_node(struct 
> device *dev,
>  
>  static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) 
> {}
>  
> -static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 
> *cpu2_assoc)
>  {
>   return 0;
>  }
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 7b142f79d600..c6293037a103 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 
> *cpu2_assoc)
>  {
>   int dist = 0;
>  
> @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>   return dist;
>  }
>  
> +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +{
> + /* We should not get called with FORM0 */
> + VM_WARN_ON(affinity_form == FORM0_AFFINITY);
> +
> + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
> +}
> +
>  /* must hold reference to node during call */
>  static const __be32 *of_get_associativity(struct device_node *dev)
>  {
> diff --git a/arch/powerpc/platforms/pseries/lpar.c 
> b/arch/powerpc/platforms/pseries/lpar.c
> index dab356e3ff87..afefbdfe768d 100644
> --- a/arch/powerpc/platforms/pseries/lpar.c
> +++ b/arch/powerpc/platforms/pseries/lpar.c
> @@ -261,7 +261,7 @@ static int cpu_relative_dispatch_distance(int 
> last_disp_cpu, int cur_disp_cpu)
>   if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
>   return -EIO;
>  
> - return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
> + return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
>  }
>  
>  static int cpu_home_node_dispatch_distance(int disp_cpu)
> @@ -281,7 +281,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu)
>   if (!disp_cpu_assoc || !vcpu_assoc)
>   return -EIO;
>  
> - return cpu_distance(disp_cpu_assoc, vcpu_assoc);
> + return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
>  }
>  
>  static void update_vcpu_disp_stat(int disp_cpu)

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 6/6] powerpc/pseries: Add support for FORM2 associativity

2021-07-21 Thread David Gibson
nid = of_node_to_nid_single(node);
> + if (nid == NUMA_NO_NODE)
> + return;
> +
> + /*
> +  * With FORM2 we expect NUMA distance of all possible NUMA
> +  * nodes to be provided during boot.
> +  */
> + WARN(numa_distance_table[nid][nid] == -1,
> +  "NUMA distance details for node %d not provided\n", nid);
> +}
> +
> +/*
> + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..., domainidN}
> + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, ... N elements}
> + */
> +static void initialize_form2_numa_distance_lookup_table(struct device_node 
> *root)
> +{
> + int i, j;
> + const __u8 *numa_dist_table;
> + const __be32 *numa_lookup_index;
> + int numa_dist_table_length;
> + int max_numa_index, distance_index;
> +
> + numa_lookup_index = of_get_property(root, 
> "ibm,numa-lookup-index-table", NULL);
> + max_numa_index = of_read_number(&numa_lookup_index[0], 1);
> +
> + /* first element of the array is the size and is encode-int */
> + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", 
> NULL);
> + numa_dist_table_length = of_read_number((const __be32 
> *)&numa_dist_table[0], 1);
> + /* Skip the size which is encoded int */
> + numa_dist_table += sizeof(__be32);
> +
> + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n",
> +  numa_dist_table_length, max_numa_index);
> +
> + for (i = 0; i < max_numa_index; i++)
> + /* +1 skip the max_numa_index in the property */
> + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 
> 1], 1);
> +
> +
> + if (numa_dist_table_length != max_numa_index * max_numa_index) {
> +
> + WARN(1, "Wrong NUMA distance information\n");
> + /* consider everybody else just remote. */
> + for (i = 0;  i < max_numa_index; i++) {
> + for (j = 0; j < max_numa_index; j++) {
> + int nodeA = numa_id_index_table[i];
> + int nodeB = numa_id_index_table[j];
> +
> + if (nodeA == nodeB)
> + numa_distance_table[nodeA][nodeB] = 
> LOCAL_DISTANCE;
> + else
> + numa_distance_table[nodeA][nodeB] = 
> REMOTE_DISTANCE;
> + }
> + }
> + }
> +
> + distance_index = 0;
> + for (i = 0;  i < max_numa_index; i++) {
> + for (j = 0; j < max_numa_index; j++) {
> + int nodeA = numa_id_index_table[i];
> + int nodeB = numa_id_index_table[j];
> +
> + numa_distance_table[nodeA][nodeB] = 
> numa_dist_table[distance_index++];
> + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, 
> numa_distance_table[nodeA][nodeB]);
> + }
> + }
>  }
>  
>  static int __init find_primary_domain_index(void)
> @@ -323,6 +420,9 @@ static int __init find_primary_domain_index(void)
>*/
>   if (firmware_has_feature(FW_FEATURE_OPAL)) {
>   affinity_form = FORM1_AFFINITY;
> + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
> + dbg("Using form 2 affinity\n");
> + affinity_form = FORM2_AFFINITY;
>   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
>       dbg("Using form 1 affinity\n");
>   affinity_form = FORM1_AFFINITY;
> @@ -367,8 +467,17 @@ static int __init find_primary_domain_index(void)
>  
>   index = of_read_number(&distance_ref_points[1], 1);
>   } else {
> + /*
> +  * Both FORM1 and FORM2 affinity find the primary domain details
> +  * at the same offset.
> +  */
>   index = of_read_number(distance_ref_points, 1);
>   }
> + /*
> +  * If it is FORM2 also initialize the distance table here.
> +  */
> + if (affinity_form == FORM2_AFFINITY)
> + initialize_form2_numa_distance_lookup_table(root);

Ew.  Calling a function called "find_primary_domain_index" to also
initialize the main distance table is needlessly counterintuitive.
Move this call to parse_numa_properties().
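Something along these lines is what I have in mind (rough sketch only;
initialize_form2_numa_distance_lookup_table() would then need to look up
the root node itself, or have it passed in by the caller):

	primary_domain_index = find_primary_domain_index();
	if (primary_domain_index < 0) {
		numa_enabled = false;
		return primary_domain_index;
	}

	if (affinity_form == FORM2_AFFINITY)
		initialize_form2_numa_distance_lookup_table();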
>  
>   /*
>* Warn and cap if the hardware supports more than
> diff --git a/arch/powerpc/platforms/pseries/firmware.c 
> b/arch/powerpc/platforms/pseries/firmware.c
> index 5d4c2bc20bba..f162156b7b68 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -123,6 +123,7 @@ vec5_fw_features_table[] = {
>   {FW_FEATURE_PRRN,   OV5_PRRN},
>   {FW_FEATURE_DRMEM_V2,   OV5_DRMEM_V2},
>   {FW_FEATURE_DRC_INFO,   OV5_DRC_INFO},
> + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY},
>  };
>  
>  static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 2/6] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-07-21 Thread David Gibson
On Thu, Jul 22, 2021 at 10:59:09AM +1000, David Gibson wrote:
> On Mon, Jun 28, 2021 at 08:41:13PM +0530, Aneesh Kumar K.V wrote:
> > No functional change in this patch
> > 
> > Signed-off-by: Aneesh Kumar K.V 
> 
> Reviewed-by: David Gibson 

No... wait, I take that back.  This change makes the code *more*
confusing.

"distance_ref_points_depth" is accurate - it's the length of the
distance_ref_points array.

"max_associativity_domain_index" is not.  That implies it's the
maximum value that a domain index can have - which it isn't.  You
could have 15 entries in every associativity array, but if only 2 of
them are referenced in distance_ref_points, then
"max_associativity_domain_index" would only be 2.

> 
> > ---
> >  arch/powerpc/mm/numa.c | 20 ++--
> >  1 file changed, 10 insertions(+), 10 deletions(-)
> > 
> > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> > index 8365b298ec48..132813dd1a6c 100644
> > --- a/arch/powerpc/mm/numa.c
> > +++ b/arch/powerpc/mm/numa.c
> > @@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
> >  static int form1_affinity;
> >  
> >  #define MAX_DISTANCE_REF_POINTS 4
> > -static int distance_ref_points_depth;
> > +static int max_associativity_domain_index;
> >  static const __be32 *distance_ref_points;
> >  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
> >  
> > @@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> >  
> > int i, index;
> >  
> > -   for (i = 0; i < distance_ref_points_depth; i++) {
> > +   for (i = 0; i < max_associativity_domain_index; i++) {
> > index = be32_to_cpu(distance_ref_points[i]);
> > if (cpu1_assoc[index] == cpu2_assoc[index])
> > break;
> > @@ -193,7 +193,7 @@ int __node_distance(int a, int b)
> > if (!form1_affinity)
> > return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
> >  
> > -   for (i = 0; i < distance_ref_points_depth; i++) {
> > +   for (i = 0; i < max_associativity_domain_index; i++) {
> > if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
> > break;
> >  
> > @@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
> > if (!form1_affinity)
> > return;
> >  
> > -   for (i = 0; i < distance_ref_points_depth; i++) {
> > +   for (i = 0; i < max_associativity_domain_index; i++) {
> > const __be32 *entry;
> >  
> >     entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> > @@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 
> > *associativity)
> > nid = NUMA_NO_NODE;
> >  
> > if (nid > 0 &&
> > -   of_read_number(associativity, 1) >= distance_ref_points_depth) {
> > +   of_read_number(associativity, 1) >= 
> > max_associativity_domain_index) {
> > /*
> >  * Skip the length field and send start of associativity array
> >  */
> > @@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
> >  */
> > distance_ref_points = of_get_property(root,
> > "ibm,associativity-reference-points",
> > -   _ref_points_depth);
> > +   _associativity_domain_index);
> >  
> > if (!distance_ref_points) {
> > dbg("NUMA: ibm,associativity-reference-points not found.\n");
> > goto err;
> > }
> >  
> > -   distance_ref_points_depth /= sizeof(int);
> > +   max_associativity_domain_index /= sizeof(int);
> >  
> > if (firmware_has_feature(FW_FEATURE_OPAL) ||
> > firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> > @@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
> > if (form1_affinity) {
> > index = of_read_number(distance_ref_points, 1);
> > } else {
> > -   if (distance_ref_points_depth < 2) {
> > +   if (max_associativity_domain_index < 2) {
> > printk(KERN_WARNING "NUMA: "
> > "short ibm,associativity-reference-points\n");
> > goto err;
> > @@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
> >  * Warn and cap if the hardware supports more than
> >  * MAX_DISTANCE_REF_POI

Re: [PATCH v8 3/6] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE

2021-07-04 Thread David Gibson
signed long 
> end,
> +  unsigned long pid, unsigned long lpid,
> +  unsigned long page_size,
> +  unsigned long psize)
> +{
> + unsigned long addr;
> + unsigned long ap = mmu_get_ap(psize);
> +
> + for (addr = start; addr < end; addr += page_size)
> + __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
> +
> + fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
> +}
> +
>  static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
> unsigned long psize, unsigned long ric)
>  {
> @@ -549,6 +655,18 @@ static inline void _tlbie_va_range(unsigned long start, 
> unsigned long end,
>   asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long 
> end,
> + unsigned long pid, unsigned long lpid,
> + unsigned long page_size,
> + unsigned long psize, bool also_pwc)
> +{
> + asm volatile("ptesync" : : : "memory");
> + if (also_pwc)
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> + __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
> + asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +}
> +
>  static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
>   unsigned long start, unsigned long end,
>   unsigned long pid, unsigned long page_size,
> @@ -1381,4 +1499,58 @@ extern void radix_kvm_prefetch_workaround(struct 
> mm_struct *mm)
>   }
>  }
>  EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
> +
> +/*
> + * Performs process-scoped invalidations for a given LPID
> + * as part of H_RPT_INVALIDATE hcall.
> + */
> +void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
> +  unsigned long type, unsigned long pg_sizes,
> +  unsigned long start, unsigned long end)
> +{
> + unsigned long psize, nr_pages;
> + struct mmu_psize_def *def;
> + bool flush_pid;
> +
> + /*
> +  * A H_RPTI_TYPE_ALL request implies RIC=3, hence
> +  * do a single IS=1 based flush.
> +  */
> + if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) {
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
> + return;
> + }
> +
> + if (type & H_RPTI_TYPE_PWC)
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> +
> + /* Full PID flush */
> + if (start == 0 && end == -1)
> + return _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
> +
> + /* Do range invalidation for all the valid page sizes */
> + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
> + def = &mmu_psize_defs[psize];
> + if (!(pg_sizes & def->h_rpt_pgsize))
> + continue;
> +
> + nr_pages = (end - start) >> def->shift;
> + flush_pid = nr_pages > tlb_single_page_flush_ceiling;
> +
> + /*
> +  * If the number of pages spanning the range is above
> +  * the ceiling, convert the request into a full PID flush.
> +  * And since PID flush takes out all the page sizes, there
> +  * is no need to consider remaining page sizes.
> +  */
> + if (flush_pid) {
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
> + return;
> + }
> + _tlbie_va_range_lpid(start, end, pid, lpid,
> +  (1UL << def->shift), psize, false);
> + }
> +}
> +EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
> +
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v4 7/7] powerpc/pseries: Add support for FORM2 associativity

2021-06-28 Thread David Gibson
On Thu, Jun 24, 2021 at 01:50:34PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Thu, Jun 17, 2021 at 10:21:05PM +0530, Aneesh Kumar K.V wrote:
> >> PAPR interface currently supports two different ways of communicating 
> >> resource
> >> grouping details to the OS. These are referred to as Form 0 and Form 1
> >> associativity grouping. Form 0 is the older format and is now considered
> >> deprecated. This patch adds another resource grouping named FORM2.
> >> 
> >> Signed-off-by: Daniel Henrique Barboza 
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  Documentation/powerpc/associativity.rst   | 135 
> >>  arch/powerpc/include/asm/firmware.h   |   3 +-
> >>  arch/powerpc/include/asm/prom.h   |   1 +
> >>  arch/powerpc/kernel/prom_init.c   |   3 +-
> >>  arch/powerpc/mm/numa.c| 149 +-
> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >>  6 files changed, 286 insertions(+), 6 deletions(-)
> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> 
> >> diff --git a/Documentation/powerpc/associativity.rst 
> >> b/Documentation/powerpc/associativity.rst
> >> new file mode 100644
> >> index ..93be604ac54d
> >> --- /dev/null
> >> +++ b/Documentation/powerpc/associativity.rst
> >> @@ -0,0 +1,135 @@
> >> +
> >> +NUMA resource associativity
> >> +=
> >> +
> >> +Associativity represents the groupings of the various platform resources 
> >> into
> >> +domains of substantially similar mean performance relative to resources 
> >> outside
> >> +of that domain. Resources subsets of a given domain that exhibit better
> >> +performance relative to each other than relative to other resources 
> >> subsets
> >> +are represented as being members of a sub-grouping domain. This 
> >> performance
> >> +characteristic is presented in terms of NUMA node distance within the 
> >> Linux kernel.
> >> +From the platform view, these groups are also referred to as domains.
> >> +
> >> +PAPR interface currently supports different ways of communicating these 
> >> resource
> >> +grouping details to the OS. These are referred to as Form 0, Form 1 and 
> >> Form2
> >> +associativity grouping. Form 0 is the older format and is now considered 
> >> deprecated.
> >> +
> >> +Hypervisor indicates the type/form of associativity used via 
> >> "ibm,arcitecture-vec-5 property".
> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage 
> >> of Form 0 or Form 1.
> >> +A value of 1 indicates the usage of Form 1 associativity. For Form 2 
> >> associativity
> >> +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
> >> +
> >> +Form 0
> >> +-
> >> +Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
> >> +
> >> +Form 1
> >> +-
> >> +With Form 1 a combination of ibm,associativity-reference-points and 
> >> ibm,associativity
> >> +device tree properties are used to determine the NUMA distance between 
> >> resource groups/domains.
> >> +
> >> +The “ibm,associativity” property contains one or more lists of numbers 
> >> (domainID)
> >> +representing the resource’s platform grouping domains.
> >> +
> >> +The “ibm,associativity-reference-points” property contains one or more 
> >> list of numbers
> >> +(domainID index) that represents the 1 based ordinal in the associativity 
> >> lists.
> >> +The list of domainID index represnets increasing hierachy of
> >> resource grouping.
> >
> > Typo "represnets".  Also s/hierachy/hierarchy/
> >
> >> +
> >> +ex:
> >> +{ primary domainID index, secondary domainID index, tertiary domainID 
> >> index.. }
> >
> >> +Linux kernel uses the domainID at the primary domainID index as the NUMA 
> >> node id.
> >> +Linux kernel computes NUMA distance between two domains by recursively 
> >> comparing
> >> +if they belong to the same higher-level domains. For mismatch at every 
> >> higher
> >> +level of the resource group, the kernel doubles the NUMA distance between 
> >> the
> >> +comparing 

Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-23 Thread David Gibson
On Thu, Jun 17, 2021 at 04:29:01PM +0530, Aneesh Kumar K.V wrote:
> On 6/17/21 1:16 PM, David Gibson wrote:
> > On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
> > > David Gibson  writes:
> > > 
> > > > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> > > > > David Gibson  writes:
> 
> ...
> 
> > > > It's weird to me that you'd want to consider them in different nodes
> > > > for those different purposes.
> > > 
> > > 
> > > --
> > >|NUMA node0 |
> > >|ProcA -> MEMA  |
> > >| | |
> > >|  | |
> > >|  ---> PMEMB|
> > >|   |
> > > ---
> > > 
> > > ---
> > >|NUMA node1 |
> > >|   |
> > >|ProcB ---> MEMC|
> > >|  | |
> > >|  ---> PMEMD|
> > >|   |
> > >|   |
> > > ---
> > > 
> > > For a topology like the above application running of ProcA wants to find 
> > > out
> > > persistent memory mount local to its NUMA node. Hence when using it as
> > > pmem fsdax mount or devdax device we want PMEMB to have associativity
> > > of NUMA node0 and PMEMD to have associativity of NUMA node 1. But when
> > > we want to use it as memory using dax kmem driver, we want both PMEMB
> > > and PMEMD to appear as memory only NUMA node at a distance that is
> > > derived based on the latency of the media.
> > 
> > I'm still not understanding why the latency we care about is different
> > in the two cases.  Can you give an example of when this would result
> > in different actual node assignments for the two different cases?
> > 
> 
> In the above example, in order to allow use of PMEMB and PMEMD as memory-only
> NUMA nodes,
> we need the platform to represent them with their own domainIDs. Let's assume
> the platform assigned ids 40 and 41, and hence both PMEMB and PMEMD will have
> associativity arrays like below
> 
> { 4, 6, 0}  -> PROCA/MEMA
> { 4, 6, 40} -> PMEMB
> { 4, 6, 41} -> PMEMD
> { 4, 6, 1} ->  PROCB/MEMB
> 
> When we want to use these devices PMEMB and PMEMD as fsdax/devdax devices, we
> essentially look for the nearest online node. Which means both PMEMB
> and PMEMD will appear as devices attached to node0. That is not ideal
> for many applications.

Not if you actually look at the distance table which tells you that
PMEMB is closer to node0 and PMEMD is closer to node1.  That's exactly
what the distance table is for - making this information explicit,
rather than intuited from a confusing set of nested domains.
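
To make that concrete with made-up numbers, a FORM2 distance table for the
example topology above could look something like (values purely illustrative):

	/*
	 * Rows/columns indexed by the domain IDs from the example:
	 * 0 = ProcA/MEMA, 1 = ProcB/MEMC, 40 = PMEMB, 41 = PMEMD.
	 */
	static const int example_form2_distance[4][4] = {
		/*          0    1   40   41 */
		/* 0  */ {  10,  40,  80, 160 },
		/* 1  */ {  40,  10, 160,  80 },
		/* 40 */ {  80, 160,  10, 200 },
		/* 41 */ { 160,  80, 200,  10 },
	};

dist[0][40] < dist[1][40] already says PMEMB is nearer to node0, and
dist[1][41] < dist[0][41] that PMEMD is nearer to node1, without having to
read a second meaning into the domain IDs themselves.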

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v4 4/7] powerpc/pseries: Consolidate DLPAR NUMA distance update

2021-06-23 Thread David Gibson
On Thu, Jun 17, 2021 at 10:21:02PM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resources are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call. In a
> later patch we will remove updating NUMA distance when we are looking
> for node id from associativity array.
> 
> Signed-off-by: Aneesh Kumar K.V 

I think this patch and the next would be easier to review if merged
together.  That would make the fact that this is (half of) a code
motion clearer.

> ---
>  arch/powerpc/mm/numa.c| 41 +++
>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |  2 +
>  .../platforms/pseries/hotplug-memory.c|  2 +
>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>  4 files changed, 46 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 0ec16999beef..645a95e3a7ea 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -287,6 +287,47 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> +{
> + int i, nid;
> +
> + if (of_read_number(associativity, 1) >= primary_domain_index) {
> + nid = of_read_number(&associativity[primary_domain_index], 1);
> +
> + for (i = 0; i < max_domain_index; i++) {
> + const __be32 *entry;
> +
> + entry = 
> &associativity[be32_to_cpu(distance_ref_points[i])];
> + distance_lookup_table[nid][i] = of_read_number(entry, 
> 1);
> + }
> + }
> +}
> +
> +static void initialize_form1_numa_distance(struct device_node *node)
> +{
> + const __be32 *associativity;
> +
> + associativity = of_get_associativity(node);
> + if (!associativity)
> + return;
> +
> + __initialize_form1_numa_distance(associativity);
> + return;
> +}
> +
> +/*
> + * Used to update distance information w.r.t newly added node.
> + */
> +void update_numa_distance(struct device_node *node)
> +{
> + if (affinity_form == FORM0_AFFINITY)
> + return;
> + else if (affinity_form == FORM1_AFFINITY) {
> + initialize_form1_numa_distance(node);
> + return;
> + }
> +}
> +
>  static int __init find_primary_domain_index(void)
>  {
>   int index;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>   return saved_rc;
>   }
>  
> + update_numa_distance(dn);
> +
>   rc = dlpar_online_cpu(dn);
>   if (rc) {
>   saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 8377f1f7c78e..0e602c3b01ea 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct 
> drmem_lmb *lmb)
>   return -ENODEV;
>   }
>  
> + update_numa_distance(lmb_node);
> +
>   dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>   if (!dr_node) {
>   dlpar_free_cc_nodes(lmb_node);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> b/arch/powerpc/platforms/pseries/pseries.h
> index 1f051a786fb3..663a0859cf13 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
>  void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
> +void update_numa_distance(struct device_node *node);
>  #endif /* _PSERIES_PSERIES_H */

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v4 2/7] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-06-23 Thread David Gibson
On Thu, Jun 17, 2021 at 10:21:00PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch

I've been convinced of your other rename, but I'm not yet convinced
this one actually clarifies anything.

> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/numa.c | 20 ++--
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 8365b298ec48..132813dd1a6c 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
> -static int distance_ref_points_depth;
> +static int max_associativity_domain_index;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
>  
> @@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  
>   int i, index;
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   index = be32_to_cpu(distance_ref_points[i]);
>   if (cpu1_assoc[index] == cpu2_assoc[index])
>   break;
> @@ -193,7 +193,7 @@ int __node_distance(int a, int b)
>   if (!form1_affinity)
>   return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
>   break;
>  
> @@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>   if (!form1_affinity)
>   return;
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   const __be32 *entry;
>  
> > entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> @@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   nid = NUMA_NO_NODE;
>  
>   if (nid > 0 &&
> - of_read_number(associativity, 1) >= distance_ref_points_depth) {
> + of_read_number(associativity, 1) >= 
> max_associativity_domain_index) {
>   /*
>* Skip the length field and send start of associativity array
>*/
> @@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
>*/
>   distance_ref_points = of_get_property(root,
>   "ibm,associativity-reference-points",
> - _ref_points_depth);
> + _associativity_domain_index);
>  
>   if (!distance_ref_points) {
>   dbg("NUMA: ibm,associativity-reference-points not found.\n");
>   goto err;
>   }
>  
> - distance_ref_points_depth /= sizeof(int);
> + max_associativity_domain_index /= sizeof(int);
>  
>   if (firmware_has_feature(FW_FEATURE_OPAL) ||
>   firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> @@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
>   if (form1_affinity) {
>   index = of_read_number(distance_ref_points, 1);
>   } else {
> - if (distance_ref_points_depth < 2) {
> + if (max_associativity_domain_index < 2) {
>   printk(KERN_WARNING "NUMA: "
>   "short ibm,associativity-reference-points\n");
>   goto err;
> @@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
>* Warn and cap if the hardware supports more than
>* MAX_DISTANCE_REF_POINTS domains.
>*/
> - if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
> + if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS) {
>   printk(KERN_WARNING "NUMA: distance array capped at "
>   "%d entries\n", MAX_DISTANCE_REF_POINTS);
> - distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
> + max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;
>   }
>  
>   of_node_put(root);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v4 7/7] powerpc/pseries: Add support for FORM2 associativity

2021-06-23 Thread David Gibson
 splitting this loop and carrying i over seems a confusing way
to code this.  It's basically two loops of N, one writing a row of the
distance matrix, one writing a column (other_nid will even go through
the same values in each loop).

> + numa_distance = numa_distancep[i];
> + other_nid = numa_id_index_table[other_nid_index++];
> + numa_distance_table[nid][other_nid] = numa_distance;
> + }
> +}
> +
> +/*
> + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..., domainidN}
> + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, ... N elements}
> + */
> +static void initialize_form2_numa_distance_lookup_table(struct device_node 
> *root)
> +{
> + const __u8 *numa_dist_table;
> + const __be32 *numa_lookup_index;
> + int numa_dist_table_length;
> + int max_numa_index, distance_index;
> + int i, curr_row = 0, curr_column = 0;
> +
> + numa_lookup_index = of_get_property(root, 
> "ibm,numa-lookup-index-table", NULL);
> + max_numa_index = of_read_number(&numa_lookup_index[0], 1);

max_numa_index here has a different meaning to max_numa_index in the
previous function, which is pointlessly confusing.

> + /* first element of the array is the size and is encode-int */
> + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", 
> NULL);
> + numa_dist_table_length = of_read_number((const __be32 
> *)&numa_dist_table[0], 1);
> + /* Skip the size which is encoded int */
> + numa_dist_table += sizeof(__be32);
> +
> + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d \n",
> +  numa_dist_table_length, max_numa_index);
> +
> + for (i = 0; i < max_numa_index; i++)
> + /* +1 skip the max_numa_index in the property */
> + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 
> 1], 1);
> +
> +
> + VM_WARN_ON(numa_dist_table_length != max_numa_index * max_numa_index);

Again, you don't actually bail out in this case.  And if it has to
have this value, what's the point of encoding it into the property?
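
e.g. (sketch only):

	if (WARN_ON(numa_dist_table_length != max_numa_index * max_numa_index))
		return;

..or fall back to a LOCAL/REMOTE-only table, but either way don't keep
parsing a table you already know has the wrong size.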

> + for (distance_index = 0; distance_index < numa_dist_table_length; 
> distance_index++) {
> + int nodeA = numa_id_index_table[curr_row];
> + int nodeB = numa_id_index_table[curr_column++];

You've already (sort of) verified that the distance table has size
N^2, in which case you can just do a simple two dimensional loop
rather than having to do ugly calculations of row and column.
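
i.e. something like (untested sketch, reusing the names from the patch):

	for (i = 0; i < max_numa_index; i++) {
		for (j = 0; j < max_numa_index; j++) {
			int nodeA = numa_id_index_table[i];
			int nodeB = numa_id_index_table[j];

			numa_distance_table[nodeA][nodeB] =
				numa_dist_table[i * max_numa_index + j];
		}
	}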

> +
> + numa_distance_table[nodeA][nodeB] = 
> numa_dist_table[distance_index];
> +
> + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, 
> numa_distance_table[nodeA][nodeB]);
> + if (curr_column >= max_numa_index) {
> + curr_row++;
> + /* reset the column */
> + curr_column = 0;
> + }
> + }
>  }
>  
>  static int __init find_primary_domain_index(void)
> @@ -324,6 +453,9 @@ static int __init find_primary_domain_index(void)
>*/
>   if (firmware_has_feature(FW_FEATURE_OPAL)) {
>   affinity_form = FORM1_AFFINITY;
> + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
> + dbg("Using form 2 affinity\n");
> + affinity_form = FORM2_AFFINITY;
>   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
>   dbg("Using form 1 affinity\n");
>   affinity_form = FORM1_AFFINITY;
> @@ -368,8 +500,17 @@ static int __init find_primary_domain_index(void)
>  
>   index = of_read_number(&distance_ref_points[1], 1);
>   } else {
> + /*
> +  * Both FORM1 and FORM2 affinity find the primary domain details
> +  * at the same offset.
> +  */
>   index = of_read_number(distance_ref_points, 1);
>   }
> + /*
> +  * If it is FORM2 also initialize the distance table here.
> +  */
> + if (affinity_form == FORM2_AFFINITY)
> + initialize_form2_numa_distance_lookup_table(root);
>  
>   /*
>* Warn and cap if the hardware supports more than
> diff --git a/arch/powerpc/platforms/pseries/firmware.c 
> b/arch/powerpc/platforms/pseries/firmware.c
> index 5d4c2bc20bba..f162156b7b68 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -123,6 +123,7 @@ vec5_fw_features_table[] = {
>   {FW_FEATURE_PRRN,   OV5_PRRN},
>   {FW_FEATURE_DRMEM_V2,   OV5_DRMEM_V2},
>   {FW_FEATURE_DRC_INFO,   OV5_DRC_INFO},
> + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY},
>  };
>  
>  static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v4 1/7] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-06-23 Thread David Gibson
On Thu, Jun 17, 2021 at 10:20:59PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch.
> 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/mm/numa.c | 38 +++---
>  1 file changed, 19 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index f2bf98bdcea2..8365b298ec48 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
>  EXPORT_SYMBOL(node_to_cpumask_map);
>  EXPORT_SYMBOL(node_data);
>  
> -static int min_common_depth;
> +static int primary_domain_index;
>  static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
> @@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   if (!numa_enabled)
>   goto out;
>  
> - if (of_read_number(associativity, 1) >= min_common_depth)
> - nid = of_read_number(&associativity[min_common_depth], 1);
> + if (of_read_number(associativity, 1) >= primary_domain_index)
> + nid = of_read_number(&associativity[primary_domain_index], 1);
>  
>   /* POWER4 LPAR uses 0xffff as invalid node */
>   if (nid == 0xffff || nid >= nr_node_ids)
> @@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static int __init find_min_common_depth(void)
> +static int __init find_primary_domain_index(void)
>  {
> - int depth;
> + int index;
>   struct device_node *root;
>  
>   if (firmware_has_feature(FW_FEATURE_OPAL))
> @@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
>   }
>  
>   if (form1_affinity) {
> - depth = of_read_number(distance_ref_points, 1);
> + index = of_read_number(distance_ref_points, 1);
>   } else {
>   if (distance_ref_points_depth < 2) {
>   printk(KERN_WARNING "NUMA: "
> @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
>   goto err;
>   }
>  
> - depth = of_read_number(&distance_ref_points[1], 1);
> + index = of_read_number(&distance_ref_points[1], 1);
>   }
>  
>   /*
> @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
>   }
>  
>   of_node_put(root);
> - return depth;
> + return index;
>  
>  err:
>   of_node_put(root);
> @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>   int nid = default_nid;
>   int rc, index;
>  
> - if ((min_common_depth < 0) || !numa_enabled)
> + if ((primary_domain_index < 0) || !numa_enabled)
>   return default_nid;
>  
>   rc = of_get_assoc_arrays(&aa);
>   if (rc)
>   return default_nid;
>  
> - if (min_common_depth <= aa.array_sz &&
> + if (primary_domain_index <= aa.array_sz &&
>   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> aa.n_arrays) {
> - index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> + index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>   nid = of_read_number(&aa.arrays[index], 1);
>  
>   if (nid == 0xffff || nid >= nr_node_ids)
> @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
>   return -1;
>   }
>  
> - min_common_depth = find_min_common_depth();
> + primary_domain_index = find_primary_domain_index();
>  
> - if (min_common_depth < 0) {
> + if (primary_domain_index < 0) {
>   /*
> -  * if we fail to parse min_common_depth from device tree
> +  * if we fail to parse primary_domain_index from device tree
>* mark the numa disabled, boot with numa disabled.
>*/
>   numa_enabled = false;
> - return min_common_depth;
> + return primary_domain_index;
>   }
>  
> - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
> + dbg("NUMA associativity depth for CPU/Memory: %d\n", 
> primary_domain_index);
>  
>   /*
>* Even though we connect cpus to numa domains later in SMP
> @@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
>   goto out;
>   }
>  
> - max_nodes = of_read_number(&domains[min_common_depth], 1);
> + max_nodes = of_read_number(&domains[primary_domain_index], 1);
>   for (i = 0; i < max_nodes; i++) {
>   

Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-17 Thread David Gibson
On Tue, Jun 15, 2021 at 12:35:17PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson  writes:
> >> 
> >> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> >> >> FORM2 introduces a concept of secondary domain which is identical to the
> >> >> concept of FORM1 primary domain. Use secondary domain as the numa node
> >> >> when using persistent memory device. For DAX kmem use the logical domain
> >> >> id introduced in FORM2. This new numa node
> >> >> 
> >> >> Signed-off-by: Aneesh Kumar K.V 
> >> >> ---
> >> >>  arch/powerpc/mm/numa.c| 28 +++
> >> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +
> >> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
> >> >>  3 files changed, 45 insertions(+), 10 deletions(-)
> >> >> 
> >> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> >> index 86cd2af014f7..b9ac6d02e944 100644
> >> >> --- a/arch/powerpc/mm/numa.c
> >> >> +++ b/arch/powerpc/mm/numa.c
> >> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 
> >> >> *associativity)
> >> >> return nid;
> >> >>  }
> >> >>  
> >> >> +int get_primary_and_secondary_domain(struct device_node *node, int 
> >> >> *primary, int *secondary)
> >> >> +{
> >> >> +   int secondary_index;
> >> >> +   const __be32 *associativity;
> >> >> +
> >> >> +   if (!numa_enabled) {
> >> >> +   *primary = NUMA_NO_NODE;
> >> >> +   *secondary = NUMA_NO_NODE;
> >> >> +   return 0;
> >> >> +   }
> >> >> +
> >> >> +   associativity = of_get_associativity(node);
> >> >> +   if (!associativity)
> >> >> +   return -ENODEV;
> >> >> +
> >> >> +   if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> >> +   *primary = 
> >> >> of_read_number(&associativity[primary_domain_index], 1);
> >> >> +   secondary_index = 
> >> >> of_read_number(&distance_ref_points[1], 1);
> >> >
> >> > Secondary ID is always the second reference point, but primary depends
> >> > on the length of resources?  That seems very weird.
> >> 
> >> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> >> both primary and secondary domain ID same for all resources other than
> >> persistent memory device. The usage w.r.t. persistent memory is
> >> explained in patch 7.
> >
> > Right, I misunderstood
> >
> >> 
> >> With Form2 the primary domainID and secondary domainID are used to 
> >> identify the NUMA nodes
> >> the kernel should use when using persistent memory devices.
> >
> > This seems kind of bogus.  With Form1, the primary/secondary ID are a
> > sort of hierarchy of distance (things with same primary ID are very
> > close, things with same secondary are kinda-close, etc.).  With Form2,
> > it's referring to their effective node for different purposes.
> >
> > Using the same terms for different meanings seems unnecessarily
> > confusing.
> 
> They are essentially domainIDs. The interpretation of them are different
> between Form1 and Form2. Hence I kept referring to them as primary and
> secondary domainID. Any suggestion on what to name them with Form2?

My point is that reusing associativity-reference-points for something
with completely unrelated semantics seems like a very poor choice.

> >> Persistent memory devices
> >> can also be used as regular memory using DAX KMEM driver and primary 
> >> domainID indicates
> >> the numa node number OS should use when using these devices as regular 
> >> memory. Secondary
> >> domainID is the numa node number that should be used when using this 
> >> device as
> >> persistent memory.
> >
> > It's weird to me that you'd want to consider them in different nodes
> > for those different purposes.
> 
> 
>--
>   |NUMA node0 |
>   |ProcA -> MEMA  |
>   | |   

Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity

2021-06-17 Thread David Gibson
On Tue, Jun 15, 2021 at 01:10:27PM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Tue, Jun 15, 2021 at 10:58:42AM +0530, Aneesh Kumar K.V wrote:
> >> David Gibson  writes:
> >> 
> >> > On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
> >> >> Signed-off-by: Daniel Henrique Barboza 
> >> >> Signed-off-by: Aneesh Kumar K.V 
> >> >> ---
> >> >>  Documentation/powerpc/associativity.rst   | 139 
> >> >>  arch/powerpc/include/asm/firmware.h   |   3 +-
> >> >>  arch/powerpc/include/asm/prom.h   |   1 +
> >> >>  arch/powerpc/kernel/prom_init.c   |   3 +-
> >> >>  arch/powerpc/mm/numa.c| 149 +-
> >> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >> >>  6 files changed, 290 insertions(+), 6 deletions(-)
> >> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> >> 
> >> >> diff --git a/Documentation/powerpc/associativity.rst 
> >> >> b/Documentation/powerpc/associativity.rst
> >> >> new file mode 100644
> >> >> index ..58abedea81d7
> >> >> --- /dev/null
> >> >> +++ b/Documentation/powerpc/associativity.rst
> >> >> @@ -0,0 +1,139 @@
> >> >> +
> >> >> +NUMA resource associativity
> >> >> +=
> >> >> +
> >> >> +Associativity represents the groupings of the various platform 
> >> >> resources into
> >> >> +domains of substantially similar mean performance relative to 
> >> >> resources outside
> >> >> +of that domain. Resources subsets of a given domain that exhibit better
> >> >> +performance relative to each other than relative to other resources 
> >> >> subsets
> >> >> +are represented as being members of a sub-grouping domain. This 
> >> >> performance
> >> >> +characteristic is presented in terms of NUMA node distance within the 
> >> >> Linux kernel.
> >> >> +From the platform view, these groups are also referred to as domains.
> >> >> +
> >> >> +PAPR interface currently supports two different ways of communicating 
> >> >> these resource
> >> >
> >> > You describe form 2 below as well, which contradicts this.
> >> 
> >> Fixed as below.
> >> 
> >> PAPR interface currently supports different ways of communicating these 
> >> resource
> >> grouping details to the OS. These are referred to as Form 0, Form 1 and 
> >> Form2
> >> associativity grouping. Form 0 is the older format and is now considered 
> >> deprecated.
> >> 
> >> Hypervisor indicates the type/form of associativity used via 
> >> "ibm,arcitecture-vec-5 property".
> >> Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage 
> >> of Form 0 or Form 1.
> >> A value of 1 indicates the usage of Form 1 associativity. For Form 2 
> >> associativity
> >> bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
> >
> > LGTM.
> >
> >> >> +grouping details to the OS. These are referred to as Form 0 and Form 1 
> >> >> associativity grouping.
> >> >> +Form 0 is the older format and is now considered deprecated.
> >> >> +
> >> >> +Hypervisor indicates the type/form of associativity used via 
> >> >> "ibm,arcitecture-vec-5 property".
> >> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates 
> >> >> usage of Form 0 or Form 1.
> >> >> +A value of 1 indicates the usage of Form 1 associativity.
> >> >> +
> >> >> +Form 0
> >> >> +-
> >> >> +Form 0 associativity supports only two NUMA distances (LOCAL and 
> >> >> REMOTE).
> >> >> +
> >> >> +Form 1
> >> >> +-
> >> >> +With Form 1 a combination of ibm,associativity-reference-points and 
> >> >> ibm,associativity
> >> >> +device tree properties are used to determine the NUMA distance between 
> >> >> resource groups/domains. 
> >> >> +
> >> >> +The “ibm,associativity” property contains

Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-15 Thread David Gibson
On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> >> FORM2 introduces a concept of secondary domain which is identical to the
> >> concept of FORM1 primary domain. Use secondary domain as the numa node
> >> when using persistent memory device. For DAX kmem use the logical domain
> >> id introduced in FORM2. This new numa node
> >> 
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  arch/powerpc/mm/numa.c| 28 +++
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +
> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
> >>  3 files changed, 45 insertions(+), 10 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index 86cd2af014f7..b9ac6d02e944 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 
> >> *associativity)
> >>return nid;
> >>  }
> >>  
> >> +int get_primary_and_secondary_domain(struct device_node *node, int 
> >> *primary, int *secondary)
> >> +{
> >> +  int secondary_index;
> >> +  const __be32 *associativity;
> >> +
> >> +  if (!numa_enabled) {
> >> +  *primary = NUMA_NO_NODE;
> >> +  *secondary = NUMA_NO_NODE;
> >> +  return 0;
> >> +  }
> >> +
> >> +  associativity = of_get_associativity(node);
> >> +  if (!associativity)
> >> +  return -ENODEV;
> >> +
> >> +  if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> +  *primary = of_read_number(&associativity[primary_domain_index], 
> >> 1);
> >> +  secondary_index = of_read_number(&distance_ref_points[1], 1);
> >
> > Secondary ID is always the second reference point, but primary depends
> > on the length of resources?  That seems very weird.
> 
> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> both primary and secondary domain ID same for all resources other than
> persistent memory device. The usage w.r.t. persistent memory is
> explained in patch 7.

Right, I misunderstood

> 
> With Form2 the primary domainID and secondary domainID are used to identify 
> the NUMA nodes
> the kernel should use when using persistent memory devices.

This seems kind of bogus.  With Form1, the primary/secondary ID are a
sort of hierarchy of distance (things with same primary ID are very
close, things with same secondary are kinda-close, etc.).  With Form2,
it's referring to their effective node for different purposes.

Using the same terms for different meanings seems unnecessarily
confusing.

> Persistent memory devices
> can also be used as regular memory using DAX KMEM driver and primary domainID 
> indicates
> the numa node number OS should use when using these devices as regular 
> memory. Secondary
> domainID is the numa node number that should be used when using this device as
> persistent memory.

It's weird to me that you'd want to consider them in different nodes
for those different purposes.

> In the latter case, we are interested in the locality of the
> device to an established numa node. In the above example, if the last row 
> represents a
> persistent memory device/resource, NUMA node number 40 will be used when 
> using the device
> as regular memory and NUMA node number 0 will be the device numa node when 
> using it as
> a persistent memory device.

I don't really get what you mean by "locality of the device to an
established numa node".  Or at least how that's different from
anything else we're handling here.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity

2021-06-15 Thread David Gibson
On Tue, Jun 15, 2021 at 10:58:42AM +0530, Aneesh Kumar K.V wrote:
> David Gibson  writes:
> 
> > On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
> >> Signed-off-by: Daniel Henrique Barboza 
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  Documentation/powerpc/associativity.rst   | 139 
> >>  arch/powerpc/include/asm/firmware.h   |   3 +-
> >>  arch/powerpc/include/asm/prom.h   |   1 +
> >>  arch/powerpc/kernel/prom_init.c   |   3 +-
> >>  arch/powerpc/mm/numa.c| 149 +-
> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >>  6 files changed, 290 insertions(+), 6 deletions(-)
> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> 
> >> diff --git a/Documentation/powerpc/associativity.rst 
> >> b/Documentation/powerpc/associativity.rst
> >> new file mode 100644
> >> index ..58abedea81d7
> >> --- /dev/null
> >> +++ b/Documentation/powerpc/associativity.rst
> >> @@ -0,0 +1,139 @@
> >> +
> >> +NUMA resource associativity
> >> +=
> >> +
> >> +Associativity represents the groupings of the various platform resources 
> >> into
> >> +domains of substantially similar mean performance relative to resources 
> >> outside
> >> +of that domain. Resources subsets of a given domain that exhibit better
> >> +performance relative to each other than relative to other resources 
> >> subsets
> >> +are represented as being members of a sub-grouping domain. This 
> >> performance
> >> +characteristic is presented in terms of NUMA node distance within the 
> >> Linux kernel.
> >> +From the platform view, these groups are also referred to as domains.
> >> +
> >> +PAPR interface currently supports two different ways of communicating 
> >> these resource
> >
> > You describe form 2 below as well, which contradicts this.
> 
> Fixed as below.
> 
> PAPR interface currently supports different ways of communicating these 
> resource
> grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
> associativity grouping. Form 0 is the older format and is now considered 
> deprecated.
> 
> Hypervisor indicates the type/form of associativity used via 
> "ibm,arcitecture-vec-5 property".
> Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of 
> Form 0 or Form 1.
> A value of 1 indicates the usage of Form 1 associativity. For Form 2 
> associativity
> bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.

LGTM.

> >> +grouping details to the OS. These are referred to as Form 0 and Form 1 
> >> associativity grouping.
> >> +Form 0 is the older format and is now considered deprecated.
> >> +
> >> +Hypervisor indicates the type/form of associativity used via 
> >> "ibm,arcitecture-vec-5 property".
> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage 
> >> of Form 0 or Form 1.
> >> +A value of 1 indicates the usage of Form 1 associativity.
> >> +
> >> +Form 0
> >> +-
> >> +Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
> >> +
> >> +Form 1
> >> +-
> >> +With Form 1 a combination of ibm,associativity-reference-points and 
> >> ibm,associativity
> >> +device tree properties are used to determine the NUMA distance between 
> >> resource groups/domains. 
> >> +
> >> +The “ibm,associativity” property contains one or more lists of numbers 
> >> (domainID)
> >> +representing the resource’s platform grouping domains.
> >> +
> >> +The “ibm,associativity-reference-points” property contains one or more 
> >> lists of numbers
> >> +(domain index) that represents the 1 based ordinal in the associativity 
> >> lists of the most
> >> +significant boundary, with subsequent entries indicating progressively 
> >> less significant boundaries.
> >> +
> >> +Linux kernel uses the domain id of the most significant boundary (aka 
> >> primary domain)
> >
> > I thought we used the *least* significant boundary (the smallest
> > grouping, not the largest).  That is, the last index, not the first.
> >
> > Actually... come to think of it, I'm not even sure how to interpret
> > "mo

Re: [RFC PATCH 6/8] powerpc/pseries: Add a helper for form1 cpu distance

2021-06-14 Thread David Gibson
On Mon, Jun 14, 2021 at 10:10:01PM +0530, Aneesh Kumar K.V wrote:
> This helper is only used with the dispatch trace log collection.
> A later patch will add Form2 affinity support and this change helps
> in keeping that simpler. Also add a comment explaining we don't expect
> the code to be called with FORM0
> 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/mm/numa.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 64caaf07cf82..696e5bfe1414 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
>   int dist = 0;
>  
> @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>   return dist;
>  }
>  
> +int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +{
> + /* We should not get called with FORM0 */
> + VM_WARN_ON(affinity_form == FORM0_AFFINITY);
> +
> + return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
> +}
> +
>  /* must hold reference to node during call */
>  static const __be32 *of_get_associativity(struct device_node *dev)
>  {

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details

2021-06-14 Thread David Gibson
>   p = kzalloc(sizeof(*p), GFP_KERNEL);
>   if (!p)
>   return -ENOMEM;
>  
> + if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
> + dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
> + rc = -ENODEV;
> + goto err;
> + }
> + p->numa_node = numa_map_to_online_node(numa_node);
> + if (numa_node != p->numa_node)
> + dev_info(>dev, "Region registered with online node %d and 
> device tree node %d",
> +  p->numa_node, numa_node);
> +
>   /* Initialize the dimm mutex */
>   mutex_init(>health_mutex);
>  
> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> b/arch/powerpc/platforms/pseries/pseries.h
> index 663a0859cf13..9c2a1fc9ded1 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -114,4 +114,5 @@ void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
>  void update_numa_distance(struct device_node *node);
> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, 
> int *secondary);
>  #endif /* _PSERIES_PSERIES_H */

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity

2021-06-14 Thread David Gibson
stance_index < numa_dist_table_length; 
> distance_index++) {
> + int nodeA = numa_id_index_table[curr_row];
> + int nodeB = numa_id_index_table[curr_column++];
> +
> + numa_distance_table[nodeA][nodeB] = 
> numa_dist_table[distance_index];
> +
> + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, 
> numa_distance_table[nodeA][nodeB]);
> + if (curr_column >= max_numa_index) {
> + curr_row++;
> + /* reset the column */
> + curr_column = 0;
> + }
> + }
>  }
>  
>  static int __init find_primary_domain_index(void)
> @@ -324,6 +453,9 @@ static int __init find_primary_domain_index(void)
>*/
>   if (firmware_has_feature(FW_FEATURE_OPAL)) {
>   affinity_form = FORM1_AFFINITY;
> + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
> + dbg("Using form 2 affinity\n");
> + affinity_form = FORM2_AFFINITY;
>   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
>   dbg("Using form 1 affinity\n");
>   affinity_form = FORM1_AFFINITY;
> @@ -368,8 +500,17 @@ static int __init find_primary_domain_index(void)
>  
>   index = of_read_number(&distance_ref_points[1], 1);
>   } else {
> + /*
> +  * Both FORM1 and FORM2 affinity find the primary domain details
> +  * at the same offset.
> +  */
>   index = of_read_number(distance_ref_points, 1);
>   }
> + /*
> +  * If it is FORM2 also initialize the distance table here.
> +  */
> + if (affinity_form == FORM2_AFFINITY)
> + initialize_form2_numa_distance_lookup_table(root);
>  
>   /*
>* Warn and cap if the hardware supports more than
> diff --git a/arch/powerpc/platforms/pseries/firmware.c 
> b/arch/powerpc/platforms/pseries/firmware.c
> index 5d4c2bc20bba..f162156b7b68 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -123,6 +123,7 @@ vec5_fw_features_table[] = {
>   {FW_FEATURE_PRRN,   OV5_PRRN},
>   {FW_FEATURE_DRMEM_V2,   OV5_DRMEM_V2},
>   {FW_FEATURE_DRC_INFO,   OV5_DRC_INFO},
> + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY},
>  };
>  
>  static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
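
To make the row-major walk above easier to follow, here is a throwaway
userspace mock-up of the same loop (all values are invented, none of this is
kernel code):

	#include <stdio.h>

	int main(void)
	{
		int max_numa_index = 3;
		int numa_id_index_table[3] = { 0, 1, 40 };	/* logical domain ids */
		int numa_dist_table[9] = {			/* flattened 3x3 matrix */
			10, 20, 80,
			20, 10, 80,
			80, 80, 10,
		};
		static int dist[41][41];
		int curr_row = 0, curr_column = 0;
		int i;

		for (i = 0; i < max_numa_index * max_numa_index; i++) {
			int nodeA = numa_id_index_table[curr_row];
			int nodeB = numa_id_index_table[curr_column++];

			dist[nodeA][nodeB] = numa_dist_table[i];
			if (curr_column >= max_numa_index) {
				curr_row++;
				curr_column = 0;
			}
		}

		/* prints: dist[0][40]=80 dist[40][40]=10 */
		printf("dist[0][40]=%d dist[40][40]=%d\n", dist[0][40], dist[40][40]);
		return 0;
	}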

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [RFC PATCH 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY

2021-06-14 Thread David Gibson
On Mon, Jun 14, 2021 at 10:09:58PM +0530, Aneesh Kumar K.V wrote:
> Also make related code cleanup that will allow adding FORM2_AFFINITY in
> later patches. No functional change in this patch.
> 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/include/asm/firmware.h   |  4 +--
>  arch/powerpc/include/asm/prom.h   |  2 +-
>  arch/powerpc/kernel/prom_init.c   |  2 +-
>  arch/powerpc/mm/numa.c| 35 ++-
>  arch/powerpc/platforms/pseries/firmware.c |  2 +-
>  5 files changed, 26 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/firmware.h 
> b/arch/powerpc/include/asm/firmware.h
> index 7604673787d6..60b631161360 100644
> --- a/arch/powerpc/include/asm/firmware.h
> +++ b/arch/powerpc/include/asm/firmware.h
> @@ -44,7 +44,7 @@
>  #define FW_FEATURE_OPAL  ASM_CONST(0x1000)
>  #define FW_FEATURE_SET_MODE  ASM_CONST(0x4000)
>  #define FW_FEATURE_BEST_ENERGY   ASM_CONST(0x8000)
> -#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0001)
> +#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0001)
>  #define FW_FEATURE_PRRN  ASM_CONST(0x0002)
>  #define FW_FEATURE_DRMEM_V2  ASM_CONST(0x0004)
>  #define FW_FEATURE_DRC_INFO  ASM_CONST(0x0008)
> @@ -69,7 +69,7 @@ enum {
>   FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
>   FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
>   FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
> - FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
> + FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
>   FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
>   FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
>   FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
> index 324a13351749..df9fec9d232c 100644
> --- a/arch/powerpc/include/asm/prom.h
> +++ b/arch/powerpc/include/asm/prom.h
> @@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
>  #define OV5_MSI  0x0201  /* PCIe/MSI support */
>  #define OV5_CMO  0x0480  /* Cooperative Memory 
> Overcommitment */
>  #define OV5_XCMO 0x0440  /* Page Coalescing */
> -#define OV5_TYPE1_AFFINITY   0x0580  /* Type 1 NUMA affinity */
> +#define OV5_FORM1_AFFINITY   0x0580  /* FORM1 NUMA affinity */
>  #define OV5_PRRN 0x0540  /* Platform Resource Reassignment */
>  #define OV5_HP_EVT   0x0604  /* Hot Plug Event support */
>  #define OV5_RESIZE_HPT   0x0601  /* Hash Page Table resizing */
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index 41ed7e33d897..64b9593038a7 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -1070,7 +1070,7 @@ static const struct ibm_arch_vec 
> ibm_architecture_vec_template __initconst = {
>  #else
>   0,
>  #endif
> - .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | 
> OV5_FEAT(OV5_PRRN),
> + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | 
> OV5_FEAT(OV5_PRRN),
>   .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
>   .micro_checkpoint = 0,
>   .reserved0 = 0,
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 5941da201fa3..192067991f8a 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
>  
>  static int primary_domain_index;
>  static int n_mem_addr_cells, n_mem_size_cells;
> -static int form1_affinity;
> +
> +#define FORM0_AFFINITY 0
> +#define FORM1_AFFINITY 1
> +static int affinity_form;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
>  static int max_domain_index;
> @@ -190,7 +193,7 @@ int __node_distance(int a, int b)
>   int i;
>   int distance = LOCAL_DISTANCE;
>  
> - if (!form1_affinity)
> + if (affinity_form == FORM0_AFFINITY)
>   return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
>   for (i = 0; i < max_domain_index; i++) {
> @@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>  {
>   int i;
>  
> - if (!form1_affinity)
> + if (affinity_form != FORM1_AFFINITY)
>   return;
>  
>   for (i = 0; i < max_domain_index; i++) {
> @@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
>   int index;
>   struct device_node *root;
>  
> + /*
> +  * Check 

Re: [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-06-14 Thread David Gibson
 max_nodes = of_read_number(&domains[primary_domain_index], 1);
>   for (i = 0; i < max_nodes; i++) {
>   if (!node_possible(i))
>   node_set(i, node_possible_map);
>   }
>  
>   prop_length /= sizeof(int);
> - if (prop_length > min_common_depth + 2)
> + if (prop_length > primary_domain_index + 2)
>   coregroup_enabled = 1;
>  
>  out:
> @@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
>   goto out;
>  
>   index = of_read_number(associativity, 1);
> - if (index > min_common_depth + 1)
> + if (index > primary_domain_index + 1)
>   return of_read_number(&associativity[index - 1], 1);
>  
>  out:

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update

2021-06-14 Thread David Gibson
On Mon, Jun 14, 2021 at 10:09:59PM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resources are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call. In a
> later patch we will remove updating NUMA distance when we are looking
> for node id from associativity array.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/numa.c| 41 +++
>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |  2 +
>  .../platforms/pseries/hotplug-memory.c|  2 +
>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>  4 files changed, 46 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 192067991f8a..fec47981c1ef 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -287,6 +287,47 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> +{
> + int i, nid;
> +
> + if (of_read_number(associativity, 1) >= primary_domain_index) {
> + nid = of_read_number(&associativity[primary_domain_index], 1);
> +
> + for (i = 0; i < max_domain_index; i++) {
> + const __be32 *entry;
> +
> + entry = 
> &associativity[be32_to_cpu(distance_ref_points[i])];
> + distance_lookup_table[nid][i] = of_read_number(entry, 
> 1);
> + }
> + }
> +}

This logic is almost identical to initialize_distance_lookup_table()
- it would be good if they could be consolidated, so it's clear that
coldplugged and hotplugged nodes are parsing the NUMA information in
the same way.
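
Roughly what I have in mind is a single parser both paths share - an untested
sketch only, and note the existing coldplug caller indexes the associativity
array with a "- 1" offset, so the conversion would have to reconcile that:

	/* Shared helper; takes the raw associativity array including the
	 * leading length cell, as the hotplug version in this patch does. */
	static void __initialize_form1_numa_distance(const __be32 *associativity)
	{
		int i, nid;

		if (of_read_number(associativity, 1) < primary_domain_index)
			return;

		nid = of_read_number(&associativity[primary_domain_index], 1);

		for (i = 0; i < max_domain_index; i++) {
			const __be32 *entry;

			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
			distance_lookup_table[nid][i] = of_read_number(entry, 1);
		}
	}

	/* initialize_distance_lookup_table() and initialize_form1_numa_distance()
	 * then become thin wrappers around the helper above, so the coldplug and
	 * hotplug parsing can't drift apart. */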

> +
> +static void initialize_form1_numa_distance(struct device_node *node)
> +{
> + const __be32 *associativity;
> +
> + associativity = of_get_associativity(node);
> + if (!associativity)
> + return;
> +
> + __initialize_form1_numa_distance(associativity);
> + return;
> +}
> +
> +/*
> + * Used to update distance information w.r.t newly added node.
> + */
> +void update_numa_distance(struct device_node *node)
> +{
> + if (affinity_form == FORM0_AFFINITY)
> + return;
> + else if (affinity_form == FORM1_AFFINITY) {
> + initialize_form1_numa_distance(node);
> + return;
> + }
> +}
> +
>  static int __init find_primary_domain_index(void)
>  {
>   int index;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>   return saved_rc;
>   }
>  
> + update_numa_distance(dn);
> +
>   rc = dlpar_online_cpu(dn);
>   if (rc) {
>   saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 8377f1f7c78e..0e602c3b01ea 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct 
> drmem_lmb *lmb)
>   return -ENODEV;
>   }
>  
> + update_numa_distance(lmb_node);
> +
>   dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>   if (!dr_node) {
>   dlpar_free_cc_nodes(lmb_node);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> b/arch/powerpc/platforms/pseries/pseries.h
> index 1f051a786fb3..663a0859cf13 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
>  void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
> +void update_numa_distance(struct device_node *node);
>  #endif /* _PSERIES_PSERIES_H */

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index

2021-06-14 Thread David Gibson
On Mon, Jun 14, 2021 at 10:09:57PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch

As with 1/8 an explanation of what this actually means and therefore
why this is a better name would be very helpful.

> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/numa.c | 20 ++--
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 8365b298ec48..5941da201fa3 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
> -static int distance_ref_points_depth;
> +static int max_domain_index;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
>  
> @@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  
>   int i, index;
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_domain_index; i++) {
>   index = be32_to_cpu(distance_ref_points[i]);
>   if (cpu1_assoc[index] == cpu2_assoc[index])
>   break;
> @@ -193,7 +193,7 @@ int __node_distance(int a, int b)
>   if (!form1_affinity)
>   return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_domain_index; i++) {
>   if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
>   break;
>  
> @@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>   if (!form1_affinity)
>   return;
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_domain_index; i++) {
>   const __be32 *entry;
>  
>   entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> @@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   nid = NUMA_NO_NODE;
>  
>   if (nid > 0 &&
> - of_read_number(associativity, 1) >= distance_ref_points_depth) {
> + of_read_number(associativity, 1) >= max_domain_index) {
>   /*
>* Skip the length field and send start of associativity array
>*/
> @@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
>*/
>   distance_ref_points = of_get_property(root,
>   "ibm,associativity-reference-points",
> - &distance_ref_points_depth);
> + &max_domain_index);
>  
>   if (!distance_ref_points) {
>   dbg("NUMA: ibm,associativity-reference-points not found.\n");
>   goto err;
>   }
>  
> - distance_ref_points_depth /= sizeof(int);
> + max_domain_index /= sizeof(int);
>  
>   if (firmware_has_feature(FW_FEATURE_OPAL) ||
>   firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> @@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
>   if (form1_affinity) {
>   index = of_read_number(distance_ref_points, 1);
>   } else {
> - if (distance_ref_points_depth < 2) {
> + if (max_domain_index < 2) {
>   printk(KERN_WARNING "NUMA: "
>   "short ibm,associativity-reference-points\n");
>   goto err;
> @@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
>* Warn and cap if the hardware supports more than
>* MAX_DISTANCE_REF_POINTS domains.
>*/
> - if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
> + if (max_domain_index > MAX_DISTANCE_REF_POINTS) {
>   printk(KERN_WARNING "NUMA: distance array capped at "
>   "%d entries\n", MAX_DISTANCE_REF_POINTS);
> - distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
> + max_domain_index = MAX_DISTANCE_REF_POINTS;
>   }
>  
>   of_node_put(root);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-06-09 Thread David Gibson
On Wed, Jun 09, 2021 at 02:51:49AM -0300, Leonardo Brás wrote:
> On Wed, 2021-06-09 at 14:40 +1000, David Gibson wrote:
> > On Tue, Jun 08, 2021 at 09:52:10PM -0300, Leonardo Brás wrote:
> > > On Mon, 2021-06-07 at 15:02 +1000, David Gibson wrote:
> > > > On Fri, Apr 30, 2021 at 11:36:06AM -0300, Leonardo Bras wrote:
> > > > > Because hypervisors may need to create HPTs without knowing the
> > > > > guest
> > > > > page size, the smallest used page-size (4k) may be chosen,
> > > > > resulting in
> > > > > a HPT that is possibly bigger than needed.
> > > > > 
> > > > > On a guest with bigger page-sizes, the amount of entries for
> > > > > HPT
> > > > > may be
> > > > > too high, causing the guest to ask for a HPT resize-down on the
> > > > > first
> > > > > hotplug.
> > > > > 
> > > > > This becomes a problem when HPT resize-down fails, and causes
> > > > > the
> > > > > HPT resize to be performed on every LMB added, until HPT size
> > > > > is
> > > > > compatible to guest memory size, causing a major slowdown.
> > > > > 
> > > > > So, avoiding HPT resizing-down on hot-add significantly
> > > > > improves
> > > > > memory
> > > > > hotplug times.
> > > > > 
> > > > > As an example, hotplugging 256GB on a 129GB guest took 710s
> > > > > without
> > > > > this
> > > > > patch, and 21s after applied.
> > > > > 
> > > > > Signed-off-by: Leonardo Bras 
> > > > 
> > > > Sorry it's taken me so long to look at these
> > > > 
> > > > I don't love the extra statefulness that the 'shrinking'
> > > > parameter
> > > > adds, but I can't see an elegant way to avoid it, so:
> > > > 
> > > > Reviewed-by: David Gibson 
> > > 
> > > np, thanks for reviewing!
> > 
> > Actually... I take that back.  With the subsequent patches my
> > discomfort with the complexity of implementing the batching grew.
> > 
> > I think I can see a simpler way - although it wasn't as clear as I
> > thought it might be, without some deep history on this feature.
> > 
> > What's going on here is pretty hard to follow, because it starts in
> > arch-specific code (arch/powerpc/platforms/pseries/hotplug-memory.c)
> > where it processes the add/remove requests, then goes into generic
> > code __add_memory() which eventually emerges back in arch specific
> > code (hash__create_section_mapping()).
> > 
> > The HPT resizing calls are in the "inner" arch specific section,
> > whereas it's only the outer arch section that has the information to
> > batch properly.  The mutex and 'shrinking' parameter in Leonardo's
> > code are all about conveying information from the outer to inner
> > section.
> > 
> > Now, I think the reason I had the resize calls in the inner section
> > was to accommodate the notion that a) pHyp might support resizing in
> > future, and it could come in through a different path with its drmgr
> > thingy and/or b) bare metal hash architectures might want to
> > implement
> > hash resizing, and this would make at least part of the path common.
> > 
> > Given the decreasing relevance of hash MMUs, I think we can now
> > safely
> > say neither of these is ever going to happen.
> > 
> > Therefore, we can simplify things by moving the HPT resize calls into
> > the pseries LMB code, instead of create/remove_section_mapping.  Then
> > to do batching without extra complications we just need this logic
> > for
> > all resizes (both add and remove):
> > 
> > let new_hpt_order = expected HPT size for new mem size;
> > 
> > if (new_hpt_order > current_hpt_order)
> > resize to new_hpt_order
> > 
> > add/remove memory
> > 
> > if (new_hpt_order < current_hpt_order - 1)
> > resize to new_hpt_order
> > 
> > 
> 
> 
> Ok, that really does seem to simplify a lot the batching.
> 
> Question:
> by LMB code, you mean dlpar_memory_{add,remove}_by_* ?
> (dealing only with dlpar_{add,remove}_lmb() would not be enough to deal
> with batching)

I was thinking of a two stage process.  First moving the resizes to
dlpar_{add,remote}_lmb() (not changing behaviour for the pseries dlpar
path), then implementing the batching by moving to the {add,remove}_by
functions.

But..

> In my 3/3 reply I sent you some other examples of functions that
> currently end up calling resize_hpt_for_hotplug() without comming from 
> hotplug-memory.c. Is that ok that they do not call it anymore?

..as I replied there, I was wrong about it being safe to move the
resizes all to the pseries dlpar code.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-06-09 Thread David Gibson
On Wed, Jun 09, 2021 at 02:30:36AM -0300, Leonardo Brás wrote:
> On Mon, 2021-06-07 at 15:20 +1000, David Gibson wrote:
> > On Fri, Apr 30, 2021 at 11:36:10AM -0300, Leonardo Bras wrote:
> > > During memory hotunplug, after each LMB is removed, the HPT may be
> > > resized-down if it would map a max of 4 times the current amount of
> > > memory.
> > > (2 shifts, due to introduced hysteresis)
> > > 
> > > It usually is not an issue, but it can take a lot of time if HPT
> > > resizing-down fails. This happens because resize-down failures
> > > usually repeat at each LMB removal, until there are no more bolted
> > > entry conflicts, which can take a while to happen.
> > > 
> > > This can be solved by doing a single HPT resize at the end of
> > > memory
> > > hotunplug, after all requested entries are removed.
> > > 
> > > To make this happen, it's necessary to temporarily disable all HPT
> > > resize-downs before hotunplug, re-enable them after hotunplug ends,
> > > and then resize-down HPT to the current memory size.
> > > 
> > > As an example, hotunplugging 256GB from a 385GB guest took 621s
> > > without
> > > this patch, and 100s after applied.
> > > 
> > > Signed-off-by: Leonardo Bras 
> > 
> > Hrm.  This looks correct, but it seems overly complicated.
> > 
> > AFAICT, the resize calls that this adds should in practice be the
> > *only* times we call resize, all the calls from the lower level code
> > should be suppressed. 
> 
> That's correct.
> 
> >  In which case can't we just remove those calls
> > entirely, and not deal with the clunky locking and exclusion here.
> > That should also remove the need for the 'shrinking' parameter in
> > 1/3.
> 
> 
> If I get your suggestion correctly, you suggest something like:
> 1 - Never calling resize_hpt_for_hotplug() in
> hash__remove_section_mapping(), thus not needing the srinking
> parameter.
> 2 - Functions in hotplug-memory.c that call dlpar_remove_lmb() would in
> fact call another function to do the batch resize_hpt_for_hotplug() for
> them

Basically, yes.

> If so, that assumes that no other function that currently calls
> resize_hpt_for_hotplug() under another path, or if they do, it does not
> need to actually resize the HPT.
> 
> Is the above correct?
> 
> There are some examples of functions that currently call
> resize_hpt_for_hotplug() by another path:
> 
> add_memory_driver_managed
>   virtio_mem_add_memory
>   dev_dax_kmem_probe

Oh... virtio-mem.  I didn't think of that.


> reserve_additional_memory
>   balloon_process
>   add_ballooned_pages

AFAICT this comes from drivers/xen, and Xen has never been a thing on
POWER.

> __add_memory
>   probe_store

So this is a sysfs triggered memory add.  If the user is doing this
manually, then I think it's reasonable for them to manually manage the
HPT size as well, which they can do through debugfs.  I think it might
also be used by drmgr under pHyp, but pHyp doesn't support HPT
resizing.
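
(For reference, the knob I mean is the hpt_order file under the powerpc
debugfs directory - from memory that's /sys/kernel/debug/powerpc/hpt_order -
writing a new order there forces an HPT resize.)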

> __remove_memory
>   pseries_remove_memblock

Huh, this one comes through OF_RECONFIG_DETACH_NODE.  I don't really
know when those happen, but I strongly suspect it's only under pHyp
again.

> remove_memory
>   dev_dax_kmem_remove
>   virtio_mem_remove_memory

virtio-mem again.

> memunmap_pages
>   pci_p2pdma_add_resource
>   virtio_fs_setup_dax

And virtio-fs in dax mode.  Didn't think of that either.


Ugh, yeah, I'm used to the world where the platform provides the only
way of hotplugging memory, but virtio-mem does indeed provide another
one, and we could indeed need to manage the HPT size based on that.
Drat, so moving all the HPT resizing handling up into
pseries/hotplug-memory.c won't work.

I still think we can simplify the communication between the stuff in
the pseries hotplug code and the actual hash resizing.  In your draft
there are kind of 3 ways the information is conveyed: the mutex
suppresses HPT shrinks, pre-growing past what we need prevents HPT
grows, and the 'shrinking' flag handles some edge cases.

I suggest instead a single flag that will suppress all the current
resizes.  Not sure it technically has to be an atomic mutex, but
that's probably the obvious safe choice.  Then have a "resize up to
target" and "resize down to target" that ignore that suppression and
are no-ops if the target is in the other direction.
Then you should be able to make the path for pseries hotplugs be:

suppress other resizes

resize up to target

do the actual adds or removes

resize down to target

unsuppress other resizes
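
To make that concrete, a rough and entirely untested sketch - every name
below is invented for the sake of discussion, none of it is an existing
interface:

	static DEFINE_MUTEX(hpt_resize_suppress_lock);

	void hash_batch_resize_begin(void)	/* suppress the automatic resizes */
	{
		mutex_lock(&hpt_resize_suppress_lock);
	}

	void hash_batch_resize_end(void)
	{
		mutex_unlock(&hpt_resize_suppress_lock);
	}

	/* Each of these is a no-op if the target is in the "wrong" direction. */
	int hash_resize_up_to(unsigned long new_mem_size);
	int hash_resize_down_to(unsigned long new_mem_size);

	/* ...and the pseries hotplug path would then be bracketed like so: */
	static int dlpar_memory_change(u32 lmbs, bool add)
	{
		unsigned long target = memblock_phys_mem_size() +
				       (add ? lmbs * drmem_lmb_size() : 0);

		hash_batch_resize_begin();
		hash_resize_up_to(target);

		/* existing for_each_drmem_lmb() add or remove loop goes here */

		hash_resize_down_to(memblock_phys_mem_size());
		hash_batch_resize_end();
		return 0;
	}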


-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-06-08 Thread David Gibson
On Tue, Jun 08, 2021 at 09:52:10PM -0300, Leonardo Brás wrote:
> On Mon, 2021-06-07 at 15:02 +1000, David Gibson wrote:
> > On Fri, Apr 30, 2021 at 11:36:06AM -0300, Leonardo Bras wrote:
> > > Because hypervisors may need to create HPTs without knowing the
> > > guest
> > > page size, the smallest used page-size (4k) may be chosen,
> > > resulting in
> > > a HPT that is possibly bigger than needed.
> > > 
> > > On a guest with bigger page-sizes, the amount of entries for HPT
> > > may be
> > > too high, causing the guest to ask for a HPT resize-down on the
> > > first
> > > hotplug.
> > > 
> > > This becomes a problem when HPT resize-down fails, and causes the
> > > HPT resize to be performed on every LMB added, until HPT size is
> > > compatible to guest memory size, causing a major slowdown.
> > > 
> > > So, avoiding HPT resizing-down on hot-add significantly improves
> > > memory
> > > hotplug times.
> > > 
> > > As an example, hotplugging 256GB on a 129GB guest took 710s without
> > > this
> > > patch, and 21s after applied.
> > > 
> > > Signed-off-by: Leonardo Bras 
> > 
> > Sorry it's taken me so long to look at these
> > 
> > I don't love the extra statefulness that the 'shrinking' parameter
> > adds, but I can't see an elegant way to avoid it, so:
> > 
> > Reviewed-by: David Gibson 
> 
> np, thanks for reviewing!

Actually... I take that back.  With the subsequent patches my
discomfort with the complexity of implementing the batching grew.

I think I can see a simpler way - although it wasn't as clear as I
thought it might be, without some deep history on this feature.

What's going on here is pretty hard to follow, because it starts in
arch-specific code (arch/powerpc/platforms/pseries/hotplug-memory.c)
where it processes the add/remove requests, then goes into generic
code __add_memory() which eventually emerges back in arch specific
code (hash__create_section_mapping()).

The HPT resizing calls are in the "inner" arch specific section,
whereas it's only the outer arch section that has the information to
batch properly.  The mutex and 'shrinking' parameter in Leonardo's
code are all about conveying information from the outer to inner
section.

Now, I think the reason I had the resize calls in the inner section
was to accommodate the notion that a) pHyp might support resizing in
future, and it could come in through a different path with its drmgr
thingy and/or b) bare metal hash architectures might want to implement
hash resizing, and this would make at least part of the path common.

Given the decreasing relevance of hash MMUs, I think we can now safely
say neither of these is ever going to happen.

Therefore, we can simplify things by moving the HPT resize calls into
the pseries LMB code, instead of create/remove_section_mapping.  Then
to do batching without extra complications we just need this logic for
all resizes (both add and remove):

let new_hpt_order = expected HPT size for new mem size;

if (new_hpt_order > current_hpt_order)
resize to new_hpt_order

add/remove memory

if (new_hpt_order < current_hpt_order - 1)
resize to new_hpt_order
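
In code that would be something along these lines (purely a sketch - helper
visibility is hand-waved, e.g. htab_shift_for_mem_size() currently lives in
hash_utils.c, and the function names here are made up):

	static void pseries_hpt_resize_pre(unsigned long new_mem_size)
	{
		unsigned int new_order = htab_shift_for_mem_size(new_mem_size);

		if (!mmu_hash_ops.resize_hpt)
			return;

		/* grow eagerly, before the memory is actually added */
		if (new_order > ppc64_pft_size)
			mmu_hash_ops.resize_hpt(new_order);
	}

	static void pseries_hpt_resize_post(unsigned long new_mem_size)
	{
		unsigned int new_order = htab_shift_for_mem_size(new_mem_size);

		if (!mmu_hash_ops.resize_hpt)
			return;

		/* shrink lazily, with one order of hysteresis, after the change */
		if (new_order < ppc64_pft_size - 1)
			mmu_hash_ops.resize_hpt(new_order);
	}

	/* dlpar_add_lmb()/dlpar_remove_lmb() would bracket the actual change with
	 * pseries_hpt_resize_pre(new_size) / pseries_hpt_resize_post(new_size). */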


-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-06-06 Thread David Gibson
  /* Do not try to resize to the starting size, or bigger value */
> + if (htab_shift_for_mem_size(newsize) >= starting_size)
> + break;
> + }
> +
> + /* Re-enables HPT resize-down after hot-unplug */
> + mutex_unlock(&hpt_resize_down_lock);
> +}
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  static void __init hash_init_partition_table(phys_addr_t hash_table,
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 48b2cfe4ce69..44bc50d72353 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -426,6 +426,9 @@ static int dlpar_memory_remove_by_count(u32 
> lmbs_to_remove)
>   return -EINVAL;
>   }
>  
> + if (!radix_enabled())
> + hash_batch_shrink_begin();
> +
>   for_each_drmem_lmb(lmb) {
>   rc = dlpar_remove_lmb(lmb);
>   if (rc)
> @@ -471,6 +474,9 @@ static int dlpar_memory_remove_by_count(u32 
> lmbs_to_remove)
>   rc = 0;
>   }
>  
> + if (!radix_enabled())
> + hash_batch_shrink_end();
> +
>   return rc;
>  }
>  
> @@ -533,6 +539,9 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
> u32 drc_index)
>   if (lmbs_available < lmbs_to_remove)
>   return -EINVAL;
>  
> + if (!radix_enabled())
> + hash_batch_shrink_begin();
> +
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
>   if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
>   continue;
> @@ -573,6 +582,9 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
> u32 drc_index)
>   }
>   }
>  
> + if (!radix_enabled())
> + hash_batch_shrink_end();
> +
>   return rc;
>  }
>  
> @@ -703,6 +715,9 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
>   if (lmbs_added != lmbs_to_add) {
>   pr_err("Memory hot-add failed, removing any added LMBs\n");
>  
> + if (!radix_enabled())
> + hash_batch_shrink_begin();
> +
>   for_each_drmem_lmb(lmb) {
>   if (!drmem_lmb_reserved(lmb))
>   continue;
> @@ -716,6 +731,10 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
>  
>   drmem_remove_lmb_reservation(lmb);
>   }
> +
> + if (!radix_enabled())
> + hash_batch_shrink_end();
> +
>   rc = -EINVAL;
>   } else {
>   for_each_drmem_lmb(lmb) {
> @@ -817,6 +836,9 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
> drc_index)
>   if (rc) {
>   pr_err("Memory indexed-count-add failed, removing any added 
> LMBs\n");
>  
> +     if (!radix_enabled())
> + hash_batch_shrink_begin();
> +
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
>   if (!drmem_lmb_reserved(lmb))
>   continue;
> @@ -830,6 +852,10 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
> drc_index)
>  
>   drmem_remove_lmb_reservation(lmb);
>   }
> +
> + if (!radix_enabled())
> + hash_batch_shrink_end();
> +
>   rc = -EINVAL;
>   } else {
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 2/3] powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug

2021-06-06 Thread David Gibson
On Fri, Apr 30, 2021 at 11:36:08AM -0300, Leonardo Bras wrote:
> Every time a memory hotplug happens, and the memory limit crosses a 2^n
> value, it may be necessary to perform HPT resizing-up, which can take
> some time (over 100ms in my tests).
> 
> It usually is not an issue, but it can take some time if a lot of memory
> is added to a guest with little starting memory:
> Adding 256G to a 2GB guest, for example will require 8 HPT resizes.
> 
> Perform an HPT resize before memory hotplug, updating HPT to its
> final size (considering a successful hotplug), taking the number of
> HPT resizes to at most one per memory hotplug action.
> 
> Signed-off-by: Leonardo Bras 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/include/asm/book3s/64/hash.h |  2 ++
>  arch/powerpc/mm/book3s64/hash_utils.c | 20 +++
>  .../platforms/pseries/hotplug-memory.c|  9 +
>  3 files changed, 31 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
> b/arch/powerpc/include/asm/book3s/64/hash.h
> index d959b0195ad9..fad4af8b8543 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -255,6 +255,8 @@ int hash__create_section_mapping(unsigned long start, 
> unsigned long end,
>int nid, pgprot_t prot);
>  int hash__remove_section_mapping(unsigned long start, unsigned long end);
>  
> +void hash_batch_expand_prepare(unsigned long newsize);
> +
>  #endif /* !__ASSEMBLY__ */
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
> b/arch/powerpc/mm/book3s64/hash_utils.c
> index 608e4ed397a9..3fa395b3fe57 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -859,6 +859,26 @@ int hash__remove_section_mapping(unsigned long start, 
> unsigned long end)
>  
>   return rc;
>  }
> +
> +void hash_batch_expand_prepare(unsigned long newsize)
> +{
> + const u64 starting_size = ppc64_pft_size;
> +
> + /*
> +  * Resizing-up HPT should never fail, but there are some cases system 
> starts with higher
> +  * SHIFT than required, and we go through the funny case of resizing 
> HPT down while
> +  * adding memory
> +  */
> +
> + while (resize_hpt_for_hotplug(newsize, false) == -ENOSPC) {
> + newsize *= 2;
> + pr_warn("Hash collision while resizing HPT\n");
> +
> + /* Do not try to resize to the starting size, or bigger value */
> + if (htab_shift_for_mem_size(newsize) >= starting_size)
> + break;
> + }
> +}
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  static void __init hash_init_partition_table(phys_addr_t hash_table,
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 8377f1f7c78e..48b2cfe4ce69 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -671,6 +672,10 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
>   if (lmbs_available < lmbs_to_add)
>   return -EINVAL;
>  
> + if (!radix_enabled())
> + hash_batch_expand_prepare(memblock_phys_mem_size() +
> +  lmbs_to_add * 
> drmem_lmb_size());
> +
>   for_each_drmem_lmb(lmb) {
>   if (lmb->flags & DRCONF_MEM_ASSIGNED)
>   continue;
> @@ -788,6 +793,10 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
> drc_index)
>   if (lmbs_available < lmbs_to_add)
>   return -EINVAL;
>  
> + if (!radix_enabled())
> + hash_batch_expand_prepare(memblock_phys_mem_size() +
> +   lmbs_to_add * drmem_lmb_size());
> +
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
>   if (lmb->flags & DRCONF_MEM_ASSIGNED)
>   continue;

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-06-06 Thread David Gibson
On Fri, Apr 30, 2021 at 11:36:06AM -0300, Leonardo Bras wrote:
> Because hypervisors may need to create HPTs without knowing the guest
> page size, the smallest used page-size (4k) may be chosen, resulting in
> a HPT that is possibly bigger than needed.
> 
> On a guest with bigger page-sizes, the amount of entries for HPT may be
> too high, causing the guest to ask for a HPT resize-down on the first
> hotplug.
> 
> This becomes a problem when HPT resize-down fails, and causes the
> HPT resize to be performed on every LMB added, until HPT size is
> compatible to guest memory size, causing a major slowdown.
> 
> So, avoiding HPT resizing-down on hot-add significantly improves memory
> hotplug times.
> 
> As an example, hotplugging 256GB on a 129GB guest took 710s without this
> patch, and 21s after applied.
> 
> Signed-off-by: Leonardo Bras 

Sorry it's taken me so long to look at these

I don't love the extra statefulness that the 'shrinking' parameter
adds, but I can't see an elegant way to avoid it, so:

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/mm/book3s64/hash_utils.c | 36 ---
>  1 file changed, 21 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
> b/arch/powerpc/mm/book3s64/hash_utils.c
> index 581b20a2feaf..608e4ed397a9 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -795,7 +795,7 @@ static unsigned long __init htab_get_table_size(void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -static int resize_hpt_for_hotplug(unsigned long new_mem_size)
> +static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
>  {
>   unsigned target_hpt_shift;
>  
> @@ -804,19 +804,25 @@ static int resize_hpt_for_hotplug(unsigned long 
> new_mem_size)
>  
>   target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
>  
> - /*
> -  * To avoid lots of HPT resizes if memory size is fluctuating
> -  * across a boundary, we deliberately have some hysterisis
> -  * here: we immediately increase the HPT size if the target
> -  * shift exceeds the current shift, but we won't attempt to
> -  * reduce unless the target shift is at least 2 below the
> -  * current shift
> -  */
> - if (target_hpt_shift > ppc64_pft_size ||
> - target_hpt_shift < ppc64_pft_size - 1)
> - return mmu_hash_ops.resize_hpt(target_hpt_shift);
> + if (shrinking) {
>  
> - return 0;
> + /*
> +  * To avoid lots of HPT resizes if memory size is fluctuating
> +  * across a boundary, we deliberately have some hysterisis
> +  * here: we immediately increase the HPT size if the target
> +  * shift exceeds the current shift, but we won't attempt to
> +  * reduce unless the target shift is at least 2 below the
> +  * current shift
> +  */
> +
> + if (target_hpt_shift >= ppc64_pft_size - 1)
> + return 0;
> +
> + } else if (target_hpt_shift <= ppc64_pft_size) {
> + return 0;
> + }
> +
> + return mmu_hash_ops.resize_hpt(target_hpt_shift);
>  }
>  
>  int hash__create_section_mapping(unsigned long start, unsigned long end,
> @@ -829,7 +835,7 @@ int hash__create_section_mapping(unsigned long start, 
> unsigned long end,
>   return -1;
>   }
>  
> - resize_hpt_for_hotplug(memblock_phys_mem_size());
> + resize_hpt_for_hotplug(memblock_phys_mem_size(), false);
>  
>   rc = htab_bolt_mapping(start, end, __pa(start),
>  pgprot_val(prot), mmu_linear_psize,
> @@ -848,7 +854,7 @@ int hash__remove_section_mapping(unsigned long start, 
> unsigned long end)
>   int rc = htab_remove_mapping(start, end, mmu_linear_psize,
>        mmu_kernel_ssize);
>  
> - if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
> + if (resize_hpt_for_hotplug(memblock_phys_mem_size(), true) == -ENOSPC)
>   pr_warn("Hash collision while resizing HPT\n");
>  
>   return rc;

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 3/4] powerpc/pseries: break early in dlpar_memory_remove_by_count() loops

2021-05-12 Thread David Gibson
On Wed, May 12, 2021 at 05:28:08PM -0300, Daniel Henrique Barboza wrote:
> After marking the LMBs as reserved depending on dlpar_remove_lmb() rc,
> we evaluate whether we need to add the LMBs back or if we can release
> the LMB DRCs. In both cases, a for_each_drmem_lmb() loop without a break
> condition is used. This means that we're going to cycle through all LMBs
> of the partition even after we're done with what we were going to do.
> 
> This patch adds break conditions in both loops to avoid this. The
> 'lmbs_removed' variable was renamed to 'lmbs_reserved', and it's now
> being decremented each time a lmb reservation is removed, indicating
> that the operation we're doing (adding back LMBs or releasing DRCs) is
> completed.
> 
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: David Gibson 

The fact that DRCONF_MEM_RESERVED and DRMEM_LMB_RESERVED look so
similar but have totally different meanings doesn't make this easy to
follow :/.
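
To illustrate the distinction (schematic only; this is not code from any
of the patches in this thread): DRCONF_MEM_RESERVED is a device-tree flag
in lmb->flags, set by the firmware/hypervisor, while DRMEM_LMB_RESERVED is
transient kernel-side bookkeeping toggled with the
drmem_*_lmb_reservation() helpers while a DLPAR request is in flight.

    /* Hypothetical helper; assumes the declarations in asm/drmem.h. */
    static bool lmb_usable_for_dlpar(struct drmem_lmb *lmb)
    {
            /* Firmware-owned property: the OS must not touch this LMB. */
            if (lmb->flags & DRCONF_MEM_RESERVED)
                    return false;

            /* Kernel-internal marker: the LMB is already claimed by an
             * in-flight request (set via drmem_mark_lmb_reserved()). */
            return !drmem_lmb_reserved(lmb);
    }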

> ---
>  arch/powerpc/platforms/pseries/hotplug-memory.c | 16 
>  1 file changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index c21d9278c1ce..3c7ce5361ce3 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -402,7 +402,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
>  static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)
>  {
>   struct drmem_lmb *lmb;
> - int lmbs_removed = 0;
> + int lmbs_reserved = 0;
>   int lmbs_available = 0;
>   int rc;
>  
> @@ -436,12 +436,12 @@ static int dlpar_memory_remove_by_count(u32 
> lmbs_to_remove)
>*/
>   drmem_mark_lmb_reserved(lmb);
>  
> - lmbs_removed++;
> - if (lmbs_removed == lmbs_to_remove)
> + lmbs_reserved++;
> + if (lmbs_reserved == lmbs_to_remove)
>   break;
>   }
>  
> - if (lmbs_removed != lmbs_to_remove) {
> + if (lmbs_reserved != lmbs_to_remove) {
>   pr_err("Memory hot-remove failed, adding LMB's back\n");
>  
>   for_each_drmem_lmb(lmb) {
> @@ -454,6 +454,10 @@ static int dlpar_memory_remove_by_count(u32 
> lmbs_to_remove)
>  lmb->drc_index);
>  
>   drmem_remove_lmb_reservation(lmb);
> +
> + lmbs_reserved--;
> + if (lmbs_reserved == 0)
> + break;
>   }
>  
>   rc = -EINVAL;
> @@ -467,6 +471,10 @@ static int dlpar_memory_remove_by_count(u32 
> lmbs_to_remove)
>   lmb->base_addr);
>  
>   drmem_remove_lmb_reservation(lmb);
> +
> + lmbs_reserved--;
> + if (lmbs_reserved == 0)
> + break;
>   }
>   rc = 0;
>   }

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 4/4] powerpc/pseries: minor enhancements in dlpar_memory_remove_by_ic()

2021-05-12 Thread David Gibson
On Wed, May 12, 2021 at 05:28:09PM -0300, Daniel Henrique Barboza wrote:
> We don't need the 'lmbs_available' variable to count the valid LMBs and
> to check if we have less than 'lmbs_to_remove'. We must ensure that the
> entire LMB range must be removed, so we can error out immediately if any
> LMB in the range is marked as reserved.
> 
> Add a couple of comments explaining the reasoning behind the differences
> we have in this function in contrast to what it is done in its sister
> function, dlpar_memory_remove_by_count().
> 
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: David Gibson 

> ---
>  .../platforms/pseries/hotplug-memory.c| 28 +--
>  1 file changed, 19 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 3c7ce5361ce3..ee88c1540fba 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -517,7 +517,6 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
>  static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
>  {
>   struct drmem_lmb *lmb, *start_lmb, *end_lmb;
> - int lmbs_available = 0;
>   int rc;
>  
>   pr_info("Attempting to hot-remove %u LMB(s) at %x\n",
> @@ -530,18 +529,29 @@ static int dlpar_memory_remove_by_ic(u32 
> lmbs_to_remove, u32 drc_index)
>   if (rc)
>   return -EINVAL;
>  
> - /* Validate that there are enough LMBs to satisfy the request */
> + /*
> +  * Validate that all LMBs in range are not reserved. Note that it
> +  * is ok if they are !ASSIGNED since our goal here is to remove the
> +  * LMB range, regardless of whether some LMBs were already removed
> +  * by any other reason.
> +  *
> +  * This is a contrast to what is done in remove_by_count() where we
> +  * check for both RESERVED and !ASSIGNED (via lmb_is_removable()),
> +  * because we want to remove a fixed amount of LMBs in that function.
> +  */
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
> - if (lmb->flags & DRCONF_MEM_RESERVED)
> - break;
> -
> - lmbs_available++;
> + if (lmb->flags & DRCONF_MEM_RESERVED) {
> + pr_err("Memory at %llx (drc index %x) is reserved\n",
> + lmb->base_addr, lmb->drc_index);
> + return -EINVAL;
> + }
>   }
>  
> - if (lmbs_available < lmbs_to_remove)
> - return -EINVAL;
> -
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
> + /*
> +  * dlpar_remove_lmb() will error out if the LMB is already
> +  * !ASSIGNED, but this case is a no-op for us.
> +  */
>   if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
>   continue;
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 2/4] powerpc/pseries: check DRCONF_MEM_RESERVED in lmb_is_removable()

2021-05-12 Thread David Gibson
On Wed, May 12, 2021 at 05:28:07PM -0300, Daniel Henrique Barboza wrote:
> DRCONF_MEM_RESERVED is a flag that represents the "Reserved Memory"
> status in LOPAR v2.10, section 4.2.8. If a LMB is marked as reserved,
> quoting LOPAR, "is not to be used or altered by the base OS". This flag
> is read only in the kernel, being set by the firmware/hypervisor in the
> DT. As an example, QEMU will set this flag in hw/ppc/spapr.c,
> spapr_dt_dynamic_memory().
> 
> lmb_is_removable() does not check for DRCONF_MEM_RESERVED. This function
> is used in dlpar_remove_lmb() as a guard before the removal logic. Since
> it is failing to check for !RESERVED, dlpar_remove_lmb() will fail in a
> later stage instead of failing in the validation when receiving a
> reserved LMB as input.
> 
> lmb_is_removable() is also used in dlpar_memory_remove_by_count() to
> evaluate if we have enough LMBs to complete the request. The missing
> !RESERVED check in this case is causing dlpar_memory_remove_by_count()
> to miscalculate the number of eligible LMBs for the removal, and can
> make it error out later on instead of failing in the validation with the
> 'not enough LMBs to satisfy request' message.
> 
> Making a DRCONF_MEM_RESERVED check in lmb_is_removable() fixes all these
> issues.
> 
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/platforms/pseries/hotplug-memory.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index bb98574a84a2..c21d9278c1ce 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -348,7 +348,8 @@ static int pseries_remove_mem_node(struct device_node *np)
>  
>  static bool lmb_is_removable(struct drmem_lmb *lmb)
>  {
> - if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
> + if ((lmb->flags & DRCONF_MEM_RESERVED) ||
> + !(lmb->flags & DRCONF_MEM_ASSIGNED))
>   return false;
>  
>  #ifdef CONFIG_FA_DUMP

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 2/3] hotplug-memory.c: enhance dlpar_memory_remove* LMB checks

2021-05-12 Thread David Gibson
On Wed, May 12, 2021 at 05:35:39PM -0300, Daniel Henrique Barboza wrote:
> 
> On 5/3/21 10:02 PM, David Gibson wrote:
> > On Fri, Apr 30, 2021 at 09:09:16AM -0300, Daniel Henrique Barboza wrote:
> > > dlpar_memory_remove_by_ic() validates the amount of LMBs to be removed
> > > by checking !DRCONF_MEM_RESERVED, and in the following loop before
> > > dlpar_remove_lmb() a check for DRCONF_MEM_ASSIGNED is made before
> > > removing it. This means that a LMB that is both !DRCONF_MEM_RESERVED and
> > > !DRCONF_MEM_ASSIGNED will be counted as valid, but then not being
> > > removed.  The function will end up not removing all 'lmbs_to_remove'
> > > LMBs while also not reporting any errors.
> > > 
> > > Comparing it to dlpar_memory_remove_by_count(), the validation is done
> > > via lmb_is_removable(), which checks for DRCONF_MEM_ASSIGNED and fadump
> > > constraints. No additional check is made afterwards, and
> > > DRCONF_MEM_RESERVED is never checked before dlpar_remove_lmb(). The
> > > function doesn't have the same 'check A for validation, then B for
> > > removal' issue as remove_by_ic(), but it's not checking if the LMB is
> > > reserved.
> > > 
> > > There is no reason for these functions to validate the same operation in
> > > two different manners.
> > 
> > Actually, I think there is: remove_by_ic() is handling a request to
> > remove a specific range of LMBs.  If any are reserved, they can't be
> > removed and so this needs to fail.  But if they are !ASSIGNED, that
> > essentially means they're *already* removed (or never added), so
> > "removing" them is, correctly, a no-op.
> > 
> > remove_by_count(), in contrast, is being asked to remove a fixed
> > number of LMBs from wherever they can be found, and for that it needs
> > to find LMBs that haven't already been removed.
> > 
> > Basically remove_by_ic() is an absolute request: "make this set of
> > LMBs be not-plugged", whereas remove_by_count() is a relative request
> > "make N less LMBs be plugged".
> > 
> > 
> > So I think remove_by_ic()s existing handling is correct.  I'm less
> > sure if remove_by_count() ignoring RESERVED is correct - I couldn't
> > quickly find under what circumstances RESERVED gets set.
> 
> RESERVED is never set by the kernel. It is written in the DT by the
> firmware/hypervisor and the kernel just checks its value. QEMU sets it in
> spapr_dt_dynamic_memory() with the following comment:
> 
> 
> /*
>  * LMB information for RMA, boot time RAM and gap b/n RAM and
>  * device memory region -- all these are marked as reserved
>  * and as having no valid DRC.
>  */
> dynamic_memory[0] = cpu_to_be32(addr >> 32);
> dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
> dynamic_memory[2] = cpu_to_be32(0);
> dynamic_memory[3] = cpu_to_be32(0); /* reserved */
> dynamic_memory[4] = cpu_to_be32(-1);
> dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
> SPAPR_LMB_FLAGS_DRC_INVALID);
> 
> 
> The flag is formally described in LOPAR section 4.2.8, "Reserved Memory":
> 
> "Memory nodes marked with the special value of the “status” property of
> “reserved” is not to be used or altered by the base OS."
> 
> 
> This makes me confident that we should check DRCONF_MEM_RESERVED in
> remove_by_count() as well, since phyp needs do adhere to these semantics and
> shouldn't be able to remove a LMB marked as RESERVED.

Right.  I doubt it would have caused a problem in practice, because
I'm pretty sure we should never get an LMB which is RESERVED &&
ASSIGNED, but it's probably safer to make it explicit.


-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [PATCH 2/3] hotplug-memory.c: enhance dlpar_memory_remove* LMB checks

2021-05-09 Thread David Gibson
On Fri, May 07, 2021 at 01:36:06PM -0300, Daniel Henrique Barboza wrote:
> 
> 
> On 5/3/21 10:02 PM, David Gibson wrote:
> > On Fri, Apr 30, 2021 at 09:09:16AM -0300, Daniel Henrique Barboza wrote:
> > > dlpar_memory_remove_by_ic() validates the amount of LMBs to be removed
> > > by checking !DRCONF_MEM_RESERVED, and in the following loop before
> > > dlpar_remove_lmb() a check for DRCONF_MEM_ASSIGNED is made before
> > > removing it. This means that a LMB that is both !DRCONF_MEM_RESERVED and
> > > !DRCONF_MEM_ASSIGNED will be counted as valid, but then not being
> > > removed.  The function will end up not removing all 'lmbs_to_remove'
> > > LMBs while also not reporting any errors.
> > > 
> > > Comparing it to dlpar_memory_remove_by_count(), the validation is done
> > > via lmb_is_removable(), which checks for DRCONF_MEM_ASSIGNED and fadump
> > > constraints. No additional check is made afterwards, and
> > > DRCONF_MEM_RESERVED is never checked before dlpar_remove_lmb(). The
> > > function doesn't have the same 'check A for validation, then B for
> > > removal' issue as remove_by_ic(), but it's not checking if the LMB is
> > > reserved.
> > > 
> > > There is no reason for these functions to validate the same operation in
> > > two different manners.
> > 
> > Actually, I think there is: remove_by_ic() is handling a request to
> > remove a specific range of LMBs.  If any are reserved, they can't be
> > removed and so this needs to fail.  But if they are !ASSIGNED, that
> > essentially means they're *already* removed (or never added), so
> > "removing" them is, correctly, a no-op.
> 
> I guess that makes sense. Although I am not aware of any situation, at least
> thinking about how QEMU adds/removes LMBs, where some LMBs would be removed
> 'ad-hoc' in the middle of a LMB range that maps to a QEMU DIMM, I can't say
> that this wouldn't never happen either.

Right.  I believe a user could explicitly offline LMBs in the middle
of a DIMM. There's not much reason to do so, but it's possible.  There
might also be situations involving memory errors where individual LMBs
could get offlined.

> It is sensible to make remove_by_ic()
> resilient to this situation.
> 
> I'll re-send this patch just with the remove_by_count() change.
> 
> 
> Thanks,
> 
> 
> Daniel
> 
> > 
> > remove_by_count(), in contrast, is being asked to remove a fixed
> > number of LMBs from wherever they can be found, and for that it needs
> > to find LMBs that haven't already been removed.
> > 
> > Basically remove_by_ic() is an absolute request: "make this set of
> > LMBs be not-plugged", whereas remove_by_count() is a relative request
> > "make N less LMBs be plugged".
> > 
> > 
> > So I think remove_by_ic()s existing handling is correct.  I'm less
> > sure if remove_by_count() ignoring RESERVED is correct - I couldn't
> > quickly find under what circumstances RESERVED gets set.
> > 
> > 
> > > This patch addresses that by changing
> > > lmb_is_removable() to also check for DRCONF_MEM_RESERVED to tell if a
> > > lmb is removable, making dlpar_memory_remove_by_count() take the
> > > reservation state into account when counting the LMBs.
> > > lmb_is_removable() is then used in the validation step of
> > > dlpar_memory_remove_by_ic(), which is already checking for both states
> > > but in different stages, to avoid counting a LMB that is not assigned as
> > > eligible for removal. We can then skip the check before
> > > dlpar_remove_lmb() since we're validating all LMBs beforehand.
> > > 
> > > Signed-off-by: Daniel Henrique Barboza 
> > > ---
> > >   arch/powerpc/platforms/pseries/hotplug-memory.c | 8 +++-
> > >   1 file changed, 3 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> > > b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > > index bb98574a84a2..4e6d162c3f1a 100644
> > > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> > > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > > @@ -348,7 +348,8 @@ static int pseries_remove_mem_node(struct device_node 
> > > *np)
> > >   static bool lmb_is_removable(struct drmem_lmb *lmb)
> > >   {
> > > - if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
> > > + if ((lmb->flags & DRCONF_MEM_RESERVED) ||
> > > + !(lmb->flags & DRCONF_MEM_ASSIGNED))
> > >   return false;

Re: [PATCH 2/3] hotplug-memory.c: enhance dlpar_memory_remove* LMB checks

2021-05-03 Thread David Gibson
On Fri, Apr 30, 2021 at 09:09:16AM -0300, Daniel Henrique Barboza wrote:
> dlpar_memory_remove_by_ic() validates the amount of LMBs to be removed
> by checking !DRCONF_MEM_RESERVED, and in the following loop before
> dlpar_remove_lmb() a check for DRCONF_MEM_ASSIGNED is made before
> removing it. This means that a LMB that is both !DRCONF_MEM_RESERVED and
> !DRCONF_MEM_ASSIGNED will be counted as valid, but then not being
> removed.  The function will end up not removing all 'lmbs_to_remove'
> LMBs while also not reporting any errors.
> 
> Comparing it to dlpar_memory_remove_by_count(), the validation is done
> via lmb_is_removable(), which checks for DRCONF_MEM_ASSIGNED and fadump
> constraints. No additional check is made afterwards, and
> DRCONF_MEM_RESERVED is never checked before dlpar_remove_lmb(). The
> function doesn't have the same 'check A for validation, then B for
> removal' issue as remove_by_ic(), but it's not checking if the LMB is
> reserved.
> 
> There is no reason for these functions to validate the same operation in
> two different manners.

Actually, I think there is: remove_by_ic() is handling a request to
remove a specific range of LMBs.  If any are reserved, they can't be
removed and so this needs to fail.  But if they are !ASSIGNED, that
essentially means they're *already* removed (or never added), so
"removing" them is, correctly, a no-op.

remove_by_count(), in contrast, is being asked to remove a fixed
number of LMBs from wherever they can be found, and for that it needs
to find LMBs that haven't already been removed.

Basically remove_by_ic() is an absolute request: "make this set of
LMBs be not-plugged", whereas remove_by_count() is a relative request
"make N less LMBs be plugged".


So I think remove_by_ic()s existing handling is correct.  I'm less
sure if remove_by_count() ignoring RESERVED is correct - I couldn't
quickly find under what circumstances RESERVED gets set.
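
Put schematically (a sketch of the two semantics above, not the actual
kernel code):

    /* Absolute request (remove_by_ic): unplug exactly this DRC range.
     * RESERVED is a hard error; !ASSIGNED means "already unplugged",
     * which is a successful no-op. */
    for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
            if (lmb->flags & DRCONF_MEM_RESERVED)
                    return -EINVAL;
            if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
                    continue;
            rc = dlpar_remove_lmb(lmb);
            ...
    }

    /* Relative request (remove_by_count): unplug N LMBs from wherever
     * they can be found, so only currently-plugged, removable LMBs may
     * be counted towards the total. */
    for_each_drmem_lmb(lmb) {
            if (!lmb_is_removable(lmb))
                    continue;
            if (dlpar_remove_lmb(lmb))
                    continue;
            if (++lmbs_removed == lmbs_to_remove)
                    break;
    }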


> This patch addresses that by changing
> lmb_is_removable() to also check for DRCONF_MEM_RESERVED to tell if a
> lmb is removable, making dlpar_memory_remove_by_count() take the
> reservation state into account when counting the LMBs.
> lmb_is_removable() is then used in the validation step of
> dlpar_memory_remove_by_ic(), which is already checking for both states
> but in different stages, to avoid counting a LMB that is not assigned as
> eligible for removal. We can then skip the check before
> dlpar_remove_lmb() since we're validating all LMBs beforehand.
> 
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  arch/powerpc/platforms/pseries/hotplug-memory.c | 8 +++-
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index bb98574a84a2..4e6d162c3f1a 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -348,7 +348,8 @@ static int pseries_remove_mem_node(struct device_node *np)
>  
>  static bool lmb_is_removable(struct drmem_lmb *lmb)
>  {
> - if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
> + if ((lmb->flags & DRCONF_MEM_RESERVED) ||
> + !(lmb->flags & DRCONF_MEM_ASSIGNED))
>   return false;
>  
>  #ifdef CONFIG_FA_DUMP
> @@ -523,7 +524,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
> u32 drc_index)
>  
>   /* Validate that there are enough LMBs to satisfy the request */
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
> - if (lmb->flags & DRCONF_MEM_RESERVED)
> + if (!lmb_is_removable(lmb))
>   break;
>  
>   lmbs_available++;
> @@ -533,9 +534,6 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
> u32 drc_index)
>   return -EINVAL;
>  
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
> - if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
> - continue;
> -
>   rc = dlpar_remove_lmb(lmb);
>   if (rc)
>   break;

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 1/3] powerpc/pseries: Set UNISOLATE on dlpar_memory_remove_by_ic() error

2021-05-03 Thread David Gibson
On Fri, Apr 30, 2021 at 09:09:15AM -0300, Daniel Henrique Barboza wrote:
> As previously done in dlpar_cpu_remove() for CPUs, this patch changes
> dlpar_memory_remove_by_ic() to unisolate the LMB DRC when the LMB is
> failed to be removed. The hypervisor, seeing a LMB DRC that was supposed
> to be removed being unisolated instead, can do error recovery on its
> side.
> 
> This change is done in dlpar_memory_remove_by_ic() only because, as of
> today, only QEMU is using this code path for error recovery (via the
> PSERIES_HP_ELOG_ID_DRC_IC event). phyp treats it as a no-op.
> 
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/platforms/pseries/hotplug-memory.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 8377f1f7c78e..bb98574a84a2 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -551,6 +551,13 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
> u32 drc_index)
>   if (!drmem_lmb_reserved(lmb))
>   continue;
>  
> + /*
> +  * Setting the isolation state of an UNISOLATED/CONFIGURED
> +  * device to UNISOLATE is a no-op, but the hypervisor can
> +  * use it as a hint that the LMB removal failed.
> +  */
> + dlpar_unisolate_drc(lmb->drc_index);
> +
>   rc = dlpar_add_lmb(lmb);
>       if (rc)
>   pr_err("Failed to add LMB, drc index %x\n",

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-04-18 Thread David Gibson
On Fri, Apr 09, 2021 at 12:31:03AM -0300, Leonardo Bras wrote:
> Hello David, thanks for commenting.
> 
> On Tue, 2021-03-23 at 10:45 +1100, David Gibson wrote:
> > > @@ -805,6 +808,10 @@ static int resize_hpt_for_hotplug(unsigned long 
> > > new_mem_size, bool shrinking)
> > >   if (shrinking) {
> > > 
> > > + /* When batch removing entries, only resizes HPT at the end. */
> > > + if (atomic_read_acquire(&hpt_resize_disable))
> > > + return 0;
> > > +
> > 
> > I'm not quite convinced by this locking.  Couldn't hpt_resize_disable
> > be set after this point, but while you're still inside
> > resize_hpt_for_hotplug()?  Probably better to use an explicit mutex
> > (and mutex_trylock()) to make the critical sections clearer.
> 
> Sure, I can do that for v2.
> 
> > Except... do we even need the fancy mechanics to suppress the resizes
> > in one place to do them elswhere.  Couldn't we just replace the
> > existing resize calls with the batched ones?
> 
> How do you think of having batched resizes-down in HPT?

I think it's a good idea.  We still have to have the loop to resize
bigger if we can't fit everything into the smallest target size, but
that still only makes the worst case as bad as the always-case is
currently.

> Other than the current approach, I could only think of a way that would
> touch a lot of generic code, and/or duplicate some functions, as
> dlpar_add_lmb() does a lot of other stuff.
> 
> > > +void hash_memory_batch_shrink_end(void)
> > > +{
> > > + unsigned long newsize;
> > > +
> > > + /* Re-enables HPT resize-down after hot-unplug */
> > > + atomic_set_release(&hpt_resize_disable, 0);
> > > +
> > > + newsize = memblock_phys_mem_size();
> > > + /* Resize to smallest SHIFT possible */
> > > + while (resize_hpt_for_hotplug(newsize, true) == -ENOSPC) {
> > > + newsize *= 2;
> > 
> > As noted earlier, doing this without an explicit cap on the new hpt
> > size (of the existing size) this makes me nervous. 
> > 
> 
> I can add a stop in v2.
> 
> >  Less so, but doing
> > the calculations on memory size, rather than explictly on HPT size /
> > HPT order also seems kinda clunky.
> 
> Agree, but at this point, it would seem kind of a waste to find the
> shift from newsize, then calculate (1 << shift) for each retry of
> resize_hpt_for_hotplug() only to point that we are retrying the order
> value.

Yeah, I see your point.

> 
> But sure, if you think it looks better, I can change that. 
> 
> > > +void memory_batch_shrink_begin(void)
> > > +{
> > > + if (!radix_enabled())
> > > + hash_memory_batch_shrink_begin();
> > > +}
> > > +
> > > +void memory_batch_shrink_end(void)
> > > +{
> > > + if (!radix_enabled())
> > > + hash_memory_batch_shrink_end();
> > > +}
> > 
> > Again, these wrappers don't seem particularly useful to me.
> 
> Options would be add 'if (!radix_enabled())' to hotplug-memory.c
> functions or to hash* functions, which look kind of wrong.

I think the if !radix_enabled in hotplug-memory.c isn't too bad, in
fact possibly helpful as a hint that this is HPT only logic.

> 
> > > + memory_batch_shrink_end();
> > 
> > remove_by_index only removes a single LMB, so there's no real point to
> > batching here.
> 
> Sure, will be fixed for v2.
> 
> > > @@ -700,6 +712,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
> > >   if (lmbs_added != lmbs_to_add) {
> > >   pr_err("Memory hot-add failed, removing any added LMBs\n");
> > > 
> > > + memory_batch_shrink_begin();
> > 
> > 
> > The effect of these on the memory grow path is far from clear.
> > 
> 
> On hotplug, HPT is resized-up before adding LMBs.
> On hotunplug, HPT is resized-down after removing LMBs.
> And each one has it's own mechanism to batch HPT resizes...
> 
> I can't understand exactly how using it on hotplug fail path can be any
> different than using it on hotunplug.
> > 
> 
> Can you please help me understanding this?
> 
> Best regards,
> Leonardo Bras
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 2/3] powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug

2021-04-18 Thread David Gibson
On Thu, Apr 08, 2021 at 11:51:36PM -0300, Leonardo Bras wrote:
> Hello David, thanks for the feedback!
> 
> On Mon, 2021-03-22 at 18:55 +1100, David Gibson wrote:
> > > +void hash_memory_batch_expand_prepare(unsigned long newsize)
> > > +{
> > > + /*
> > > + * Resizing-up HPT should never fail, but there are some cases system starts with higher
> > > + * SHIFT than required, and we go through the funny case of resizing HPT down while
> > > + * adding memory
> > > +  */
> > > +
> > > + while (resize_hpt_for_hotplug(newsize, false) == -ENOSPC) {
> > > + newsize *= 2;
> > > + pr_warn("Hash collision while resizing HPT\n");
> > 
> > This unbounded increase in newsize makes me nervous - we should be
> > bounded by the current size of the HPT at least.  In practice we
> > should be fine, since the resize should always succeed by the time we
> > reach our current HPT size, but that's far from obvious from this
> > point in the code.
> 
> Sure, I will add bounds in v2.
> 
> > 
> > And... you're doubling newsize which is a value which might not be a
> > power of 2.  I'm wondering if there's an edge case where this could
> > actually cause us to skip the current size and erroneously resize to
> > one bigger than we have currently.
> 
> I also though that at the start, but it seems quite reliable.
> Before using this value, htab_shift_for_mem_size() will always round it
> to next power of 2. 
> Ex.
> Any value between 0b0101 and 0b1000 will be rounded to 0b1000 for shift
> calculation. If we multiply it by 2 (same as << 1), we have that
> anything between 0b01010 and 0b10000 will be rounded to 0b10000.

Ah, good point.
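
(Worked through with concrete numbers, purely as an illustration: if x
lies in (2^(k-1), 2^k] then htab_shift_for_mem_size() rounds it up to
2^k, and 2*x lies in (2^k, 2^(k+1)], which rounds up to exactly 2^(k+1),
so doubling a non-power-of-two size can never skip over the next
power-of-two boundary on the way up.)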

> This works just fine as long as we are multiplying. 
> Division may have the behavior you expect, as 0b0101 >> 1 would become
> 0b010 and skip a shift.
>   
> > > +void memory_batch_expand_prepare(unsigned long newsize)
> > 
> > This wrapper doesn't seem useful.
> 
> Yeah, it does little, but I can't just jump into hash_* functions
> directly from hotplug-memory.c, without even knowing if it's using hash
> pagetables. (in case the suggestion would be test for disable_radix
> inside hash_memory_batch*)
> 
> > 
> > > +{
> > > + if (!radix_enabled())
> > > + hash_memory_batch_expand_prepare(newsize);
> > > +}
> > >  #endif /* CONFIG_MEMORY_HOTPLUG */
> > >  
> > > 
> > > + memory_batch_expand_prepare(memblock_phys_mem_size() +
> > > +  drmem_info->n_lmbs * drmem_lmb_size());
> > 
> > This doesn't look right.  memory_add_by_index() is adding a *single*
> > LMB, I think using drmem_info->n_lmbs here means you're counting this
> > as adding again as much memory as you already have hotplugged.
> 
> Yeah, my mistake. This makes sense.
> I will change it to something like 
> memblock_phys_mem_size() + drmem_lmb_size()
> 
> > > 
> > > + memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * drmem_lmb_size());
> > > +
> > >   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
> > >   if (lmb->flags & DRCONF_MEM_ASSIGNED)
> > >   continue;
> > 
> > I don't see memory_batch_expand_prepare() suppressing any existing HPT
> > resizes.  Won't this just resize to the right size for the full add,
> > then resize several times again as we perform the add?  Or.. I guess
> > that will be suppressed by patch 1/3. 
> 
> Correct.
> 
> >  That's seems kinda fragile, though.
> 
> What do you mean by fragile here?
> What would you suggest doing different?
> 
> Best regards,
> Leonardo Bras
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 2/2] hotplug-cpu.c: set UNISOLATE on dlpar_cpu_remove() failure

2021-04-18 Thread David Gibson
On Fri, Apr 16, 2021 at 06:02:16PM -0300, Daniel Henrique Barboza wrote:
> The RTAS set-indicator call, when attempting to UNISOLATE a DRC that is
> already UNISOLATED or CONFIGURED, returns RTAS_OK and does nothing else
> for both QEMU and phyp. This gives us an opportunity to use this
> behavior to signal the hypervisor layer when an error during device
> removal happens, allowing it to do a proper error handling, while not
> breaking QEMU/phyp implementations that don't have this support.
> 
> This patch introduces this idea by unisolating all CPU DRCs that failed
> to be removed by dlpar_cpu_remove_by_index(), when handling the
> PSERIES_HP_ELOG_ID_DRC_INDEX event. This is being done for this event
> only because its the only CPU removal event QEMU uses, and there's no
> need at this moment to add this mechanism for phyp only code.
> 
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: David Gibson 

except...

> ---
>  arch/powerpc/platforms/pseries/hotplug-cpu.c | 9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 12cbffd3c2e3..ed66895c2f51 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -802,8 +802,15 @@ int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
>   case PSERIES_HP_ELOG_ACTION_REMOVE:
>   if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
>   rc = dlpar_cpu_remove_by_count(count);
> - else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
> + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
>   rc = dlpar_cpu_remove_by_index(drc_index);
> + /* Setting the isolation state of an UNISOLATED/CONFIGURED
> +  * device to UNISOLATE is a no-op, but the hypervison can

typo here s/hypervison/hypervisor/

> +  * use it as a hint that the cpu removal failed.
> +  */
> + if (rc)
> + dlpar_unisolate_drc(drc_index);
> + }
>   else
>   rc = -EINVAL;
>   break;

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 1/2] dlpar.c: introduce dlpar_unisolate_drc()

2021-04-18 Thread David Gibson
On Fri, Apr 16, 2021 at 06:02:15PM -0300, Daniel Henrique Barboza wrote:
> Next patch will execute a set-indicator call in hotplug-cpu.c.
> 
> Create a dlpar_unisolate_drc() helper to avoid spreading more
> rtas_set_indicator() calls outside of dlpar.c.
> 
> Signed-off-by: Daniel Henrique Barboza 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/platforms/pseries/dlpar.c   | 14 ++
>  arch/powerpc/platforms/pseries/pseries.h |  1 +
>  2 files changed, 15 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index 233503fcf8f0..3ac70790ec7a 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -329,6 +329,20 @@ int dlpar_release_drc(u32 drc_index)
>   return 0;
>  }
>  
> +int dlpar_unisolate_drc(u32 drc_index)
> +{
> + int dr_status, rc;
> +
> + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
> + DR_ENTITY_SENSE, drc_index);
> + if (rc || dr_status != DR_ENTITY_PRESENT)
> + return -1;
> +
> + rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
> +
> + return 0;
> +}
> +
>  int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog)
>  {
>   int rc;
> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> b/arch/powerpc/platforms/pseries/pseries.h
> index 4fe48c04c6c2..4ea12037c920 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -55,6 +55,7 @@ extern int dlpar_attach_node(struct device_node *, struct 
> device_node *);
>  extern int dlpar_detach_node(struct device_node *);
>  extern int dlpar_acquire_drc(u32 drc_index);
>  extern int dlpar_release_drc(u32 drc_index);
> +extern int dlpar_unisolate_drc(u32 drc_index);
>  
>  void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog);
>  int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_errlog);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 0/2] pseries: UNISOLATE DRCs to signal device removal error

2021-04-18 Thread David Gibson
On Fri, Apr 16, 2021 at 06:02:14PM -0300, Daniel Henrique Barboza wrote:
> At this moment, PAPR [1] does not have a way to report errors during a device
> removal operation. This puts a strain in the hypervisor, which needs extra
> mechanisms to try to fallback and recover from an error that might have
> happened during the removal. The QEMU community has dealt with it during these
> years by either trying to preempt the error before sending the HP event or, in
> case of a guest side failure, reboot the guest to complete the removal 
> process.
> 
> This started to change with QEMU commit fe1831eff8a4 ("spapr_drc.c: use DRC
> reconfiguration to cleanup DIMM unplug state"), where a way to fallback from a
> memory removal error was introduced. In this case, when QEMU detects that the
> kernel is reconfiguring LMBs DRCs that were marked as pending removal, the
> entire process is reverted from the QEMU side as well. Around the same time,
> other discussions in the QEMU mailing discussed an alternative for other 
> device
> as well.
> 
> In [2] the idea of using RTAS set-indicator for this role was first 
> introduced.
> The RTAS set-indicator call, when attempting to UNISOLATE a DRC that is 
> already
> UNISOLATED or CONFIGURED, returns RTAS_OK and does nothing else for both QEMU
> and phyp. This gives us an opportunity to use this behavior to signal the
> hypervisor layer when a device removal happens, allowing it to do a
> proper

Nit: it's not when a device removal happens, but when it *fails* to happen.

> error handling knowing for sure that the removal failed in the kernel. Using
> set-indicator to report HP errors isn't strange to PAPR, as per R1-13.5.3.4-4.
> of table 13.7 of [1]:

> "For all DR options: If this is a DR operation that involves the user insert-
> ing a DR entity, then if the firmware can determine that the inserted entity
> would cause a system disturbance, then the set-indicator RTAS call must not
> unisolate the entity and must return an error status which is unique to the
> particular error."
> 
> PAPR does not make any restrictions or considerations about setting an already
> Unisolated/Configured DRC to 'unisolate', meaning we have a chance to use it
> for this purpose - signal an OS side error when attempting to remove a DR
> entity.  To validate the design, this is being implemented only for CPUs.
> 
> QEMU will use this mechanism to rollback the device removal (hotunplug) state,
> allowing for a better error handling mechanism. A implementation of how QEMU
> can do it is in [3]. When using a kernel with this series applied, together
> with this QEMU build, this is what happens in a common CPU removal/hotunplug
> error scenario (trying to remove the last online CPU):
> 
> ( QEMU command line: qemu-system-ppc64 -machine pseries,accel=kvm,usb=off
> -smp 1,maxcpus=2,threads=1,cores=2,sockets=1 ... )
> 
> [root@localhost ~]# QEMU 5.2.92 monitor - type 'help' for more information
> (qemu) device_add host-spapr-cpu-core,core-id=1,id=core1
> (qemu) 
> 
> [root@localhost ~]# echo 0 > /sys/devices/system/cpu/cpu0/online
> [   77.548442][   T13] IRQ 19: no longer affine to CPU0
> [   77.548452][   T13] IRQ 20: no longer affine to CPU0
> [   77.548458][   T13] IRQ 256: no longer affine to CPU0
> [   77.548465][   T13] IRQ 258: no longer affine to CPU0
> [   77.548472][   T13] IRQ 259: no longer affine to CPU0
> [   77.548479][   T13] IRQ 260: no longer affine to CPU0
> [   77.548485][   T13] IRQ 261: no longer affine to CPU0
> [   77.548590][T0] cpu 0 (hwid 0) Ready to die...
> [root@localhost ~]# (qemu) 
> (qemu) device_del core1
> (qemu) [   83.214073][  T100] pseries-hotplug-cpu: Failed to offline CPU 
> PowerPC,POWER9, rc: -16
> qemu-system-ppc64: Device hotunplug rejected by the guest for device core1
> 
> (qemu) 
> 
> As soon as the CPU removal fails in dlpar_cpu(), QEMU becames aware of
> it and is able to do error recovery.
> 
> If this solution is well received, I'll push for an architecture change
> request internally at IBM to make this mechanism PAPR official.
> 
> 
> [1] 
> https://openpowerfoundation.org/wp-content/uploads/2020/07/LoPAR-20200611.pdf
> [2] https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg06395.html
> [3] https://github.com/danielhb/qemu/tree/unisolate_drc_callback_v1
> 
> Daniel Henrique Barboza (2):
>   dlpar.c: introduce dlpar_unisolate_drc()
>   hotplug-cpu.c: set UNISOLATE on dlpar_cpu_remove() failure
> 
>  arch/powerpc/platforms/pseries/dlpar.c   | 14 ++
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |  9 -
>  arch/powerpc/platforms/pseries/pseries.h |  1 +
>  3 files changed, 23 insertions(+), 1 deletion(-)
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 3/3] powerpc/smp: Cache CPU to chip lookup

2021-04-18 Thread David Gibson
On Fri, Apr 16, 2021 at 09:27:48PM +0530, Gautham R Shenoy wrote:
> On Thu, Apr 15, 2021 at 11:21:10PM +0530, Srikar Dronamraju wrote:
> > * Gautham R Shenoy  [2021-04-15 22:49:21]:
> > 
> > > > 
> > > > +int *chip_id_lookup_table;
> > > > +
> > > >  #ifdef CONFIG_PPC64
> > > >  int __initdata iommu_is_off;
> > > >  int __initdata iommu_force_on;
> > > > @@ -914,13 +916,22 @@ EXPORT_SYMBOL(of_get_ibm_chip_id);
> > > >  int cpu_to_chip_id(int cpu)
> > > >  {
> > > > struct device_node *np;
> > > > +   int ret = -1, idx;
> > > > +
> > > > +   idx = cpu / threads_per_core;
> > > > +   if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1)
> > > 
> > 
> > > The value -1 is ambiguous since we won't be able to determine if
> > > it is because we haven't yet made a of_get_ibm_chip_id() call
> > > or if of_get_ibm_chip_id() call was made and it returned a -1.
> > > 
> > 
> > We don't allocate chip_id_lookup_table unless cpu_to_chip_id() return
> > !-1 value for the boot-cpuid. So this ensures that we dont
> > unnecessarily allocate chip_id_lookup_table. Also I check for
> > chip_id_lookup_table before calling cpu_to_chip_id() for other CPUs.
> > So this avoids overhead of calling cpu_to_chip_id() for platforms that
> > dont support it.  Also its most likely that if the
> > chip_id_lookup_table is initialized then of_get_ibm_chip_id() call
> > would return a valid value.
> > 
> > + Below we are only populating the lookup table, only when the
> > of_get_cpu_node is valid.
> > 
> > So I dont see any drawbacks of initializing it to -1. Do you see
> any?
> 
> 
> Only if other callers of cpu_to_chip_id() don't check for whether the
> chip_id_lookup_table() has been allocated or not. From a code
> readability point of view, it is easier to have that check  this inside
> cpu_to_chip_id() instead of requiring all its callers to make that
> check.

Even if they do, and the invalid value should never be read, I
think it's worth initializing that way.  It means if there's a mistake
and we do accidentally read the value, then the error is likely to be
much clearer.  Likewise if someone looks at this value from a
debugger, it will be clearer what's going on.
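
For illustration, the general shape of the sentinel-initialised cache
being discussed (a sketch rather than the actual patch; the allocation
site and the 'ncores' count are assumptions here):

    /* One slot per core; -1 means "not cached yet", so a slot that was
     * never filled cannot be mistaken for a real chip id. */
    chip_id_lookup_table = kcalloc(ncores, sizeof(int), GFP_KERNEL);
    if (chip_id_lookup_table)
            for (idx = 0; idx < ncores; idx++)
                    chip_id_lookup_table[idx] = -1;

    /* Fast path in cpu_to_chip_id(): fall back to the device-tree walk
     * only when the table is absent or the slot is still -1. */
    idx = cpu / threads_per_core;
    if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1)
            return chip_id_lookup_table[idx];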

> 
> > 
> > > Thus, perhaps we can initialize chip_id_lookup_table[idx] with a
> > > different unique negative value. How about S32_MIN ? and check
> > > chip_id_lookup_table[idx] is different here ?
> > > 
> > 
> > I had initially initialized to -2, But then I thought we adding in
> > more confusion than necessary and it was not solving any issues.
> > 
> > 
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 1/3] powerpc/smp: Reintroduce cpu_core_mask

2021-04-18 Thread David Gibson
On Fri, Apr 16, 2021 at 11:15:49AM +0530, Srikar Dronamraju wrote:
> * David Gibson  [2021-04-16 13:21:34]:
> 
> Thanks for having a look at the patches.
> 
> > On Thu, Apr 15, 2021 at 05:39:32PM +0530, Srikar Dronamraju wrote:
> > > Daniel reported that with Commit 4ca234a9cbd7 ("powerpc/smp: Stop
> > > updating cpu_core_mask") QEMU was unable to set single NUMA node SMP
> > > topologies such as:
> > >  -smp 8,maxcpus=8,cores=2,threads=2,sockets=2
> > >  i.e he expected 2 sockets in one NUMA node.
> > 
> > Well, strictly speaking, you can still set that topology in qemu but a
> > PAPR guest with that commit will show as having 1 socket in lscpu and
> > similar things.
> > 
> 
> Right, I did mention the o/p of lscpu in QEMU with the said commit and
> with the new patches in the cover letter. Somehow I goofed up the cc
> list for the cover letter.
> 
> Reference for the cover letter:
> https://lore.kernel.org/linuxppc-dev/20210415120934.232271-1-sri...@linux.vnet.ibm.com/t/#u
> 
> > Basically, this is because PAPR has no meaningful distinction between
> > cores and sockets.  So it's kind of a cosmetic problem, but it is a
> > user-unexpected behaviour that it would be nice to avoid if it's not
> > excessively difficult.
> > 
> > > The above commit helped to reduce boot time on Large Systems for
> > > example 4096 vCPU single socket QEMU instance. PAPR is silent on
> > > having more than one socket within a NUMA node.
> > > 
> > > cpu_core_mask and cpu_cpu_mask for any CPU would be same unless the
> > > number of sockets is different from the number of NUMA nodes.
> > 
> > Number of sockets being different from number of NUMA nodes is routine
> > in qemu, and I don't think it's something we should enforce.
> > 
> > > One option is to reintroduce cpu_core_mask but use a slightly
> > > different method to arrive at the cpu_core_mask. Previously each CPU's
> > > chip-id would be compared with all other CPU's chip-id to verify if
> > > both the CPUs were related at the chip level. Now if a CPU 'A' is
> > > found related / (unrelated) to another CPU 'B', all the thread
> > > siblings of 'A' and thread siblings of 'B' are automatically marked as
> > > related / (unrelated).
> > > 
> > > Also if a platform doesn't support ibm,chip-id property, i.e its
> > > cpu_to_chip_id returns -1, cpu_core_map holds a copy of
> > > cpu_cpu_mask().
> > 
> > Yeah, the other weirdness here is that ibm,chip-id isn't a PAPR
> > property at all - it was added for powernv.  We then added it to qemu
> > for PAPR guests because that was the way at the time to get the guest
> > to advertise the expected number of sockets.  It therefore basically
> > *only* exists on PAPR/qemu for that purpose, so if it's not serving it
> > we need to come up with something else.
> > 
> 
> Do you have ideas on what that something could be like?

Not really, sorry.

> So if that's
> more beneficial then we could move over to that scheme. Also apart
> from ibm,chip-id being not a PAPR property, do you have any other
> concerns with it.

I think if we can keep ibm,chip-id doing this job, that would be
simplest - as long as our PAPR usage isn't implying semantics which
contradict what it does on powernv.  AIUI Cédric thought it did that,
but with further discussion it seems like that might have been a
misunderstanding incorrectly conflating chip-id with NUMA nodes.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 1/3] powerpc/smp: Reintroduce cpu_core_mask

2021-04-15 Thread David Gibson
On Thu, Apr 15, 2021 at 05:39:32PM +0530, Srikar Dronamraju wrote:
> Daniel reported that with Commit 4ca234a9cbd7 ("powerpc/smp: Stop
> updating cpu_core_mask") QEMU was unable to set single NUMA node SMP
> topologies such as:
>  -smp 8,maxcpus=8,cores=2,threads=2,sockets=2
>  i.e he expected 2 sockets in one NUMA node.

Well, strictly speaking, you can still set that topology in qemu but a
PAPR guest with that commit will show as having 1 socket in lscpu and
similar things.

Basically, this is because PAPR has no meaningful distinction between
cores and sockets.  So it's kind of a cosmetic problem, but it is a
user-unexpected behaviour that it would be nice to avoid if it's not
excessively difficult.

> The above commit helped to reduce boot time on Large Systems for
> example 4096 vCPU single socket QEMU instance. PAPR is silent on
> having more than one socket within a NUMA node.
> 
> cpu_core_mask and cpu_cpu_mask for any CPU would be same unless the
> number of sockets is different from the number of NUMA nodes.

Number of sockets being different from number of NUMA nodes is routine
in qemu, and I don't think it's something we should enforce.

> One option is to reintroduce cpu_core_mask but use a slightly
> different method to arrive at the cpu_core_mask. Previously each CPU's
> chip-id would be compared with all other CPU's chip-id to verify if
> both the CPUs were related at the chip level. Now if a CPU 'A' is
> found related / (unrelated) to another CPU 'B', all the thread
> siblings of 'A' and thread siblings of 'B' are automatically marked as
> related / (unrelated).
> 
> Also if a platform doesn't support ibm,chip-id property, i.e its
> cpu_to_chip_id returns -1, cpu_core_map holds a copy of
> cpu_cpu_mask().

Yeah, the other weirdness here is that ibm,chip-id isn't a PAPR
property at all - it was added for powernv.  We then added it to qemu
for PAPR guests because that was the way at the time to get the guest
to advertise the expected number of sockets.  It therefore basically
*only* exists on PAPR/qemu for that purpose, so if it's not serving it
we need to come up with something else.

> 
> Fixes: 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask")
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: qemu-...@nongnu.org
> Cc: Cedric Le Goater 
> Cc: David Gibson 
> Cc: Nathan Lynch 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Gautham R Shenoy 
> Reported-by: Daniel Henrique Barboza 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/include/asm/smp.h |  5 +
>  arch/powerpc/kernel/smp.c  | 39 --
>  2 files changed, 37 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 7a13bc20f0a0..47081a9e13ca 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -121,6 +121,11 @@ static inline struct cpumask *cpu_sibling_mask(int cpu)
>   return per_cpu(cpu_sibling_map, cpu);
>  }
>  
> +static inline struct cpumask *cpu_core_mask(int cpu)
> +{
> + return per_cpu(cpu_core_map, cpu);
> +}
> +
>  static inline struct cpumask *cpu_l2_cache_mask(int cpu)
>  {
>   return per_cpu(cpu_l2_cache_map, cpu);
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 5a4d59a1070d..5c7ce1d50631 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1057,17 +1057,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   local_memory_node(numa_cpu_lookup_table[cpu]));
>   }
>  #endif
> - /*
> -  * cpu_core_map is now more updated and exists only since
> -  * its been exported for long. It only will have a snapshot
> -  * of cpu_cpu_mask.
> -  */
> - cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu));
>   }
>  
>   /* Init the cpumasks so the boot CPU is related to itself */
>   cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
>   cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
> + cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
>  
>   if (has_coregroup_support())
>   cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
> @@ -1408,6 +1403,9 @@ static void remove_cpu_from_masks(int cpu)
>   set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
>   }
>  
> + for_each_cpu(i, cpu_core_mask(cpu))
> + set_cpus_unrelated(cpu, i, cpu_core_mask);
> +
>   if (has_coregroup_support()) {
>   for_each_cpu(i, cpu_coregroup_mask(cpu))
>  

Re: [PATCH v3 1/9] powerpc/xive: Use cpu_to_node() instead of "ibm,chip-id" property

2021-03-31 Thread David Gibson
On Wed, Mar 31, 2021 at 04:45:06PM +0200, Cédric Le Goater wrote:
> The 'chip_id' field of the XIVE CPU structure is used to choose a
> target for a source located on the same chip when possible. The XIVE
> driver queries the chip id value from the "ibm,chip-id" DT property
> but this property is not available on all platforms. It was first
> introduced on the PowerNV platform and later, under QEMU for pseries.
> However, the property does not exist under PowerVM since it is not
> specified in PAPR.
> 
> cpu_to_node() is a better alternative. On the PowerNV platform, the
> node id is computed from the "ibm,associativity" property of the CPU.
> Its value is built in the OPAL firmware from the physical chip id and
> is equivalent to "ibm,chip-id".

Hrm... I mean, for powernv this is certainly correct, but seems to be
relying on pretty specific specifics of the OPAL / chip behaviour,
namely that the NUMA id == chip ID.

> On pSeries, the hcall
> H_HOME_NODE_ASSOCIATIVITY returns the node id.

AFAICT, the chip_id field is never actually used in the PAPR version
of XIVE.  The only access to the field outside native.c is in
xive_pick_irq_target(), and it only looks at chip_id if src_chip is
valid.  But src_chip is initialized to XIVE_INVALID_CHIP_ID in papr.c

So it would make more sense to me to also initialize chip_id to
XIVE_INVALID_CHIP_ID for PAPR to make it clearer that it's not
relevant.
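
Something along these lines in the PAPR backend's per-CPU setup would do
it (a sketch of the suggestion only, not a tested patch; 'xc' stands for
the per-CPU XIVE structure):

    xc->chip_id = XIVE_INVALID_CHIP_ID;    /* PAPR: no meaningful chip id */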

> Also to be noted that under QEMU/KVM "ibm,chip-id" is badly calculated
> with unusual SMT configuration. This leads to a bogus chip id value
> being returned by of_get_ibm_chip_id().

I *still* don't clearly understand what you think is bogus about the
chip id value that qemu generates.  It's clearly not a problem for
XIVE, since PAPR XIVE never uses it.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v6 5/6] KVM: PPC: Book3S HV: Add KVM_CAP_PPC_RPT_INVALIDATE capability

2021-03-22 Thread David Gibson
On Thu, Mar 11, 2021 at 02:09:38PM +0530, Bharata B Rao wrote:
> Now that we have H_RPT_INVALIDATE fully implemented, enable
> support for the same via KVM_CAP_PPC_RPT_INVALIDATE KVM capability
> 
> Signed-off-by: Bharata B Rao 

Reviewed-by: David Gibson 

> ---
>  Documentation/virt/kvm/api.rst | 18 ++
>  arch/powerpc/kvm/powerpc.c |  3 +++
>  include/uapi/linux/kvm.h   |  1 +
>  3 files changed, 22 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 1a2b5210cdbf..d769cef5f904 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6227,6 +6227,24 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between 
> them.
>  This capability can be used to check / enable 2nd DAWR feature provided
>  by POWER10 processor.
>  
> +7.24 KVM_CAP_PPC_RPT_INVALIDATE
> +--
> +
> +:Capability: KVM_CAP_PPC_RPT_INVALIDATE
> +:Architectures: ppc
> +:Type: vm
> +
> +This capability indicates that the kernel is capable of handling
> +H_RPT_INVALIDATE hcall.
> +
> +In order to enable the use of H_RPT_INVALIDATE in the guest,
> +user space might have to advertise it for the guest. For example,
> +IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is
> +present in the "ibm,hypertas-functions" device-tree property.
> +
> +This capability is enabled for hypervisors on platforms like POWER9
> +that support radix MMU.
> +
>  8. Other capabilities.
>  ==
>  
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index a2a68a958fa0..be33b5321a76 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -682,6 +682,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
> ext)
>   r = !!(hv_enabled && kvmppc_hv_ops->enable_dawr1 &&
>  !kvmppc_hv_ops->enable_dawr1(NULL));
>   break;
> + case KVM_CAP_PPC_RPT_INVALIDATE:
> + r = 1;
> + break;
>  #endif
>   default:
>   r = 0;
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index f6afee209620..2b2370475cec 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_DIRTY_LOG_RING 192
>  #define KVM_CAP_X86_BUS_LOCK_EXIT 193
>  #define KVM_CAP_PPC_DAWR1 194
> +#define KVM_CAP_PPC_RPT_INVALIDATE 195
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v6 3/6] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE

2021-03-22 Thread David Gibson
On Thu, Mar 11, 2021 at 02:09:36PM +0530, Bharata B Rao wrote:
> H_RPT_INVALIDATE does two types of TLB invalidations:
> 
> 1. Process-scoped invalidations for guests when LPCR[GTSE]=0.
>This is currently not used in KVM as GTSE is not usually
>disabled in KVM.
> 2. Partition-scoped invalidations that an L1 hypervisor does on
>    behalf of an L2 guest. This is currently handled by the
>    H_TLB_INVALIDATE hcall and this new hcall replaces that old one.
> 
> This commit enables process-scoped invalidations for L1 guests.
> Support for process-scoped and partition-scoped invalidations
> from/for nested guests will be added separately.
> 
> Process scoped tlbie invalidations from L1 and nested guests
> need the RS register of the TLBIE instruction to contain both PID and
> LPID.  This patch introduces primitives that execute the tlbie
> instruction with both PID and LPID set in preparation for the
> H_RPT_INVALIDATE hcall.
> 
> A description of H_RPT_INVALIDATE follows:
> 
> int64   /* H_Success: Return code on successful completion */
>     /* H_Busy - repeat the call with the same */
>     /* H_Parameter, H_P2, H_P3, H_P4, H_P5 : Invalid
>  parameters */
> hcall(const uint64 H_RPT_INVALIDATE, /* Invalidate RPT
>   translation
>   lookaside information */
>   uint64 id,    /* PID/LPID to invalidate */
>   uint64 target,    /* Invalidation target */
>   uint64 type,  /* Type of lookaside information */
>   uint64 pg_sizes,  /* Page sizes */
>   uint64 start, /* Start of Effective Address (EA)
>  range (inclusive) */
>   uint64 end)   /* End of EA range (exclusive) */
> 
> Invalidation targets (target)
> -
> Core MMU    0x01 /* All virtual processors in the
>   partition */
> Core local MMU  0x02 /* Current virtual processor */
> Nest MMU    0x04 /* All nest/accelerator agents
>   in use by the partition */
> 
> A combination of the above can be specified,
> except core and core local.
> 
> Type of translation to invalidate (type)
> ---
> NESTED   0x0001  /* invalidate nested guest partition-scope */
> TLB  0x0002  /* Invalidate TLB */
> PWC  0x0004  /* Invalidate Page Walk Cache */
> PRT  0x0008  /* Invalidate caching of Process Table
>   Entries if NESTED is clear */
> PAT  0x0008  /* Invalidate caching of Partition Table
>   Entries if NESTED is set */
> 
> A combination of the above can be specified.
> 
> Page size mask (pages)
> --
> 4K  0x01
> 64K 0x02
> 2M  0x04
> 1G  0x08
> All sizes   (-1UL)
> 
> A combination of the above can be specified.
> All page sizes can be selected with -1.
> 
> Semantics: Invalidate radix tree lookaside information
>    matching the parameters given.
> * Return H_P2, H_P3 or H_P4 if target, type, or pageSizes parameters
>   are different from the defined values.
> * Return H_PARAMETER if NESTED is set and pid is not a valid nested
>   LPID allocated to this partition
> * Return H_P5 if (start, end) doesn't form a valid range. Start and
>   end should be a valid Quadrant address and end > start.
> * Return H_NotSupported if the partition is not running in radix
>   translation mode.
> * May invalidate more translation information than requested.
> * If start = 0 and end = -1, set the range to cover all valid
>   addresses. Else start and end should be aligned to 4kB (lower 11
>   bits clear).
> * If NESTED is clear, then invalidate process scoped lookaside
>   information. Else pid specifies a nested LPID, and the invalidation
>   is performed on nested guest partition table and nested guest
>   partition scope real addresses.
> * If pid = 0 and NESTED is clear, then valid addresses are quadrant 3
>   and quadrant 0 spaces, Else valid addresses are quadrant 0.
> * Pages which are fully covered by the range are to be invalidated.
>   Those which are partially covered are considered outside
>   invalidation range, which allows a caller to optimally invalidate
>   ranges that may contain mixed page sizes.
> * Return H_SUCCESS on success.
> 
> Signed-off-by: Bharata B Rao 

Reviewed-by: David Gibson 

with the exception of one nit noted below.

> ---
>  .../include/asm/book3s/64/tlbflush-radix.h|   4 +
>  arch/powerpc/include/asm/mmu_context.h|  11 ++
>  arch/powerpc/kvm/book3s_hv.c  |  46 ++
>  arch/powerpc/mm/book3s64/radix_tlb.c

Re: [PATCH v6 4/6] KVM: PPC: Book3S HV: Nested support in H_RPT_INVALIDATE

2021-03-22 Thread David Gibson
On Thu, Mar 11, 2021 at 02:09:37PM +0530, Bharata B Rao wrote:
> Enable support for process-scoped invalidations from nested
> guests and partition-scoped invalidations for nested guests.
> 
> Process-scoped invalidations for any level of nested guests
> are handled by implementing H_RPT_INVALIDATE handler in the
> nested guest exit path in L0.
> 
> Partition-scoped invalidation requests are forwarded to the
> right nested guest, handled there and passed down to L0
> for eventual handling.
> 
> Signed-off-by: Bharata B Rao 
> Signed-off-by: Aneesh Kumar K.V 
>   [Nested guest partition-scoped invalidation changes]

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/include/asm/kvm_book3s.h |   3 +
>  arch/powerpc/kvm/book3s_hv.c  |  71 +-
>  arch/powerpc/kvm/book3s_hv_nested.c   | 104 ++
>  3 files changed, 175 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
> b/arch/powerpc/include/asm/kvm_book3s.h
> index 2f5f919f6cd3..de8fc5a4d19c 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -305,6 +305,9 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 
> dw1);
>  void kvmhv_release_all_nested(struct kvm *kvm);
>  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
>  long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
> +long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid,
> +  unsigned long type, unsigned long pg_sizes,
> +  unsigned long start, unsigned long end);
>  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
> u64 time_limit, unsigned long lpcr);
>  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 5d008468347c..03755389efd1 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -922,6 +922,46 @@ static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
>   return yield_count;
>  }
>  
> +/*
> + * H_RPT_INVALIDATE hcall handler for nested guests.
> + *
> + * Handles only nested process-scoped invalidation requests in L0.
> + */
> +static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
> +{
> + unsigned long type = kvmppc_get_gpr(vcpu, 6);
> + unsigned long pid, pg_sizes, start, end, psize;
> + struct kvm_nested_guest *gp;
> + struct mmu_psize_def *def;
> +
> + /*
> +  * The partition-scoped invalidations aren't handled here in L0.
> +  */
> + if (type & H_RPTI_TYPE_NESTED)
> + return RESUME_HOST;
> +
> + pid = kvmppc_get_gpr(vcpu, 4);
> + pg_sizes = kvmppc_get_gpr(vcpu, 7);
> + start = kvmppc_get_gpr(vcpu, 8);
> + end = kvmppc_get_gpr(vcpu, 9);
> +
> + gp = kvmhv_get_nested(vcpu->kvm, vcpu->kvm->arch.lpid, false);
> + if (!gp)
> + goto out;
> +
> + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
> + def = &mmu_psize_defs[psize];
> + if (pg_sizes & def->h_rpt_pgsize)
> + do_h_rpt_invalidate_prt(pid, gp->shadow_lpid, type,
> + (1UL << def->shift), psize,
> + start, end);
> + }
> + kvmhv_put_nested(gp);
> +out:
> + kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
> + return RESUME_GUEST;
> +}
> +
>  static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
>   unsigned long id, unsigned long target,
>   unsigned long type, unsigned long pg_sizes,
> @@ -938,10 +978,18 @@ static long kvmppc_h_rpt_invalidate(struct kvm_vcpu 
> *vcpu,
>  
>   /*
>* Partition-scoped invalidation for nested guests.
> -  * Not yet supported
>*/
> - if (type & H_RPTI_TYPE_NESTED)
> - return H_P3;
> + if (type & H_RPTI_TYPE_NESTED) {
> + if (!nesting_enabled(vcpu->kvm))
> + return H_FUNCTION;
> +
> + /* Support only cores as target */
> + if (target != H_RPTI_TARGET_CMMU)
> + return H_P2;
> +
> + return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
> +start, end);
> + }
>  
>   /*
>* Process-scoped invalidation for L1 guests.
> @@ -1636,6 +1684,23 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu 
> *vcpu)
>   if (!xics_on_xive())
>  

Re: [PATCH v6 6/6] KVM: PPC: Book3S HV: Use H_RPT_INVALIDATE in nested KVM

2021-03-22 Thread David Gibson
On Thu, Mar 11, 2021 at 02:09:39PM +0530, Bharata B Rao wrote:
> In the nested KVM case, replace H_TLB_INVALIDATE by the new hcall
> H_RPT_INVALIDATE if available. The availability of this hcall
> is determined from "hcall-rpt-invalidate" string in ibm,hypertas-functions
> DT property.
> 
> Signed-off-by: Bharata B Rao 
> Reviewed-by: Fabiano Rosas 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 27 +-
>  arch/powerpc/kvm/book3s_hv_nested.c| 12 ++--
>  2 files changed, 32 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
> b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index e603de7ade52..1e1e55fd0ee5 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -21,6 +21,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  /*
>   * Supported radix tree geometry.
> @@ -318,9 +319,19 @@ void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned 
> long addr,
>   }
>  
>   psi = shift_to_mmu_psize(pshift);
> - rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
> - rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
> - lpid, rb);
> +
> + if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
> + rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
> + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 
> 1),
> + lpid, rb);
> + } else {
> + rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
> + H_RPTI_TYPE_NESTED |
> + H_RPTI_TYPE_TLB,
> + psize_to_rpti_pgsize(psi),
> + addr, addr + psize);
> + }
> +
>   if (rc)
>   pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
>  }
> @@ -334,8 +345,14 @@ static void kvmppc_radix_flush_pwc(struct kvm *kvm, 
> unsigned int lpid)
>   return;
>   }
>  
> - rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
> - lpid, TLBIEL_INVAL_SET_LPID);
> + if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
> + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 
> 1),
> + lpid, TLBIEL_INVAL_SET_LPID);
> + else
> + rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
> + H_RPTI_TYPE_NESTED |
> + H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
> + 0, -1UL);
>   if (rc)
>   pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
>  }
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
> b/arch/powerpc/kvm/book3s_hv_nested.c
> index adcc8e26ef22..5601b7eb9b89 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -19,6 +19,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  static struct patb_entry *pseries_partition_tb;
>  
> @@ -444,8 +445,15 @@ static void kvmhv_flush_lpid(unsigned int lpid)
>   return;
>   }
>  
> - rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
> - lpid, TLBIEL_INVAL_SET_LPID);
> + if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
> + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 
> 1),
> + lpid, TLBIEL_INVAL_SET_LPID);
> + else
> + rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
> + H_RPTI_TYPE_NESTED |
> + H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
> + H_RPTI_TYPE_PAT,
> + H_RPTI_PAGE_ALL, 0, -1UL);
>   if (rc)
>   pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
>  }

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v6 2/6] powerpc/book3s64/radix: Add H_RPT_INVALIDATE pgsize encodings to mmu_psize_def

2021-03-22 Thread David Gibson
On Thu, Mar 11, 2021 at 02:09:35PM +0530, Bharata B Rao wrote:
> Add a field to mmu_psize_def to store the page size encodings
> of H_RPT_INVALIDATE hcall. Initialize this while scanning the radix
> AP encodings. This will be used when invalidating with required
> page size encoding in the hcall.
> 
> Signed-off-by: Bharata B Rao 

Having the table be the source of truth and implementing
psize_to_rpti_pgsize() in terms of it would be nicer.  But... I guess
you can't really do that, because you're dynamically initializing the
table from the device tree, but the device tree doesn't include the
RPTI encodings.  Oh well.

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/include/asm/book3s/64/mmu.h | 1 +
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 5 +
>  2 files changed, 6 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
> b/arch/powerpc/include/asm/book3s/64/mmu.h
> index eace8c3f7b0a..c02f42d1031e 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> @@ -19,6 +19,7 @@ struct mmu_psize_def {
>   int penc[MMU_PAGE_COUNT];   /* HPTE encoding */
>   unsigned inttlbiel; /* tlbiel supported for that page size */
>   unsigned long   avpnm;  /* bits to mask out in AVPN in the HPTE */
> + unsigned long   h_rpt_pgsize; /* H_RPT_INVALIDATE page size encoding */
>   union {
>   unsigned long   sllp;   /* SLB L||LP (exact mask to use in 
> slbmte) */
>   unsigned long ap;   /* Ap encoding used by PowerISA 3.0 */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 98f0b243c1ab..1b749899016b 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -486,6 +486,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long 
> node,
>   def = &mmu_psize_defs[idx];
>   def->shift = shift;
>   def->ap  = ap;
> + def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
>   }
>  
>   /* needed ? */
> @@ -560,9 +561,13 @@ void __init radix__early_init_devtree(void)
>*/
>   mmu_psize_defs[MMU_PAGE_4K].shift = 12;
>   mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
> + mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
> + psize_to_rpti_pgsize(MMU_PAGE_4K);
>  
>   mmu_psize_defs[MMU_PAGE_64K].shift = 16;
>   mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
> +     mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
> + psize_to_rpti_pgsize(MMU_PAGE_64K);
>   }
>  
>   /*

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v6 1/6] KVM: PPC: Book3S HV: Fix comments of H_RPT_INVALIDATE arguments

2021-03-22 Thread David Gibson
On Thu, Mar 11, 2021 at 02:09:34PM +0530, Bharata B Rao wrote:
> From: "Aneesh Kumar K.V" 
> 
> The type values H_RPTI_TYPE_PRT and H_RPTI_TYPE_PAT indicate
> invalidating the caching of process and partition scoped entries
> respectively.
> 
> Signed-off-by: Aneesh Kumar K.V 
> Signed-off-by: Bharata B Rao 

Not sure the change really clarifies that much, but whatever

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/include/asm/hvcall.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/hvcall.h 
> b/arch/powerpc/include/asm/hvcall.h
> index ed6086d57b22..6af7bb3c9121 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -411,9 +411,9 @@
>  #define H_RPTI_TYPE_NESTED   0x0001  /* Invalidate nested guest 
> partition-scope */
>  #define H_RPTI_TYPE_TLB  0x0002  /* Invalidate TLB */
>  #define H_RPTI_TYPE_PWC  0x0004  /* Invalidate Page Walk Cache */
> -/* Invalidate Process Table Entries if H_RPTI_TYPE_NESTED is clear */
> +/* Invalidate caching of Process Table Entries if H_RPTI_TYPE_NESTED is 
> clear */
>  #define H_RPTI_TYPE_PRT  0x0008
> -/* Invalidate Partition Table Entries if H_RPTI_TYPE_NESTED is set */
> +/* Invalidate caching of Partition Table Entries if H_RPTI_TYPE_NESTED is 
> set */
>  #define H_RPTI_TYPE_PAT  0x0008
>  #define H_RPTI_TYPE_ALL      (H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | \
>H_RPTI_TYPE_PRT)

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-03-22 Thread David Gibson
> + while (resize_hpt_for_hotplug(newsize, true) == -ENOSPC) {
> + newsize *= 2;

As noted earlier, doing this without an explicit cap on the new HPT
size (at the existing size) makes me nervous.  Less so, but doing
the calculations on memory size, rather than explicitly on HPT size /
HPT order, also seems kinda clunky.

> + pr_warn("Hash collision while resizing HPT\n");
> + }
> +}
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  static void __init hash_init_partition_table(phys_addr_t hash_table,
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
> b/arch/powerpc/mm/book3s64/pgtable.c
> index f1cd8af0f67f..e01681e22e00 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -199,6 +199,18 @@ void memory_batch_expand_prepare(unsigned long newsize)
>   if (!radix_enabled())
>   hash_memory_batch_expand_prepare(newsize);
>  }
> +
> +void memory_batch_shrink_begin(void)
> +{
> + if (!radix_enabled())
> + hash_memory_batch_shrink_begin();
> +}
> +
> +void memory_batch_shrink_end(void)
> +{
> + if (!radix_enabled())
> + hash_memory_batch_shrink_end();
> +}

Again, these wrappers don't seem particularly useful to me.

>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  void __init mmu_partition_table_init(void)
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 353c71249214..9182fb5b5c01 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -425,6 +425,8 @@ static int dlpar_memory_remove_by_count(u32 
> lmbs_to_remove)
>   return -EINVAL;
>   }
>  
> + memory_batch_shrink_begin();
> +
>   for_each_drmem_lmb(lmb) {
>   rc = dlpar_remove_lmb(lmb);
>   if (rc)
> @@ -470,6 +472,8 @@ static int dlpar_memory_remove_by_count(u32 
> lmbs_to_remove)
>   rc = 0;
>   }
>  
> + memory_batch_shrink_end();
> +
>   return rc;
>  }
>  
> @@ -481,6 +485,8 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
>  
>   pr_debug("Attempting to hot-remove LMB, drc index %x\n", drc_index);
>  
> + memory_batch_shrink_begin();
> +
>   lmb_found = 0;
>   for_each_drmem_lmb(lmb) {
>   if (lmb->drc_index == drc_index) {
> @@ -502,6 +508,8 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
>   else
>   pr_debug("Memory at %llx was hot-removed\n", lmb->base_addr);
>  
> + memory_batch_shrink_end();

remove_by_index only removes a single LMB, so there's no real point to
batching here.

>   return rc;
>  }
>  
> @@ -532,6 +540,8 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
> u32 drc_index)
>   if (lmbs_available < lmbs_to_remove)
>   return -EINVAL;
>  
> + memory_batch_shrink_begin();
> +
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
>   if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
>   continue;
> @@ -572,6 +582,8 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
> u32 drc_index)
>   }
>   }
>  
> + memory_batch_shrink_end();
> +
>   return rc;
>  }
>  
> @@ -700,6 +712,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
>   if (lmbs_added != lmbs_to_add) {
>   pr_err("Memory hot-add failed, removing any added LMBs\n");
>  
> + memory_batch_shrink_begin();


The effect of these on the memory grow path is far from clear.

>   for_each_drmem_lmb(lmb) {
>   if (!drmem_lmb_reserved(lmb))
>   continue;
> @@ -713,6 +726,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
>  
>   drmem_remove_lmb_reservation(lmb);
>   }
> + memory_batch_shrink_end();
>   rc = -EINVAL;
>   } else {
>   for_each_drmem_lmb(lmb) {
> @@ -814,6 +828,7 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
> drc_index)
>   if (rc) {
>   pr_err("Memory indexed-count-add failed, removing any added 
> LMBs\n");
>  
> + memory_batch_shrink_begin();
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
>   if (!drmem_lmb_reserved(lmb))
>   continue;
> @@ -827,6 +842,7 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
> drc_index)
>  
>   drmem_remove_lmb_reservation(lmb);
>   }
> + memory_batch_shrink_end();
>   rc = -EINVAL;
>   } else {
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-03-22 Thread David Gibson
On Fri, Mar 12, 2021 at 04:29:39AM -0300, Leonardo Bras wrote:
> Because hypervisors may need to create HPTs without knowing the guest
> page size, the smallest used page-size (4k) may be chosen, resulting in
> a HPT that is possibly bigger than needed.
> 
> On a guest with bigger page-sizes, the amount of entries for HTP may be
> too high, causing the guest to ask for a HPT resize-down on the first
> hotplug.
> 
> This becomes a problem when HPT resize-down fails, and causes the
> HPT resize to be performed on every LMB added, until HPT size is
> compatible to guest memory size, causing a major slowdown.
> 
> So, avoiding HPT resizing-down on hot-add significantly improves memory
> hotplug times.
> 
> As an example, hotplugging 256GB on a 129GB guest took 710s without this
> patch, and 21s after applied.
> 
> Signed-off-by: Leonardo Bras 

I don't love this approach.  Adding the extra flag at this level seems
a bit inelegant, and it means we're passing up an easy opportunity to
reduce our resource footprint on the host.

But... maybe we'll have to do it.  I'd like to see if we can get
things to work well enough with just the "batching" to avoid multiple
resize attempts first.

> ---
>  arch/powerpc/mm/book3s64/hash_utils.c | 36 ---
>  1 file changed, 21 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
> b/arch/powerpc/mm/book3s64/hash_utils.c
> index 73b06adb6eeb..cfb3ec164f56 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -794,7 +794,7 @@ static unsigned long __init htab_get_table_size(void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -static int resize_hpt_for_hotplug(unsigned long new_mem_size)
> +static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
>  {
>   unsigned target_hpt_shift;
>  
> @@ -803,19 +803,25 @@ static int resize_hpt_for_hotplug(unsigned long 
> new_mem_size)
>  
>   target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
>  
> - /*
> -  * To avoid lots of HPT resizes if memory size is fluctuating
> -  * across a boundary, we deliberately have some hysterisis
> -  * here: we immediately increase the HPT size if the target
> -  * shift exceeds the current shift, but we won't attempt to
> -  * reduce unless the target shift is at least 2 below the
> -  * current shift
> -  */
> - if (target_hpt_shift > ppc64_pft_size ||
> - target_hpt_shift < ppc64_pft_size - 1)
> - return mmu_hash_ops.resize_hpt(target_hpt_shift);
> + if (shrinking) {
>  
> - return 0;
> + /*
> +  * To avoid lots of HPT resizes if memory size is fluctuating
> +  * across a boundary, we deliberately have some hysterisis
> +  * here: we immediately increase the HPT size if the target
> +  * shift exceeds the current shift, but we won't attempt to
> +  * reduce unless the target shift is at least 2 below the
> +  * current shift
> +  */
> +
> + if (target_hpt_shift >= ppc64_pft_size - 1)
> + return 0;
> +
> + } else if (target_hpt_shift <= ppc64_pft_size) {
> + return 0;
> + }
> +
> + return mmu_hash_ops.resize_hpt(target_hpt_shift);
>  }
>  
>  int hash__create_section_mapping(unsigned long start, unsigned long end,
> @@ -828,7 +834,7 @@ int hash__create_section_mapping(unsigned long start, 
> unsigned long end,
>   return -1;
>   }
>  
> - resize_hpt_for_hotplug(memblock_phys_mem_size());
> + resize_hpt_for_hotplug(memblock_phys_mem_size(), false);
>  
>   rc = htab_bolt_mapping(start, end, __pa(start),
>  pgprot_val(prot), mmu_linear_psize,
> @@ -847,7 +853,7 @@ int hash__remove_section_mapping(unsigned long start, 
> unsigned long end)
>   int rc = htab_remove_mapping(start, end, mmu_linear_psize,
>        mmu_kernel_ssize);
>  
> - if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
> + if (resize_hpt_for_hotplug(memblock_phys_mem_size(), true) == -ENOSPC)
>   pr_warn("Hash collision while resizing HPT\n");
>  
>   return rc;

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 2/3] powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug

2021-03-22 Thread David Gibson
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -671,6 +671,8 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
>   if (lmbs_available < lmbs_to_add)
>   return -EINVAL;
>  
> + memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * 
> drmem_lmb_size());
> +
>   for_each_drmem_lmb(lmb) {
>   if (lmb->flags & DRCONF_MEM_ASSIGNED)
>   continue;
> @@ -734,6 +736,8 @@ static int dlpar_memory_add_by_index(u32 drc_index)
>  
>   pr_info("Attempting to hot-add LMB, drc index %x\n", drc_index);
>  
> + memory_batch_expand_prepare(memblock_phys_mem_size() +
> +  drmem_info->n_lmbs * drmem_lmb_size());

This doesn't look right.  memory_add_by_index() is adding a *single*
LMB; I think using drmem_info->n_lmbs here means you're counting this
as adding as much memory again as you already have hotplugged.
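
For a single LMB I'd have expected something more like (sketch):

	memory_batch_expand_prepare(memblock_phys_mem_size() + drmem_lmb_size());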

>   lmb_found = 0;
>   for_each_drmem_lmb(lmb) {
>   if (lmb->drc_index == drc_index) {
> @@ -788,6 +792,8 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
> drc_index)
>   if (lmbs_available < lmbs_to_add)
>   return -EINVAL;
>  
> + memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * 
> drmem_lmb_size());
> +
>   for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
>   if (lmb->flags & DRCONF_MEM_ASSIGNED)
>   continue;

I don't see memory_batch_expand_prepare() suppressing any existing HPT
resizes.  Won't this just resize to the right size for the full add,
then resize several times again as we perform the add?  Or... I guess
that will be suppressed by patch 1/3.  That seems kinda fragile,
though.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH] powerpc/numa: Fix topology_physical_package_id() on pSeries

2021-03-21 Thread David Gibson
On Tue, Mar 16, 2021 at 01:24:37PM +0100, Cédric Le Goater wrote:
> The topology-id of a CPU in a pSeries machine can be queried from
> sysfs but under PowerVM the value is always -1 even if NUMA nodes are
> defined. This is because the topology_physical_package_id() routine is
> using the "ibm,chip-id" property which is not specified in PAPR.
> 
> Under QEMU/KVM, things are different because QEMU populates the CPU DT
> node with "ibm,chip-id" property. However, its value can be incorrect
> for uncommon SMT configuration and expose a bogus topology-id value in
> sysfs.

Incorrect in what sense?  It's still indicating the (admittedly
arbitrary) qemu socket number, isn't it?  And isn't that what it
should be?

> The use of cpu_to_node() guarantees to have a correct NUMA node id
> under both environments QEMU/KVM and PowerVM. This introduces a slight
> change for the QEMU/KVM guest, as the topology-id now matches the NUMA
> node and not the socket-id as before. Since QEMU also needs to remove
> "ibm,chip-id" property for the DT to follow the PAPR specs, both
> hypervisor environments will be in sync.
> 
> On the PowerNV side, the NUMA node id returned by cpu_to_node() is
> computed from the "ibm,associativity" property of the CPU. Its value
> is built from the OPAL chip id and is equivalent to "ibm,chip-id".

Like mpe, I'm not convinced this is the right approach.  "physical
package" and NUMA node are not the same thing, except sometimes by
accident.

> 
> Cc: Nathan Lynch 
> Cc: Srikar Dronamraju 
> Cc: Vasant Hegde 
> Reviewed-by: Greg Kurz 
> Reviewed-by: Daniel Henrique Barboza 
> Tested-by: Daniel Henrique Barboza 
> Reviewed-by: Srikar Dronamraju 
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/include/asm/topology.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index 3beeb030cd78..887c42a4e43d 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -123,7 +123,7 @@ static inline int cpu_to_coregroup_id(int cpu)
>  #ifdef CONFIG_PPC64
>  #include 
>  
> -#define topology_physical_package_id(cpu)(cpu_to_chip_id(cpu))
> +#define topology_physical_package_id(cpu)(cpu_to_node(cpu))
>  
>  #define topology_sibling_cpumask(cpu)(per_cpu(cpu_sibling_map, cpu))
>  #define topology_core_cpumask(cpu)   (cpu_cpu_mask(cpu))

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: Advice needed on SMP regression after cpu_core_mask change

2021-03-21 Thread David Gibson
 that calculated cpu_core_mask. 
> > > cpu_core_mask, despite
> > > its shortcomings that caused its removal, was giving a precise SMP 
> > > topology. And it was
> > > using physical_package_id/'ibm,chip-id' for that.
> > 
> > ibm,chip-id is a no-no on pSeries. I guess this is inherent to PAPR which
> > is hiding a lot of the underlying HW and topology. May be we are trying
> > to reconcile two orthogonal views of machine virtualization ...
> > 
> > > Checking in QEMU I can say that the ibm,chip-id calculation is the only 
> > > place in the code
> > > that cares about cores per socket information. The kernel is now ignoring 
> > > that, starting
> > > on 4bce545903fa, and now QEMU is unable to provide this info to the guest.
> > > 
> > > If we're not going to use ibm,chip-id any longer, which seems sensible 
> > > given that PAPR does
> > > not declare it, we need another way of letting the guest know how many 
> > > cores per socket
> > > we want.
> > The RTAS call "ibm,get-system-parameter" with token "Processor Module
> > Information" returns that kind of information :
> > 
> >2 byte binary number (N) of module types followed by N module specifiers 
> > of the form:
> >2 byte binary number (M) of sockets of this module type
> >2 byte binary number (L) of chips per this module type
> >2 byte binary number (K) of cores per chip in this module type.
> > 
> > See the values in these sysfs files :
> > 
> >cat /sys/devices/hv_24x7/interface/{sockets,chipspersocket,coresperchip}
> > 
> > But I am afraid these are host level information and not guest/LPAR.
> 
> 
> I believe there might be some sort of reasoning behind not having this on
> PAPR, but I'll say in advance that the virtual machine should act as the
> real hardware, as close as possible. This is the kind of hcall that could
> be used in this situation.

In the case of POWER, that's pretty much a lost battle.  The
virtualization features of the CPU don't really permit full hardware
virtualization - it has to be a paravirtualized environment.  Once
that's the case, the value of keeping secondary things the same
between the bare metal and paravirt environments isn't that compelling
any more.

> > I didn't find any LPAR level properties or hcalls in the PAPR document.
> > They need to be specified.
> > 
> > or
> > 
> > We can add extra properties like ibm,chip-id but making sure it's only
> > used under the KVM hypervisor. My understanding is that's something we
> > are trying to avoid.
> 
> We can change PAPR to add ibm,chip-id. Problem is that ibm,chip-id today, with
> the current kernel codebase, does not fix the issue because the code is
> ignoring it hehehe
> 
> 
> If we're going to change PAPR -  and I believe we should, there's a clear
> lack of proper support for SMP topologies - we'd better make sure that 
> whatever
> attribute/hcall we add there fixes it in a robust way for the long term.
> 
> 
> Thanks,
> 
> 
> DHB
> 
> 
> > 
> > C.
> > 
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 1/8] powerpc/xive: Use cpu_to_node() instead of ibm,chip-id property

2021-03-11 Thread David Gibson
On Tue, 9 Mar 2021 18:26:35 +0100
Cédric Le Goater  wrote:

> On 3/9/21 6:08 PM, Daniel Henrique Barboza wrote:
> > 
> > 
> > On 3/9/21 12:33 PM, Cédric Le Goater wrote:  
> >> On 3/8/21 6:13 PM, Greg Kurz wrote:  
> >>> On Wed, 3 Mar 2021 18:48:50 +0100
> >>> Cédric Le Goater  wrote:
> >>>  
> >>>> The 'chip_id' field of the XIVE CPU structure is used to choose a
> >>>> target for a source located on the same chip when possible. This field
> >>>> is assigned on the PowerNV platform using the "ibm,chip-id" property
> >>>> on pSeries under KVM when NUMA nodes are defined but it is undefined  
> >>>
> >>> This sentence seems to have a syntax problem... like it is missing an
> >>> 'and' before 'on pSeries'.  
> >>
> >> ah yes, or simply a comma.
> >>  
> >>>> under PowerVM. The XIVE source structure has a similar field
> >>>> 'src_chip' which is only assigned on the PowerNV platform.
> >>>>
> >>>> cpu_to_node() returns a compatible value on all platforms, 0 being the
> >>>> default node. It will also give us the opportunity to set the affinity
> >>>> of a source on pSeries when we can localize them.
> >>>>  
> >>>
> >>> IIUC this relies on the fact that the NUMA node id is == to chip id
> >>> on PowerNV, i.e. xc->chip_id which is passed to OPAL remain stable
> >>> with this change.  
> >>
> >> Linux sets the NUMA node in numa_setup_cpu(). On pseries, the hcall
> >> H_HOME_NODE_ASSOCIATIVITY returns the node id if I am correct (Daniel
> >> in Cc:)  
>  [...]  
> >>
> >> On PowerNV, Linux uses "ibm,associativity" property of the CPU to find
> >> the node id. This value is built from the chip id in OPAL, so the
> >> value returned by cpu_to_node(cpu) and the value of the "ibm,chip-id"
> >> property are unlikely to be different.
> >>
> >> cpu_to_node(cpu) is used in many places to allocate the structures
> >> locally to the owning node. XIVE is not an exception (see below in the
> >> same patch), it is better to be consistent and get the same information
> >> (node id) using the same routine.
> >>
> >>
> >> In Linux, "ibm,chip-id" is only used in low level PowerNV drivers :
> >> LPC, XSCOM, RNG, VAS, NX. XIVE should be in that list also but skiboot
> >> unifies the controllers of the system to only expose one to the OS. This
> >> is problematic and should be changed but it's another topic.
> >>
> >>  
> >>> On the other hand, you have the pSeries case under PowerVM that
> >>> doesn't set xc->chip_id, which isn't passed to any hcall AFAICT.  
> >>
> >> yes "ibm,chip-id" is an OPAL concept unfortunately and it has no meaning
> >> under PAPR. xc->chip_id on pseries (PowerVM) will contains an invalid
> >> chip id.
> >>
> >> QEMU/KVM exposes "ibm,chip-id" but it's not used. (its value is not
> >> always correct btw)  
> > 
> > 
> > If you have a way to reliably reproduce this, let me know and I'll fix it
> > up in QEMU.  
> 
> with :
> 
>-smp 4,cores=1,maxcpus=8 -object memory-backend-ram,id=ram-node0,size=2G 
> -numa node,nodeid=0,cpus=0-1,cpus=4-5,memdev=ram-node0 -object 
> memory-backend-ram,id=ram-node1,size=2G -numa 
> node,nodeid=1,cpus=2-3,cpus=6-7,memdev=ram-node1
> 
> # dmesg | grep numa
> [0.013106] numa: Node 0 CPUs: 0-1
> [0.013136] numa: Node 1 CPUs: 2-3
> 
> # dtc -I fs /proc/device-tree/cpus/ -f | grep ibm,chip-id
>   ibm,chip-id = <0x01>;
>   ibm,chip-id = <0x02>;
>   ibm,chip-id = <0x00>;
>   ibm,chip-id = <0x03>;
> 
> with :
> 
>   -smp 4,cores=4,maxcpus=8,threads=1 -object 
> memory-backend-ram,id=ram-node0,size=2G -numa 
> node,nodeid=0,cpus=0-1,cpus=4-5,memdev=ram-node0 -object 
> memory-backend-ram,id=ram-node1,size=2G -numa 
> node,nodeid=1,cpus=2-3,cpus=6-7,memdev=ram-node1
> 
> # dmesg | grep numa
> [0.013106] numa: Node 0 CPUs: 0-1
> [0.013136] numa: Node 1 CPUs: 2-3
> 
> # dtc -I fs /proc/device-tree/cpus/ -f | grep ibm,chip-id
>   ibm,chip-id = <0x00>;
>   ibm,chip-id = <0x00>;
>   ibm,chip-id = <0x00>;
>   ibm,chip-id = <0x00>;
> 
> I think we should simply remove "ibm,chip-id" since it's not used and
> not in the PAPR spec.

As I mentioned to Daniel on our call this morning, oddly it *does*
appear to be used in the RHEL kernel, even though that's 4.18 based.
This patch seems to have caused a minor regression; not in the
identification of NUMA nodes, but in the number of sockets shown by
lscpu, etc.  See https://bugzilla.redhat.com/show_bug.cgi?id=1934421
for more information.

Since the value was used by some PAPR kernels - even if they shouldn't
have - I think we should only remove this for newer machine types.  We
also need to work out what we're not supplying that makes the guest
kernel show a different number of sockets than specified on the qemu
command line.
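
In QEMU terms I'm imagining something along these lines (sketch only;
the machine-class flag name is invented for illustration):

    /* hw/ppc/spapr.c, while building the per-CPU DT node */
    if (smc->pre_6_0_ibm_chip_id) {     /* set only for older machine types */
        _FDT(fdt_setprop_cell(fdt, offset, "ibm,chip-id", chip_id));
    }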

> 
> Thanks,
> 
> C.
> 
>  
> 
>  [...]  
>  [...]  
>  [...]  
>  [...]  
>  [...]  
>  [...]  
>  [...]  
>  [...]  
>  [...]  
> 


-- 
David Gibson 
Principal Software Engineer, Virtualization, Red Hat



Re: [PATCH v5 2/3] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE

2021-03-03 Thread David Gibson
On Tue, Mar 02, 2021 at 10:28:53AM +0530, Bharata B Rao wrote:
> On Tue, Mar 02, 2021 at 12:45:18PM +1100, David Gibson wrote:
> > > diff --git a/Documentation/virt/kvm/api.rst 
> > > b/Documentation/virt/kvm/api.rst
> > > index 45fd862ac128..38ce3f21b21f 100644
> > > --- a/Documentation/virt/kvm/api.rst
> > > +++ b/Documentation/virt/kvm/api.rst
> > > @@ -6225,6 +6225,24 @@ KVM_RUN_BUS_LOCK flag is used to distinguish 
> > > between them.
> > >  This capability can be used to check / enable 2nd DAWR feature provided
> > >  by POWER10 processor.
> > >  
> > > +7.23 KVM_CAP_PPC_RPT_INVALIDATE
> > > +--
> > > +
> > > +:Capability: KVM_CAP_PPC_RPT_INVALIDATE
> > > +:Architectures: ppc
> > > +:Type: vm
> > > +
> > > +This capability indicates that the kernel is capable of handling
> > > +H_RPT_INVALIDATE hcall.
> > > +
> > > +In order to enable the use of H_RPT_INVALIDATE in the guest,
> > > +user space might have to advertise it for the guest. For example,
> > > +IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is
> > > +present in the "ibm,hypertas-functions" device-tree property.
> > > +
> > > +This capability is enabled for hypervisors on platforms like POWER9
> > > +that support radix MMU.
> > 
> > Does this mean that KVM will handle the hypercall, even if not
> > explicitly enabled by userspace (qemu)?  That's generally not what we
> > want, since we need to allow qemu to set up backwards compatible
> > guests.
> 
> This capability only indicates that hypervisor supports the hcall.
> 
> QEMU will check for this and conditionally enable the hcall
> (via KVM_CAP_PPC_ENABLE_HCALL ioctl). Enabling the hcall is
> conditional to cap-rpt-invalidate sPAPR machine capability being
> enabled by the user. Will post a followup QEMU patch shortly.

Ok.
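
For the record, the userspace side would then presumably look something
like this (rough sketch; vm_fd is the usual VM file descriptor and the
H_RPT_INVALIDATE token comes from hvcall.h):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_PPC_ENABLE_HCALL,
		.args[0] = H_RPT_INVALIDATE,	/* hcall token from hvcall.h */
		.args[1] = 1,			/* 1 = enable, 0 = disable */
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		/* fall back to the existing H_TLB_INVALIDATE path */;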

> Older QEMU patch can be found here:
> https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00627.html
> 
> > 
> > > +
> > >  8. Other capabilities.
> > >  ==
> > >  
> > > diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h 
> > > b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> > > index 8b33601cdb9d..a46fd37ad552 100644
> > > --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> > > +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> > > @@ -4,6 +4,10 @@
> > >  
> > >  #include 
> > >  
> > > +#define RIC_FLUSH_TLB 0
> > > +#define RIC_FLUSH_PWC 1
> > > +#define RIC_FLUSH_ALL 2
> > > +
> > >  struct vm_area_struct;
> > >  struct mm_struct;
> > >  struct mmu_gather;
> > > diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
> > > b/arch/powerpc/include/asm/kvm_book3s.h
> > > index 2f5f919f6cd3..a1515f94400e 100644
> > > --- a/arch/powerpc/include/asm/kvm_book3s.h
> > > +++ b/arch/powerpc/include/asm/kvm_book3s.h
> > > @@ -305,6 +305,9 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, 
> > > u64 dw1);
> > >  void kvmhv_release_all_nested(struct kvm *kvm);
> > >  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
> > >  long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
> > > +long kvmhv_h_rpti_nested(struct kvm_vcpu *vcpu, unsigned long lpid,
> > > +  unsigned long type, unsigned long pg_sizes,
> > > +  unsigned long start, unsigned long end);
> > >  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
> > > u64 time_limit, unsigned long lpcr);
> > >  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state 
> > > *hr);
> > > diff --git a/arch/powerpc/include/asm/mmu_context.h 
> > > b/arch/powerpc/include/asm/mmu_context.h
> > > index 652ce85f9410..820caf4e01b7 100644
> > > --- a/arch/powerpc/include/asm/mmu_context.h
> > > +++ b/arch/powerpc/include/asm/mmu_context.h
> > > @@ -124,8 +124,19 @@ static inline bool need_extra_context(struct 
> > > mm_struct *mm, unsigned long ea)
> > >  
> > >  #if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && 
> > > defined(CONFIG_PPC_RADIX_MMU)
> > >  extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
> > > +void do_h_rpt_invalidate(unsigned long pid, unsigned long lpid,
> > > +  unsigned long type, unsigned long page_size,
> > > + 

Re: [PATCH v5 1/3] powerpc/book3s64/radix: Add H_RPT_INVALIDATE pgsize encodings to mmu_psize_def

2021-03-03 Thread David Gibson
On Tue, Mar 02, 2021 at 09:51:28AM +0530, Bharata B Rao wrote:
> On Tue, Mar 02, 2021 at 12:28:34PM +1100, David Gibson wrote:
> > On Wed, Feb 24, 2021 at 01:55:08PM +0530, Bharata B Rao wrote:
> > > Add a field to mmu_psize_def to store the page size encodings
> > > of H_RPT_INVALIDATE hcall. Initialize this while scanning the radix
> > > AP encodings. This will be used when invalidating with required
> > > page size encoding in the hcall.
> > > 
> > > Signed-off-by: Bharata B Rao 
> > > ---
> > >  arch/powerpc/include/asm/book3s/64/mmu.h | 1 +
> > >  arch/powerpc/mm/book3s64/radix_pgtable.c | 5 +
> > >  2 files changed, 6 insertions(+)
> > > 
> > > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
> > > b/arch/powerpc/include/asm/book3s/64/mmu.h
> > > index eace8c3f7b0a..c02f42d1031e 100644
> > > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> > > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> > > @@ -19,6 +19,7 @@ struct mmu_psize_def {
> > >   int penc[MMU_PAGE_COUNT];   /* HPTE encoding */
> > >   unsigned inttlbiel; /* tlbiel supported for that page size */
> > >   unsigned long   avpnm;  /* bits to mask out in AVPN in the HPTE */
> > > + unsigned long   h_rpt_pgsize; /* H_RPT_INVALIDATE page size encoding */
> > >   union {
> > >   unsigned long   sllp;   /* SLB L||LP (exact mask to use in 
> > > slbmte) */
> > >   unsigned long ap;   /* Ap encoding used by PowerISA 3.0 */
> > > diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> > > b/arch/powerpc/mm/book3s64/radix_pgtable.c
> > > index 98f0b243c1ab..1b749899016b 100644
> > > --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> > > +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> > > @@ -486,6 +486,7 @@ static int __init radix_dt_scan_page_sizes(unsigned 
> > > long node,
> > >   def = &mmu_psize_defs[idx];
> > >   def->shift = shift;
> > >   def->ap  = ap;
> > > + def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
> > >   }
> > >  
> > >   /* needed ? */
> > > @@ -560,9 +561,13 @@ void __init radix__early_init_devtree(void)
> > >*/
> > >   mmu_psize_defs[MMU_PAGE_4K].shift = 12;
> > >   mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
> > > + mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
> > > + psize_to_rpti_pgsize(MMU_PAGE_4K);
> > 
> > Hm.  TBH, I was thinking of this as replacing psize_to_rpti_pgsize() -
> > that is, you directly put the correct codes in there, then just have
> > psize_to_rpti_pgsize() look them up in the table.
> > 
> > I guess that could be a followup change, though.
> > 
> > >  
> > >   mmu_psize_defs[MMU_PAGE_64K].shift = 16;
> > >   mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
> > > + mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
> > > + psize_to_rpti_pgsize(MMU_PAGE_64K);
> 
> Hmm if you see I got rid of rpti_pgsize_to_psize() by having the
> defines directly in mmu_psize_def[].

I realize that, but I'm talking about the reverse direction:
psize_to_rpti_pgsize().  You should be able to reduce it to a table
lookup, so the mmu_psize_defs table is the only place this information
exists.
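
i.e. something like this (sketch), once mmu_psize_defs[] carries the
encoding:

	static inline unsigned long psize_to_rpti_pgsize(unsigned long psize)
	{
		return mmu_psize_defs[psize].h_rpt_pgsize;
	}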

> There are two cases in the above code (radix__early_init_devtree)
> 
> 1. If radix pagesize encodings are present in the DT, we walk
> the page sizes in the loop and populate the enconding for
> H_RPT_INVALIDATE. I am not sure if we can use the direct codes
> in this case.

I'm not understanding the problem.

In any case the existing implementation of psize

Why ever not?  You can just update the mmu_psize_defs when you parse
the device tree.  Plus AFAICT, the existing psize_to_rpti
implementation doesn't take into account any device tree encodings.

> 2. If DT doesn't have the radix pagesize encodings, 4K and 64K
> sizes are assumed as fallback sizes where we can use direct
> encodings.

Right... still not seeing the problem.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 1/3] powerpc/book3s64/radix: Add H_RPT_INVALIDATE pgsize encodings to mmu_psize_def

2021-03-01 Thread David Gibson
On Wed, Feb 24, 2021 at 01:55:08PM +0530, Bharata B Rao wrote:
> Add a field to mmu_psize_def to store the page size encodings
> of H_RPT_INVALIDATE hcall. Initialize this while scanning the radix
> AP encodings. This will be used when invalidating with required
> page size encoding in the hcall.
> 
> Signed-off-by: Bharata B Rao 
> ---
>  arch/powerpc/include/asm/book3s/64/mmu.h | 1 +
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 5 +
>  2 files changed, 6 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
> b/arch/powerpc/include/asm/book3s/64/mmu.h
> index eace8c3f7b0a..c02f42d1031e 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> @@ -19,6 +19,7 @@ struct mmu_psize_def {
>   int penc[MMU_PAGE_COUNT];   /* HPTE encoding */
>   unsigned inttlbiel; /* tlbiel supported for that page size */
>   unsigned long   avpnm;  /* bits to mask out in AVPN in the HPTE */
> + unsigned long   h_rpt_pgsize; /* H_RPT_INVALIDATE page size encoding */
>   union {
>   unsigned long   sllp;   /* SLB L||LP (exact mask to use in 
> slbmte) */
>   unsigned long ap;   /* Ap encoding used by PowerISA 3.0 */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 98f0b243c1ab..1b749899016b 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -486,6 +486,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long 
> node,
>   def = &mmu_psize_defs[idx];
>   def->shift = shift;
>   def->ap  = ap;
> + def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
>   }
>  
>   /* needed ? */
> @@ -560,9 +561,13 @@ void __init radix__early_init_devtree(void)
>*/
>   mmu_psize_defs[MMU_PAGE_4K].shift = 12;
>   mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
> + mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
> + psize_to_rpti_pgsize(MMU_PAGE_4K);

Hm.  TBH, I was thinking of this as replacing psize_to_rpti_pgsize() -
that is, you directly put the correct codes in there, then just have
psize_to_rpti_pgsize() look them up in the table.

I guess that could be a followup change, though.

>  
>   mmu_psize_defs[MMU_PAGE_64K].shift = 16;
>   mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
> +     mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
> + psize_to_rpti_pgsize(MMU_PAGE_64K);
>   }
>  
>   /*

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 2/3] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE

2021-03-01 Thread David Gibson
> + unsigned long va = ((1UL << 52) - 1);
> +
> + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
> + asm volatile("ptesync" : : : "memory");
> + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
> + }
> +
> + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
> + asm volatile("ptesync" : : : "memory");
> + __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
> + RIC_FLUSH_TLB);
> + }
> +}
>  
>  static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
>  unsigned long ap)
> @@ -344,6 +407,31 @@ static inline void _tlbie_pid(unsigned long pid, 
> unsigned long ric)
>   asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
> +unsigned long ric)
> +{
> + asm volatile("ptesync" : : : "memory");
> +
> + /*
> +  * Workaround the fact that the "ric" argument to __tlbie_pid
> +  * must be a compile-time contraint to match the "i" constraint
> +  * in the asm statement.
> +  */
> + switch (ric) {
> + case RIC_FLUSH_TLB:
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
> + fixup_tlbie_pid_lpid(pid, lpid);
> + break;
> + case RIC_FLUSH_PWC:
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> + break;
> + case RIC_FLUSH_ALL:
> + default:
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
> + fixup_tlbie_pid_lpid(pid, lpid);
> + }
> + asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +}
>  struct tlbiel_pid {
>   unsigned long pid;
>   unsigned long ric;
> @@ -469,6 +557,20 @@ static inline void __tlbie_va_range(unsigned long start, 
> unsigned long end,
>   fixup_tlbie_va_range(addr - page_size, pid, ap);
>  }
>  
> +static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long 
> end,
> +  unsigned long pid, unsigned long lpid,
> +  unsigned long page_size,
> +  unsigned long psize)
> +{
> + unsigned long addr;
> + unsigned long ap = mmu_get_ap(psize);
> +
> + for (addr = start; addr < end; addr += page_size)
> + __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
> +
> + fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
> +}
> +
>  static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
> unsigned long psize, unsigned long ric)
>  {
> @@ -549,6 +651,18 @@ static inline void _tlbie_va_range(unsigned long start, 
> unsigned long end,
>   asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long 
> end,
> + unsigned long pid, unsigned long lpid,
> + unsigned long page_size,
> + unsigned long psize, bool also_pwc)
> +{
> + asm volatile("ptesync" : : : "memory");
> + if (also_pwc)
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> + __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
> + asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +}
> +
>  static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
>   unsigned long start, unsigned long end,
>   unsigned long pid, unsigned long page_size,
> @@ -1381,4 +1495,29 @@ extern void radix_kvm_prefetch_workaround(struct 
> mm_struct *mm)
>   }
>  }
>  EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
> +
> +/*
> + * Process-scoped invalidations for a given LPID.
> + */
> +void do_h_rpt_invalidate(unsigned long pid, unsigned long lpid,
> +  unsigned long type, unsigned long page_size,
> +  unsigned long psize, unsigned long start,
> +  unsigned long end)
> +{
> + if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) {
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
> + return;
> + }
> +
> + if (type & H_RPTI_TYPE_PWC)
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> +
> + if (!start && end == -1) /* PID */
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
> + else /* EA */
> + _tlbie_va_range_lpid(start, end, pid, lpid, page_size,
> +  psize, false);
> +}
> +EXPORT_SYMBOL_GPL(do_h_rpt_invalidate);
> +
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 8b281f722e5b..f8c84a62e8f3 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_DIRTY_LOG_RING 192
>  #define KVM_CAP_X86_BUS_LOCK_EXIT 193
>  #define KVM_CAP_PPC_DAWR1 194
> +#define KVM_CAP_PPC_RPT_INVALIDATE 195
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v4 2/3] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE

2021-02-22 Thread David Gibson
On Mon, Feb 22, 2021 at 12:16:08PM +0530, Bharata B Rao wrote:
> On Wed, Feb 17, 2021 at 11:38:07AM +1100, David Gibson wrote:
> > On Mon, Feb 15, 2021 at 12:05:41PM +0530, Bharata B Rao wrote:
> > > Implement H_RPT_INVALIDATE hcall and add KVM capability
> > > KVM_CAP_PPC_RPT_INVALIDATE to indicate the support for the same.
> > > 
> > > This hcall does two types of TLB invalidations:
> > > 
> > > 1. Process-scoped invalidations for guests with LPCR[GTSE]=0.
> > >This is currently not used in KVM as GTSE is not usually
> > >disabled in KVM.
> > > 2. Partition-scoped invalidations that an L1 hypervisor does on
> > >behalf of an L2 guest. This replaces the uses of the existing
> > >hcall H_TLB_INVALIDATE.
> > > 
> > > In order to handle process scoped invalidations of L2, we
> > > intercept the nested exit handling code in L0 only to handle
> > > H_TLB_INVALIDATE hcall.
> > > 
> > > Signed-off-by: Bharata B Rao 
> > > ---
> > >  Documentation/virt/kvm/api.rst | 17 +
> > >  arch/powerpc/include/asm/kvm_book3s.h  |  3 +
> > >  arch/powerpc/include/asm/mmu_context.h | 11 +++
> > >  arch/powerpc/kvm/book3s_hv.c   | 91 
> > >  arch/powerpc/kvm/book3s_hv_nested.c| 96 ++
> > >  arch/powerpc/kvm/powerpc.c |  3 +
> > >  arch/powerpc/mm/book3s64/radix_tlb.c   | 25 +++
> > >  include/uapi/linux/kvm.h   |  1 +
> > >  8 files changed, 247 insertions(+)
> > > 
> > > diff --git a/Documentation/virt/kvm/api.rst 
> > > b/Documentation/virt/kvm/api.rst
> > > index 99ceb978c8b0..416c36aa35d4 100644
> > > --- a/Documentation/virt/kvm/api.rst
> > > +++ b/Documentation/virt/kvm/api.rst
> > > @@ -6038,6 +6038,23 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit 
> > > notifications which user space
> > >  can then handle to implement model specific MSR handling and/or user 
> > > notifications
> > >  to inform a user that an MSR was not handled.
> > >  
> > > +7.22 KVM_CAP_PPC_RPT_INVALIDATE
> > > +-------------------------------
> > > +
> > > +:Capability: KVM_CAP_PPC_RPT_INVALIDATE
> > > +:Architectures: ppc
> > > +:Type: vm
> > > +
> > > +This capability indicates that the kernel is capable of handling
> > > +H_RPT_INVALIDATE hcall.
> > > +
> > > +In order to enable the use of H_RPT_INVALIDATE in the guest,
> > > +user space might have to advertise it for the guest. For example,
> > > +IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is
> > > +present in the "ibm,hypertas-functions" device-tree property.
> > > +
> > > +This capability is always enabled.
> > 
> > I guess that means it's always enabled when it's available - I'm
> > pretty sure it won't be enabled on POWER8 or on PR KVM.
> 
> Correct, will reword this and restrict this to POWER9, radix etc
> 
> > 
> > > +
> > >  8. Other capabilities.
> > >  ==
> > >  
> > > diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
> > > b/arch/powerpc/include/asm/kvm_book3s.h
> > > index d32ec9ae73bd..0f1c5fa6e8ce 100644
> > > --- a/arch/powerpc/include/asm/kvm_book3s.h
> > > +++ b/arch/powerpc/include/asm/kvm_book3s.h
> > > @@ -298,6 +298,9 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, 
> > > u64 dw1);
> > >  void kvmhv_release_all_nested(struct kvm *kvm);
> > >  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
> > >  long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
> > > +long kvmhv_h_rpti_nested(struct kvm_vcpu *vcpu, unsigned long lpid,
> > > +  unsigned long type, unsigned long pg_sizes,
> > > +  unsigned long start, unsigned long end);
> > >  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
> > > u64 time_limit, unsigned long lpcr);
> > >  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state 
> > > *hr);
> > > diff --git a/arch/powerpc/include/asm/mmu_context.h 
> > > b/arch/powerpc/include/asm/mmu_context.h
> > > index d5821834dba9..fbf3b5b45fe9 100644
> > > --- a/arch/powerpc/include/asm/mmu_context.h
> > > +++ b/arch/powerpc/include/asm/mmu_context.h
> > > @@ -124,8 +124,19 @@ static inline bool need_extra_context(struct 
> >

Re: [PATCH v4 1/3] powerpc/book3s64/radix/tlb: tlbie primitives for process-scoped invalidations from guests

2021-02-16 Thread David Gibson
ed long ric)
>  {
> @@ -233,6 +261,22 @@ static inline void fixup_tlbie_va_range(unsigned long 
> va, unsigned long pid,
>   }
>  }
>  
> +static inline void fixup_tlbie_va_range_lpid(unsigned long va,
> +  unsigned long pid,
> +  unsigned long lpid,
> +  unsigned long ap)
> +{
> + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
> + asm volatile("ptesync" : : : "memory");
> + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
> + }
> +
> + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
> + asm volatile("ptesync" : : : "memory");
> + __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
> + }
> +}
> +
>  static inline void fixup_tlbie_pid(unsigned long pid)
>  {
>   /*
> @@ -252,6 +296,25 @@ static inline void fixup_tlbie_pid(unsigned long pid)
>   }
>  }
>  
> +static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long 
> lpid)
> +{
> + /*
> +  * We can use any address for the invalidation, pick one which is
> +  * probably unused as an optimisation.
> +  */
> + unsigned long va = ((1UL << 52) - 1);
> +
> + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
> + asm volatile("ptesync" : : : "memory");
> + __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
> + }
> +
> + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
> + asm volatile("ptesync" : : : "memory");
> + __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
> + RIC_FLUSH_TLB);
> + }
> +}
>  
>  static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
>  unsigned long ap)
> @@ -342,6 +405,31 @@ static inline void _tlbie_pid(unsigned long pid, 
> unsigned long ric)
>   asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
> +unsigned long ric)
> +{
> + asm volatile("ptesync" : : : "memory");
> +
> + /*
> +  * Workaround the fact that the "ric" argument to __tlbie_pid
> +  * must be a compile-time constraint to match the "i" constraint
> +  * in the asm statement.
> +  */
> + switch (ric) {
> + case RIC_FLUSH_TLB:
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
> + fixup_tlbie_pid_lpid(pid, lpid);
> + break;
> + case RIC_FLUSH_PWC:
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> + break;
> + case RIC_FLUSH_ALL:
> + default:
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
> + fixup_tlbie_pid_lpid(pid, lpid);
> + }
> + asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +}
>  struct tlbiel_pid {
>   unsigned long pid;
>   unsigned long ric;
> @@ -467,6 +555,20 @@ static inline void __tlbie_va_range(unsigned long start, 
> unsigned long end,
>   fixup_tlbie_va_range(addr - page_size, pid, ap);
>  }
>  
> +static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long 
> end,
> +  unsigned long pid, unsigned long lpid,
> +  unsigned long page_size,
> +  unsigned long psize)
> +{
> + unsigned long addr;
> + unsigned long ap = mmu_get_ap(psize);
> +
> + for (addr = start; addr < end; addr += page_size)
> + __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
> +
> + fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
> +}
> +
>  static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
> unsigned long psize, unsigned long ric)
>  {
> @@ -547,6 +649,18 @@ static inline void _tlbie_va_range(unsigned long start, 
> unsigned long end,
>   asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long 
> end,
> + unsigned long pid, unsigned long lpid,
> + unsigned long page_size,
> + unsigned long psize, bool also_pwc)
> +{
> + asm volatile("ptesync" : : : "memory");
> + if (also_pwc)
> + __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> + __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
> + asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +}
> +
>  static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
>   unsigned long start, unsigned long end,
>   unsigned long pid, unsigned long page_size,

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH kernel 2/2] powerpc/iommu: Do not immediately panic when failed IOMMU table allocation

2021-02-16 Thread David Gibson
On Tue, Feb 16, 2021 at 02:33:07PM +1100, Alexey Kardashevskiy wrote:
> Most platforms allocate IOMMU table structures (specifically it_map)
> at the boot time and when this fails - it is a valid reason for panic().
> 
> However the powernv platform allocates it_map after a device is returned
> to the host OS after being passed through and this happens long after
> the host OS booted. It is quite possible to trigger the it_map allocation
> panic() and kill the host even though it is not necessary - the host OS
> can still use the DMA bypass mode (requires a tiny fraction of it_map's
> memory) and even if that fails, the host OS is runnable as it was without
> the device for which allocating it_map causes the panic.
> 
> Instead of immediately crashing in a powernv/ioda2 system, this prints
> an error and continues. All other platforms still call panic().
> 
> Signed-off-by: Alexey Kardashevskiy 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/kernel/iommu.c   |  6 --
>  arch/powerpc/platforms/cell/iommu.c   |  3 ++-
>  arch/powerpc/platforms/pasemi/iommu.c |  4 +++-
>  arch/powerpc/platforms/powernv/pci-ioda.c | 15 ---
>  arch/powerpc/platforms/pseries/iommu.c| 10 +++---
>  arch/powerpc/sysdev/dart_iommu.c  |  3 ++-
>  6 files changed, 26 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 8eb6eb0afa97..c1a5c366a664 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -728,8 +728,10 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid,
>   sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
>  
>   tbl->it_map = vzalloc_node(sz, nid);
> - if (!tbl->it_map)
> - panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
> + if (!tbl->it_map) {
> + pr_err("%s: Can't allocate %ld bytes\n", __func__, sz);
> + return NULL;
> + }
>  
>   iommu_table_reserve_pages(tbl, res_start, res_end);
>  
> diff --git a/arch/powerpc/platforms/cell/iommu.c 
> b/arch/powerpc/platforms/cell/iommu.c
> index 2124831cf57c..fa08699aedeb 100644
> --- a/arch/powerpc/platforms/cell/iommu.c
> +++ b/arch/powerpc/platforms/cell/iommu.c
> @@ -486,7 +486,8 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct 
> device_node *np,
>   window->table.it_size = size >> window->table.it_page_shift;
>   window->table.it_ops = &cell_iommu_ops;
>  
> - iommu_init_table(&window->table, iommu->nid, 0, 0);
> + if (!iommu_init_table(&window->table, iommu->nid, 0, 0))
> + panic("Failed to initialize iommu table");
>  
>   pr_debug("\tioid  %d\n", window->ioid);
>   pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
> diff --git a/arch/powerpc/platforms/pasemi/iommu.c 
> b/arch/powerpc/platforms/pasemi/iommu.c
> index b500a6e47e6b..5be7242fbd86 100644
> --- a/arch/powerpc/platforms/pasemi/iommu.c
> +++ b/arch/powerpc/platforms/pasemi/iommu.c
> @@ -146,7 +146,9 @@ static void iommu_table_iobmap_setup(void)
>*/
>   iommu_table_iobmap.it_blocksize = 4;
>   iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
> - iommu_init_table(&iommu_table_iobmap, 0, 0, 0);
> + if (!iommu_init_table(&iommu_table_iobmap, 0, 0, 0))
> + panic("Failed to initialize iommu table");
> +
>   pr_debug(" <- %s\n", __func__);
>  }
>  
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index f0f901683a2f..66c3c3337334 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1762,7 +1762,8 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb 
> *phb,
>   tbl->it_ops = &pnv_ioda1_iommu_ops;
>   pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
>   pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
> - iommu_init_table(tbl, phb->hose->node, 0, 0);
> + if (!iommu_init_table(tbl, phb->hose->node, 0, 0))
> + panic("Failed to initialize iommu table");
>  
>   pe->dma_setup_done = true;
>   return;
> @@ -1930,16 +1931,16 @@ static long pnv_pci_ioda2_setup_default_config(struct 
> pnv_ioda_pe *pe)
>   res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
>   res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
>   }
> - iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
>  
>

Re: [PATCH v4 2/3] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE

2021-02-16 Thread David Gibson
PAGE_1G) {
> + psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_1G);
> + ap = mmu_get_ap(psize);
> +
> + ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
> +(1UL << 30),
> +ap, start, end);
> + if (ret)
> + return H_P4;
> + }

Again it might be more elegant to step through the pagesizes from the
mmu_psize_defs side, rather than from the pg_sizes side.
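
[Purely for illustration, not code from the patch under review: stepping
through from the mmu_psize_defs side might look roughly like the sketch
below.  It assumes a hypothetical psize_to_rpti_pgsize() helper doing the
reverse mapping of rpti_pgsize_to_psize(); psize, ret and the other locals
are as in the surrounding function.]

	for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
		/* skip page sizes this MMU does not support */
		if (!mmu_psize_defs[psize].shift)
			continue;
		/* skip page sizes the caller did not ask for */
		if (!(pg_sizes & psize_to_rpti_pgsize(psize)))
			continue;
		ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
				1UL << mmu_psize_defs[psize].shift,
				mmu_get_ap(psize), start, end);
		if (ret)
			return H_P4;
	}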

> + }
> + return H_SUCCESS;
> +}
> +
>  /* Used to convert a nested guest real address to a L1 guest real address */
>  static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
>  struct kvm_nested_guest *gp,
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index cf52d26f49cd..5388cd4a206a 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -678,6 +678,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
> ext)
>   r = hv_enabled && kvmppc_hv_ops->enable_svm &&
>   !kvmppc_hv_ops->enable_svm(NULL);
>   break;
> + case KVM_CAP_PPC_RPT_INVALIDATE:
> + r = 1;
> + break;
>  #endif
>   default:
>   r = 0;
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
> b/arch/powerpc/mm/book3s64/radix_tlb.c
> index 097402435303..4f746d34b420 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -1400,4 +1400,29 @@ extern void radix_kvm_prefetch_workaround(struct 
> mm_struct *mm)
>   }
>  }
>  EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
> +
> +/*
> + * Process-scoped invalidations for a given LPID.
> + */
> +void do_h_rpt_invalidate(unsigned long pid, unsigned long lpid,
> +  unsigned long type, unsigned long page_size,
> +  unsigned long psize, unsigned long start,
> +  unsigned long end)
> +{
> + if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) {
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
> + return;
> + }
> +
> + if (type & H_RPTI_TYPE_PWC)
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> +
> + if (!start && end == -1) /* PID */
> + _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
> + else /* EA */
> + _tlbie_va_range_lpid(start, end, pid, lpid, page_size,
> +  psize, false);
> +}
> +EXPORT_SYMBOL_GPL(do_h_rpt_invalidate);
> +
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 374c67875cdb..6fd530fae452 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1058,6 +1058,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
>  #define KVM_CAP_SYS_HYPERV_CPUID 191
>  #define KVM_CAP_DIRTY_LOG_RING 192
> +#define KVM_CAP_PPC_RPT_INVALIDATE 193
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH kernel 1/2] powerpc/iommu: Allocate it_map by vmalloc

2021-02-16 Thread David Gibson
On Tue, Feb 16, 2021 at 02:33:06PM +1100, Alexey Kardashevskiy wrote:
> The IOMMU table uses the it_map bitmap to keep track of allocated DMA
> pages. This has always been a contiguous array allocated at either
> the boot time or when a passed through device is returned to the host OS.
> The it_map memory is allocated by alloc_pages() which allocates
> contiguous physical memory.
> 
> Such allocation method occasionally creates a problem when there is
> no big chunk of memory available (no free memory or too fragmented).
> On powernv/ioda2 the default DMA window requires 16MB for it_map.
> 
> This replaces alloc_pages_node() with vzalloc_node() which allocates
> contiguous block but in virtual memory. This should reduce chances of
> failure but should not cause other behavioral changes as it_map is only
> used by the kernel's DMA hooks/api when MMU is on.
> 
> Signed-off-by: Alexey Kardashevskiy 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/kernel/iommu.c | 15 +++
>  1 file changed, 3 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index c00214a4355c..8eb6eb0afa97 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -719,7 +719,6 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid,
>  {
>   unsigned long sz;
>   static int welcomed = 0;
> - struct page *page;
>   unsigned int i;
>   struct iommu_pool *p;
>  
> @@ -728,11 +727,9 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid,
>   /* number of bytes needed for the bitmap */
>   sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
>  
> - page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
> - if (!page)
> + tbl->it_map = vzalloc_node(sz, nid);
> + if (!tbl->it_map)
>   panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
> - tbl->it_map = page_address(page);
> - memset(tbl->it_map, 0, sz);
>  
>   iommu_table_reserve_pages(tbl, res_start, res_end);
>  
> @@ -774,8 +771,6 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid,
>  
>  static void iommu_table_free(struct kref *kref)
>  {
> - unsigned long bitmap_sz;
> - unsigned int order;
>   struct iommu_table *tbl;
>  
>   tbl = container_of(kref, struct iommu_table, it_kref);
> @@ -796,12 +791,8 @@ static void iommu_table_free(struct kref *kref)
>   if (!bitmap_empty(tbl->it_map, tbl->it_size))
>   pr_warn("%s: Unexpected TCEs\n", __func__);
>  
> - /* calculate bitmap size in bytes */
> - bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
> -
>   /* free bitmap */
> - order = get_order(bitmap_sz);
> - free_pages((unsigned long) tbl->it_map, order);
> + vfree(tbl->it_map);
>  
>   /* free table */
>   kfree(tbl);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async hcalls at the drc level

2021-02-07 Thread David Gibson
On Tue, Jan 19, 2021 at 12:40:31PM +0530, Shivaprasad G Bhat wrote:
> Thanks for the comments!
> 
> 
> On 12/28/20 2:08 PM, David Gibson wrote:
> 
> > On Mon, Dec 21, 2020 at 01:08:53PM +0100, Greg Kurz wrote:
> ...
> > > The overall idea looks good but I think you should consider using
> > > a thread pool to implement it. See below.
> > I am not convinced, however.  Specifically, attaching this to the DRC
> > doesn't make sense to me.  We're adding exactly one DRC related async
> > hcall, and I can't really see much call for another one.  We could
> > have other async hcalls - indeed we already have one for HPT resizing
> > - but attaching this to DRCs doesn't help for those.
> 
> The semantics of the hcall made me think, if this is going to be
> re-usable for future if implemented at DRC level.

It would only be re-usable for operations that are actually connected
to DRCs.  It doesn't seem to me particularly likely that we'll ever
have more asynchronous hcalls that are also associated with DRCs.

> Other option
> is to move the async-hcall-state/list into the NVDIMMState structure
> in include/hw/mem/nvdimm.h and handle it with machine->nvdimms_state
> at a global level.

I'm ok with either of two options:

A) Implement this ad-hoc for this specific case, making whatever
simplifications you can based on this specific case.

B) Implement a general mechanism for async hcalls that is *not* tied
to DRCs.  Then use that for the existing H_RESIZE_HPT_PREPARE call as
well as this new one.

> Hope you are okay with using the pool based approach that Greg

Honestly a thread pool seems like it might be overkill for this
application.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async hcalls at the drc level

2020-12-28 Thread David Gibson
>  static void spapr_dr_connector_class_init(ObjectClass *k, void *data)
> > diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> > index 165b281496..77f6e4386c 100644
> > --- a/include/hw/ppc/spapr_drc.h
> > +++ b/include/hw/ppc/spapr_drc.h
> > @@ -18,6 +18,7 @@
> >  #include "sysemu/runstate.h"
> >  #include "hw/qdev-core.h"
> >  #include "qapi/error.h"
> > +#include "block/thread-pool.h"
> >  
> >  #define TYPE_SPAPR_DR_CONNECTOR "spapr-dr-connector"
> >  #define SPAPR_DR_CONNECTOR_GET_CLASS(obj) \
> > @@ -168,6 +169,21 @@ typedef enum {
> >  SPAPR_DRC_STATE_PHYSICAL_CONFIGURED = 8,
> >  } SpaprDrcState;
> >  
> > +typedef struct SpaprDrc SpaprDrc;
> > +
> > +typedef int SpaprDrcAsyncHcallWorkerFunc(void *opaque);
> > +typedef struct SpaprDrcDeviceAsyncHCallState {
> > +uint64_t continue_token;
> > +bool pending;
> > +
> > +int hcall_ret;
> > +SpaprDrcAsyncHcallWorkerFunc *func;
> > +void *data;
> > +
> > +QemuThread thread;
> > +
> > +QLIST_ENTRY(SpaprDrcDeviceAsyncHCallState) node;
> > +} SpaprDrcDeviceAsyncHCallState;
> >  typedef struct SpaprDrc {
> >  /*< private >*/
> >  DeviceState parent;
> > @@ -182,6 +198,10 @@ typedef struct SpaprDrc {
> >  int ccs_offset;
> >  int ccs_depth;
> >  
> > +/* async hcall states */
> > +QemuMutex async_hcall_states_lock;
> > +QLIST_HEAD(, SpaprDrcDeviceAsyncHCallState) async_hcall_states;
> > +
> >  /* device pointer, via link property */
> >  DeviceState *dev;
> >  bool unplug_requested;
> > @@ -241,6 +261,11 @@ void spapr_drc_detach(SpaprDrc *drc);
> >  /* Returns true if a hot plug/unplug request is pending */
> >  bool spapr_drc_transient(SpaprDrc *drc);
> >  
> > +uint64_t spapr_drc_get_new_async_hcall_token(SpaprDrc *drc);
> > +void spapr_drc_run_async_hcall(SpaprDrc *drc, uint64_t token,
> > +   SpaprDrcAsyncHcallWorkerFunc, void *data);
> > +int spapr_drc_get_async_hcall_status(SpaprDrc *drc, uint64_t token);
> > +
> >  static inline bool spapr_drc_unplug_requested(SpaprDrc *drc)
> >  {
> >  return drc->unplug_requested;
> > 
> > 
> > 
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 1/2] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE

2020-12-16 Thread David Gibson
i_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_1G);
> + ap = mmu_get_ap(psize);
> +
> + ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
> +(1UL << 30),
> +ap, start, end);
> + if (ret)
> + return H_P4;
> + }
> + }
> + return H_SUCCESS;
> +}
> +
>  /* Used to convert a nested guest real address to a L1 guest real address */
>  static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
>  struct kvm_nested_guest *gp,
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 13999123b735..172a89187116 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -678,6 +678,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
> ext)
>   r = hv_enabled && kvmppc_hv_ops->enable_svm &&
>   !kvmppc_hv_ops->enable_svm(NULL);
>   break;
> + case KVM_CAP_PPC_RPT_INVALIDATE:
> + r = 1;
> + break;
>  #endif
>   default:
>   r = 0;
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
> b/arch/powerpc/mm/book3s64/radix_tlb.c
> index b487b489d4b6..3a2b12d1d49b 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -18,10 +18,6 @@
>  #include 
>  #include 
>  
> -#define RIC_FLUSH_TLB 0
> -#define RIC_FLUSH_PWC 1
> -#define RIC_FLUSH_ALL 2
> -
>  /*
>   * tlbiel instruction for radix, set invalidation
>   * i.e., r=1 and is=01 or is=10 or is=11
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index ca41220b40b8..c9ece825299e 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1053,6 +1053,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_X86_USER_SPACE_MSR 188
>  #define KVM_CAP_X86_MSR_FILTER 189
>  #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
> +#define KVM_CAP_PPC_RPT_INVALIDATE 191
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v2 0/2] Support for H_RPT_INVALIDATE in PowerPC KVM

2020-12-16 Thread David Gibson
On Wed, Dec 16, 2020 at 02:24:45PM +0530, Bharata B Rao wrote:
> This patchset adds support for the new hcall H_RPT_INVALIDATE
> and replaces the nested tlb flush calls with this new hcall
> if support for the same exists.
> 
> Changes in v2:
> -
> - Not enabling the hcall by default now, userspace can enable it when
>   required.
> - Added implementation for process-scoped invalidations in the hcall.
> 
> v1: 
> https://lore.kernel.org/linuxppc-dev/20201019112642.53016-1-bhar...@linux.ibm.com/T/#t
> 
> H_RPT_INVALIDATE
> 
> Syntax:
> int64   /* H_Success: Return code on successful completion */
>     /* H_Busy - repeat the call with the same */
>     /* H_Parameter, H_P2, H_P3, H_P4, H_P5 : Invalid parameters */
>     hcall(const uint64 H_RPT_INVALIDATE, /* Invalidate RPT translation 
> lookaside information */
>   uint64 pid,   /* PID/LPID to invalidate */
>   uint64 target,    /* Invalidation target */
>   uint64 type,  /* Type of lookaside information */
>   uint64 pageSizes, /* Page sizes */
>   uint64 start, /* Start of Effective Address (EA) range 
> (inclusive) */
>   uint64 end)   /* End of EA range (exclusive) */
> 
> Invalidation targets (target)
> -
> Core MMU    0x01 /* All virtual processors in the partition */
> Core local MMU  0x02 /* Current virtual processor */
> Nest MMU    0x04 /* All nest/accelerator agents in use by the partition */
> 
> A combination of the above can be specified, except core and core local.
> 
> Type of translation to invalidate (type)
> ---
> NESTED   0x0001  /* Invalidate nested guest partition-scope */
> TLB  0x0002  /* Invalidate TLB */
> PWC  0x0004  /* Invalidate Page Walk Cache */
> PRT  0x0008  /* Invalidate Process Table Entries if NESTED is clear */
> PAT  0x0008  /* Invalidate Partition Table Entries if NESTED is set */
> 
> A combination of the above can be specified.
> 
> Page size mask (pageSizes)
> --
> 4K  0x01
> 64K 0x02
> 2M  0x04
> 1G  0x08
> All sizes   (-1UL)

PAPR really has a real talent for tying its own shoelaces together.
They could have just made the bit for each pagesize be... the size of
the page, but why use something obviously extensible to any future
pagesizes when we can make it both less flexible and more complicated
to deal with.  Sigh.
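
[To make that concrete (an editorial illustration; the H_RPTI_PAGE_* names
and values follow the spec quoted above and the patch code, the alternative
encoding is purely hypothetical): the mask as defined needs a new,
arbitrarily assigned bit for every page size that comes along,

	#define H_RPTI_PAGE_4K	0x01
	#define H_RPTI_PAGE_64K	0x02
	#define H_RPTI_PAGE_2M	0x04
	#define H_RPTI_PAGE_1G	0x08

whereas the self-describing encoding being wished for here, say setting the
bit corresponding to the page size itself,

	pg_sizes |= 1UL << page_shift;	/* 4K -> 0x1000, 64K -> 0x10000, ... */

would cover any future power-of-two page size without touching the
definitions again.]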

> 
> A combination of the above can be specified.
> All page sizes can be selected with -1.
> 
> Semantics: Invalidate radix tree lookaside information
>    matching the parameters given.
> * Return H_P2, H_P3 or H_P4 if target, type, or pageSizes parameters are
>   different from the defined values.
> * Return H_PARAMETER if NESTED is set and pid is not a valid nested
>   LPID allocated to this partition
> * Return H_P5 if (start, end) doesn't form a valid range. Start and end
>   should be a valid Quadrant address and  end > start.
> * Return H_NotSupported if the partition is not running in radix
>   translation mode.
> * May invalidate more translation information than requested.
> * If start = 0 and end = -1, set the range to cover all valid addresses.
>   Else start and end should be aligned to 4kB (lower 11 bits clear).
> * If NESTED is clear, then invalidate process scoped lookaside information.
>   Else pid specifies a nested LPID, and the invalidation is performed
>   on nested guest partition table and nested guest partition scope real
>   addresses.
> * If pid = 0 and NESTED is clear, then valid addresses are quadrant 3 and
>   quadrant 0 spaces, Else valid addresses are quadrant 0.
> * Pages which are fully covered by the range are to be invalidated.
>   Those which are partially covered are considered outside invalidation
>   range, which allows a caller to optimally invalidate ranges that may
>   contain mixed page sizes.
> * Return H_SUCCESS on success.
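
[Editorial aside, an illustrative invocation only.  Assuming H_RPTI_*
constant names for the target, type and pageSizes values listed above, and
the usual plpar_hcall_norets() wrapper, an L1 hypervisor flushing all
nested partition-scoped TLB and page-walk-cache entries for nested LPID
0x42, over all page sizes and the whole address range, would issue
something like:

	rc = plpar_hcall_norets(H_RPT_INVALIDATE, 0x42,
				H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU,
				H_RPTI_TYPE_NESTED | H_RPTI_TYPE_TLB |
				H_RPTI_TYPE_PWC,
				H_RPTI_PAGE_ALL, 0, -1UL);
]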
> 
> Bharata B Rao (2):
>   KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE
>   KVM: PPC: Book3S HV: Use H_RPT_INVALIDATE in nested KVM
> 
>  Documentation/virt/kvm/api.rst|  17 +++
>  .../include/asm/book3s/64/tlbflush-radix.h|  18 +++
>  arch/powerpc/include/asm/kvm_book3s.h |   3 +
>  arch/powerpc/kvm/book3s_64_mmu_radix.c|  27 +++-
>  arch/powerpc/kvm/book3s_hv.c  | 121 ++
>  arch/powerpc/kvm/book3s_hv_nested.c   | 106 ++-
>  arch/powerpc/kvm/powerpc.c|   3 +
>  arch/powerpc/mm/book3s64/radix_tlb.

Re: [PATCH v1 1/2] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE (nested case only)

2020-12-14 Thread David Gibson
On Fri, Dec 11, 2020 at 04:03:36PM +0530, Bharata B Rao wrote:
> On Mon, Oct 19, 2020 at 04:56:41PM +0530, Bharata B Rao wrote:
> > Implements H_RPT_INVALIDATE hcall and supports only nested case
> > currently.
> > 
> > A KVM capability KVM_CAP_RPT_INVALIDATE is added to indicate the
> > support for this hcall.
> 
> As Paul mentioned in the thread, this hcall does both process scoped
> invalidations and partition scoped invalidations for L2 guest.
> I am adding KVM_CAP_RPT_INVALIDATE capability with only partition
> scoped invalidations (nested case) implemented in the hcall as we
> don't see the need for KVM to implement process scoped invalidation
> function as KVM may never run with LPCR[GTSE]=0.
> 
> I am wondering if enabling the capability with only partial
> implementation of the hcall is the correct thing to do. In future
> if we ever want process scoped invalidations support in this hcall,
> we may not be able to differentiate the availability of two functions
> cleanly from QEMU.

Yeah, it's not ideal.

> So does it make sense to implement the process scoped invalidation
> function also now itself even if it is not going to be used in
> KVM?

That might be a good idea, if it's not excessively difficult.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v1 1/2] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE (nested case only)

2020-12-14 Thread David Gibson
On Fri, Dec 11, 2020 at 04:27:44PM +1100, Paul Mackerras wrote:
> On Fri, Dec 11, 2020 at 12:16:39PM +1100, David Gibson wrote:
> > On Thu, Dec 10, 2020 at 09:54:18AM +0530, Bharata B Rao wrote:
> > > On Wed, Dec 09, 2020 at 03:15:42PM +1100, Paul Mackerras wrote:
> > > > On Mon, Oct 19, 2020 at 04:56:41PM +0530, Bharata B Rao wrote:
> > > > > Implements H_RPT_INVALIDATE hcall and supports only nested case
> > > > > currently.
> > > > > 
> > > > > A KVM capability KVM_CAP_RPT_INVALIDATE is added to indicate the
> > > > > support for this hcall.
> > > > 
> > > > I have a couple of questions about this patch:
> > > > 
> > > > 1. Is this something that is useful today, or is it something that may
> > > > become useful in the future depending on future product plans? In
> > > > other words, what advantage is there to forcing L2 guests to use this
> > > > hcall instead of doing tlbie themselves?
> > > 
> > > H_RPT_INVALIDATE will replace the use of the existing H_TLB_INVALIDATE
> > > for nested partition scoped invalidations. Implementations that want to
> > > off-load invalidations to the host (when GTSE=0) would have to bother
> > > about only one hcall (H_RPT_INVALIDATE)
> > > 
> > > > 
> > > > 2. Why does it need to be added to the default-enabled hcall list?
> > > > 
> > > > There is a concern that if this is enabled by default we could get the
> > > > situation where a guest using it gets migrated to a host that doesn't
> > > > support it, which would be bad.  That is the reason that all new
> > > > things like this are disabled by default and only enabled by userspace
> > > > (i.e. QEMU) in situations where we can enforce that it is available on
> > > > all hosts to which the VM might be migrated.
> > > 
> > > As you suggested privately, I am thinking of falling back to
> > > H_TLB_INVALIDATE in case where this new hcall fails due to not being
> > > present. That should address the migration case that you mention
> > > above. With that and leaving the new hcall enabled by default
> > > is good okay?
> > 
> > No.  Assuming that guests will have some fallback is not how the qemu
> > migration compatibility model works.  If we specify an old machine
> > type, we need to provide the same environment that the older host
> > would have.
> 
> I misunderstood what this patchset is about when I first looked at
> it.  H_RPT_INVALIDATE has two separate functions; one is to do
> process-scoped invalidations for a guest when LPCR[GTSE] = 0 (i.e.,
> when the guest is not permitted to do tlbie itself), and the other is
> to do partition-scoped invalidations that an L1 hypervisor needs to do
> on behalf of an L2 guest.  The second function is a replacement and
> standardization of the existing H_TLB_INVALIDATE which was introduced
> with the nested virtualization code (using a hypercall number from the
> platform-specific range).
> 
> This patchset only implements the second function, not the first.  The
> first function remains unimplemented in KVM at present.
> 
> Given that QEMU will need changes for a guest to be able to exploit
> H_RPT_INVALIDATE (at a minimum, adding a device tree property), it
> doesn't seem onerous for QEMU to have to enable the hcall with
> KVM_CAP_PPC_ENABLE_HCALL.  I think that the control on whether the
> hcall is handled in KVM along with the control on nested hypervisor
> function provides adequate control for QEMU without needing a writable
> capability.  The read-only capability to say whether the hcall exists
> does seem useful.
> 
> Given all that, I'm veering towards taking Bharata's patchset pretty
> much as-is, minus the addition of H_RPT_INVALIDATE to the
> default-enabled set.

Yes, that's fine.  It was only the suggestion that it be on the
default-enabled set that I was objecting to.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v1 1/2] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE (nested case only)

2020-12-10 Thread David Gibson
On Thu, Dec 10, 2020 at 09:54:18AM +0530, Bharata B Rao wrote:
> On Wed, Dec 09, 2020 at 03:15:42PM +1100, Paul Mackerras wrote:
> > On Mon, Oct 19, 2020 at 04:56:41PM +0530, Bharata B Rao wrote:
> > > Implements H_RPT_INVALIDATE hcall and supports only nested case
> > > currently.
> > > 
> > > A KVM capability KVM_CAP_RPT_INVALIDATE is added to indicate the
> > > support for this hcall.
> > 
> > I have a couple of questions about this patch:
> > 
> > 1. Is this something that is useful today, or is it something that may
> > become useful in the future depending on future product plans? In
> > other words, what advantage is there to forcing L2 guests to use this
> > hcall instead of doing tlbie themselves?
> 
> H_RPT_INVALIDATE will replace the use of the existing H_TLB_INVALIDATE
> for nested partition scoped invalidations. Implementations that want to
> off-load invalidations to the host (when GTSE=0) would have to bother
> about only one hcall (H_RPT_INVALIDATE)
> 
> > 
> > 2. Why does it need to be added to the default-enabled hcall list?
> > 
> > There is a concern that if this is enabled by default we could get the
> > situation where a guest using it gets migrated to a host that doesn't
> > support it, which would be bad.  That is the reason that all new
> > things like this are disabled by default and only enabled by userspace
> > (i.e. QEMU) in situations where we can enforce that it is available on
> > all hosts to which the VM might be migrated.
> 
> As you suggested privately, I am thinking of falling back to
> H_TLB_INVALIDATE in case where this new hcall fails due to not being
> present. That should address the migration case that you mention
> above. With that and leaving the new hcall enabled by default
> is good okay?

No.  Assuming that guests will have some fallback is not how the qemu
migration compatibility model works.  If we specify an old machine
type, we need to provide the same environment that the older host
would have.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH] KVM: PPC: Book3S HV: Do not allocate HPT for a nested guest

2020-09-14 Thread David Gibson
On Fri, Sep 11, 2020 at 01:16:07AM -0300, Fabiano Rosas wrote:
> The current nested KVM code does not support HPT guests. This is
> informed/enforced in some ways:
> 
> - Hosts < P9 will not be able to enable the nested HV feature;
> 
> - The nested hypervisor MMU capabilities will not contain
>   KVM_CAP_PPC_MMU_HASH_V3;
> 
> - QEMU reflects the MMU capabilities in the
>   'ibm,arch-vec-5-platform-support' device-tree property;
> 
> - The nested guest, at 'prom_parse_mmu_model' ignores the
>   'disable_radix' kernel command line option if HPT is not supported;
> 
> - The KVM_PPC_CONFIGURE_V3_MMU ioctl will fail if trying to use HPT.
> 
> There is, however, still a way to start a HPT guest by using
> max-compat-cpu=power8 at the QEMU machine options. This leads to the
> guest being set to use hash after QEMU calls the KVM_PPC_ALLOCATE_HTAB
> ioctl.
> 
> With the guest set to hash, the nested hypervisor goes through the
> entry path that has no knowledge of nesting (kvmppc_run_vcpu) and
> crashes when it tries to execute an hypervisor-privileged (mtspr
> HDEC) instruction at __kvmppc_vcore_entry:
> 
> root@L1:~ $ qemu-system-ppc64 -machine pseries,max-cpu-compat=power8 ...
> 
> 
> [  538.543303] CPU: 83 PID: 25185 Comm: CPU 0/KVM Not tainted 5.9.0-rc4 #1
> [  538.543355] NIP:  c0080753f388 LR: c0080753f368 CTR: 
> c01e5ec0
> [  538.543417] REGS: c013e91e33b0 TRAP: 0700   Not tainted  (5.9.0-rc4)
> [  538.543470] MSR:  82843033   CR: 
> 22422882  XER: 2004
> [  538.543546] CFAR: c0080753f4b0 IRQMASK: 3
>GPR00: c008075397a0 c013e91e3640 c0080755e600 
> 8000
>GPR04:  c013eab19800 c01394de 
> 0043a054db72
>GPR08: 003b1652   
> c008075502e0
>GPR12: c01e5ec0 c007ffa74200 c013eab19800 
> 0008
>GPR16:  c0139676c6c0 c1d23948 
> c013e91e38b8
>GPR20: 0053  0001 
> 
>GPR24: 0001 0001  
> 0001
>GPR28: 0001 0053 c013eab19800 
> 0001
> [  538.544067] NIP [c0080753f388] __kvmppc_vcore_entry+0x90/0x104 [kvm_hv]
> [  538.544121] LR [c0080753f368] __kvmppc_vcore_entry+0x70/0x104 [kvm_hv]
> [  538.544173] Call Trace:
> [  538.544196] [c013e91e3640] [c013e91e3680] 0xc013e91e3680 
> (unreliable)
> [  538.544260] [c013e91e3820] [c008075397a0] 
> kvmppc_run_core+0xbc8/0x19d0 [kvm_hv]
> [  538.544325] [c013e91e39e0] [c0080753d99c] 
> kvmppc_vcpu_run_hv+0x404/0xc00 [kvm_hv]
> [  538.544394] [c013e91e3ad0] [c008072da4fc] 
> kvmppc_vcpu_run+0x34/0x48 [kvm]
> [  538.544472] [c013e91e3af0] [c008072d61b8] 
> kvm_arch_vcpu_ioctl_run+0x310/0x420 [kvm]
> [  538.544539] [c013e91e3b80] [c008072c7450] 
> kvm_vcpu_ioctl+0x298/0x778 [kvm]
> [  538.544605] [c013e91e3ce0] [c04b8c2c] sys_ioctl+0x1dc/0xc90
> [  538.544662] [c013e91e3dc0] [c002f9a4] 
> system_call_exception+0xe4/0x1c0
> [  538.544726] [c013e91e3e20] [c000d140] 
> system_call_common+0xf0/0x27c
> [  538.544787] Instruction dump:
> [  538.544821] f86d1098 6000 6000 4899 e8ad0fe8 e8c500a0 e9264140 
> 75290002
> [  538.544886] 7d1602a6 7cec42a6 40820008 7d0807b4 <7d164ba6> 7d083a14 
> f90d10a0 480104fd
> [  538.544953] ---[ end trace 74423e2b948c2e0c ]---
> 
> This patch makes the KVM_PPC_ALLOCATE_HTAB ioctl fail when running in
> the nested hypervisor, causing QEMU to abort.
> 
> Reported-by: Satheesh Rajendran 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/kvm/book3s_hv.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 4ba06a2a306c..764b6239ef72 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5250,6 +5250,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
>   case KVM_PPC_ALLOCATE_HTAB: {
>   u32 htab_order;
>  
> + /* If we're a nested hypervisor, we currently only support 
> radix */
> + if (kvmhv_on_pseries()) {
> + r = -EOPNOTSUPP;
> + break;
> + }
> +
>   r = -EFAULT;
>   if (get_user(htab_order, (u32 __user *)argp))
>   break;

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




[PATCH] powerpc: kvm: Increase HDEC threshold to enter guest

2020-08-10 Thread David Gibson
Before entering a guest, we need to set the HDEC to pull us out again
when the guest's time is up.  This needs some care, though, because the
HDEC is edge triggered, which means that if it expires before entering the
guest, the interrupt will be lost, meaning we stay in the guest
indefinitely (in practice, until the hard lockup detector pulls us out
with an NMI).

For the POWER9, independent threads mode specific path, we attempt to
prevent that by testing whether the time has already expired before setting
the HDEC in kvmhv_load_hv_regs_and_go().  However, that doesn't account for the case
where the timer expires between that test and the actual guest entry.
Preliminary instrumentation suggests that can take as long as 1.5µs under
certain load conditions, and simply checking the HDEC value we're going to
load is positive isn't enough to guarantee that leeway.

The test here is sometimes masked by a test in kvmhv_p9_guest_entry(), its
caller.  That checks that the remaining time is at least 1µs.  However, as noted
above that doesn't appear to be sufficient in all circumstances even
from the point HDEC is set, let alone this earlier point.

Therefore, increase the threshold we check for in both locations to 4µs
(2048 timebase ticks).  This is a pretty crude approach, but it addresses
a real problem where guest load can trigger a host hard lockup.
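
(For reference, assuming the standard 512MHz timebase on these machines:
2048 ticks / 512,000,000 ticks per second = 4µs, and the existing 512-tick
threshold in kvmhv_p9_guest_entry() is where the 1µs figure above comes
from.)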

We're hoping to refine this in future by gathering more data on exactly
how long these paths can take, and possibly by moving the check closer to
the actual guest entry point to reduce the variance.  Getting the details
for that might take some time however.

NOTE: For reasons I haven't tracked down yet, I haven't actually
managed to reproduce this on current upstream.  I have reproduced it on
RHEL kernels without obvious differences in this area.  I'm still trying
to determine what the cause of that difference is, but I think it's worth
applying this change as a precaution in the interim.

Signed-off-by: David Gibson 
---
 arch/powerpc/kvm/book3s_hv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0f83f39a2bd2..65a92dd890cb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3435,7 +3435,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
*vcpu, u64 time_limit,
unsigned long host_pidr = mfspr(SPRN_PID);
 
hdec = time_limit - mftb();
-   if (hdec < 0)
+   if (hdec < 2048)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
mtspr(SPRN_HDEC, hdec);
 
@@ -3564,7 +3564,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 
time_limit,
 
dec = mfspr(SPRN_DEC);
tb = mftb();
-   if (dec < 512)
+   if (dec < 2048)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
local_paca->kvm_hstate.dec_expires = dec + tb;
if (local_paca->kvm_hstate.dec_expires < time_limit)
-- 
2.26.2



Re: [PATCH v3 0/4] powerpc/mm/radix: Memory unplug fixes

2020-07-25 Thread David Gibson
On Fri, Jul 24, 2020 at 09:52:14PM +1000, Michael Ellerman wrote:
> Bharata B Rao  writes:
> > On Tue, Jul 21, 2020 at 10:25:58PM +1000, Michael Ellerman wrote:
> >> Bharata B Rao  writes:
> >> > On Tue, Jul 21, 2020 at 11:45:20AM +1000, Michael Ellerman wrote:
> >> >> Nathan Lynch  writes:
> >> >> > "Aneesh Kumar K.V"  writes:
> >> >> >> This is the next version of the fixes for memory unplug on radix.
> >> >> >> The issues and the fix are described in the actual patches.
> >> >> >
> >> >> > I guess this isn't actually causing problems at runtime right now, 
> >> >> > but I
> >> >> > notice calls to resize_hpt_for_hotplug() from arch_add_memory() and
> >> >> > arch_remove_memory(), which ought to be mmu-agnostic:
> >> >> >
> >> >> > int __ref arch_add_memory(int nid, u64 start, u64 size,
> >> >> > struct mhp_params *params)
> >> >> > {
> >> >> >   unsigned long start_pfn = start >> PAGE_SHIFT;
> >> >> >   unsigned long nr_pages = size >> PAGE_SHIFT;
> >> >> >   int rc;
> >> >> >
> >> >> >   resize_hpt_for_hotplug(memblock_phys_mem_size());
> >> >> >
> >> >> >   start = (unsigned long)__va(start);
> >> >> >   rc = create_section_mapping(start, start + size, nid,
> >> >> >   params->pgprot);
> >> >> > ...
> >> >> 
> >> >> Hmm well spotted.
> >> >> 
> >> >> That does return early if the ops are not setup:
> >> >> 
> >> >> int resize_hpt_for_hotplug(unsigned long new_mem_size)
> >> >> {
> >> >> unsigned target_hpt_shift;
> >> >> 
> >> >> if (!mmu_hash_ops.resize_hpt)
> >> >> return 0;
> >> >> 
> >> >> 
> >> >> And:
> >> >> 
> >> >> void __init hpte_init_pseries(void)
> >> >> {
> >> >> ...
> >> >> if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
> >> >> mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
> >> >> 
> >> >> And that comes in via ibm,hypertas-functions:
> >> >> 
> >> >> {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"},
> >> >> 
> >> >> 
> >> >> But firmware is not necessarily going to add/remove that call based on
> >> >> whether we're using hash/radix.
> >> >
> >> > Correct but hpte_init_pseries() will not be called for radix guests.
> >> 
> >> Yeah, duh. You'd think the function name would have been a sufficient
> >> clue for me :)
> >> 
> >> >> So I think a follow-up patch is needed to make this more robust.
> >> >> 
> >> >> Aneesh/Bharata what platform did you test this series on? I'm curious
> >> >> how this didn't break.
> >> >
> >> > I have tested memory hotplug/unplug for radix guest on zz platform and
> >> > sanity-tested this for hash guest on P8.
> >> >
> >> > As noted above, mmu_hash_ops.resize_hpt will not be set for radix
> >> > guest and hence we won't see any breakage.
> >> 
> >> OK.
> >> 
> >> That's probably fine as it is then. Or maybe just a comment in
> >> resize_hpt_for_hotplug() pointing out that resize_hpt will be NULL if
> >> we're using radix.
> >
> > Or we could move these calls to hpt-only routines like below?
> 
> That looks like it would be equivalent, and would nicely isolate those
> calls in hash specific code. So yeah I think that's worth sending as a
> proper patch, even better if you can test it.
> 
> > David - Do you remember if there was any particular reason to have
> > these two hpt-resize calls within powerpc-generic memory hotplug code?
> 
> I think the HPT resizing was developed before or concurrently with the
> radix support, so I would guess it was just not something we thought
> about at the time.

Sounds about right; I don't remember for certain.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v3 0/4] powerpc/mm/radix: Memory unplug fixes

2020-07-22 Thread David Gibson
On Wed, Jul 22, 2020 at 11:35:06AM +0530, Bharata B Rao wrote:
> On Tue, Jul 21, 2020 at 10:25:58PM +1000, Michael Ellerman wrote:
> > Bharata B Rao  writes:
> > > On Tue, Jul 21, 2020 at 11:45:20AM +1000, Michael Ellerman wrote:
> > >> Nathan Lynch  writes:
> > >> > "Aneesh Kumar K.V"  writes:
> > >> >> This is the next version of the fixes for memory unplug on radix.
> > >> >> The issues and the fix are described in the actual patches.
> > >> >
> > >> > I guess this isn't actually causing problems at runtime right now, but 
> > >> > I
> > >> > notice calls to resize_hpt_for_hotplug() from arch_add_memory() and
> > >> > arch_remove_memory(), which ought to be mmu-agnostic:
> > >> >
> > >> > int __ref arch_add_memory(int nid, u64 start, u64 size,
> > >> >  struct mhp_params *params)
> > >> > {
> > >> >unsigned long start_pfn = start >> PAGE_SHIFT;
> > >> >unsigned long nr_pages = size >> PAGE_SHIFT;
> > >> >int rc;
> > >> >
> > >> >resize_hpt_for_hotplug(memblock_phys_mem_size());
> > >> >
> > >> >start = (unsigned long)__va(start);
> > >> >rc = create_section_mapping(start, start + size, nid,
> > >> >params->pgprot);
> > >> > ...
> > >> 
> > >> Hmm well spotted.
> > >> 
> > >> That does return early if the ops are not setup:
> > >> 
> > >> int resize_hpt_for_hotplug(unsigned long new_mem_size)
> > >> {
> > >>  unsigned target_hpt_shift;
> > >> 
> > >>  if (!mmu_hash_ops.resize_hpt)
> > >>  return 0;
> > >> 
> > >> 
> > >> And:
> > >> 
> > >> void __init hpte_init_pseries(void)
> > >> {
> > >>  ...
> > >>  if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
> > >>  mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
> > >> 
> > >> And that comes in via ibm,hypertas-functions:
> > >> 
> > >>  {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"},
> > >> 
> > >> 
> > >> But firmware is not necessarily going to add/remove that call based on
> > >> whether we're using hash/radix.
> > >
> > > Correct but hpte_init_pseries() will not be called for radix guests.
> > 
> > Yeah, duh. You'd think the function name would have been a sufficient
> > clue for me :)
> > 
> > >> So I think a follow-up patch is needed to make this more robust.
> > >> 
> > >> Aneesh/Bharata what platform did you test this series on? I'm curious
> > >> how this didn't break.
> > >
> > > I have tested memory hotplug/unplug for radix guest on zz platform and
> > > sanity-tested this for hash guest on P8.
> > >
> > > As noted above, mmu_hash_ops.resize_hpt will not be set for radix
> > > guest and hence we won't see any breakage.
> > 
> > OK.
> > 
> > That's probably fine as it is then. Or maybe just a comment in
> > resize_hpt_for_hotplug() pointing out that resize_hpt will be NULL if
> > we're using radix.
> 
> Or we could move these calls to hpt-only routines like below?
> 
> David - Do you remember if there was any particular reason to have
> these two hpt-resize calls within powerpc-generic memory hotplug code?

I don't remember, sorry.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH] KVM: PPC: Book3S HV: Use feature flag CPU_FTR_P9_TIDR when accessing TIDR

2020-07-20 Thread David Gibson
On Tue, Jul 21, 2020 at 03:04:45PM +1000, Paul Mackerras wrote:
> On Tue, Jun 23, 2020 at 06:50:27PM +0200, Cédric Le Goater wrote:
> > The TIDR register is only available on POWER9 systems and code
> > accessing this register is not always protected by the CPU_FTR_P9_TIDR
> > flag. Fix that to make sure POWER10 systems won't use it as TIDR has
> > been removed.
> 
> I'm concerned about what this patch would do if we are trying to
> migrate from a P9 guest to a guest on P10 in P9-compat mode, in that
> the destination QEMU would get an error on doing the SET_ONE_REG for
> the TIDR.  I don't think the lack of TIDR is worth failing the
> migration for given that TIDR only actually does anything if you are
> using an accelerator, and KVM has never supported use of accelerators
> in guests.  I'm cc'ing David Gibson for his comments on the
> compatibility and migration issues.

Having thought about this a bit more, I don't think it matters.  We're
going to have to update qemu to handle POWER10 anyway.  If this causes
a problem, this would just add one small thing to whatever we need to
fix there.

> In any case, given that both move to and move from TIDR will be no-ops
> on P10 (for privileged code), I don't think there is a great urgency
> for this patch.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




[PATCHv2] tpm: ibmvtpm: Wait for ready buffer before probing for TPM2 attributes

2020-06-18 Thread David Gibson
The tpm2_get_cc_attrs_tbl() call will result in TPM commands being issued,
which will need the use of the internal command/response buffer.  But,
we're issuing this *before* we've waited to make sure that buffer is
allocated.

This can result in intermittent failures to probe if the hypervisor / TPM
implementation doesn't respond quickly enough.  I find it fails almost
every time with an 8 vcpu guest under KVM with software emulated TPM.

To fix it, just move the tpm2_get_cc_attrs_tbl() call after the
existing code to wait for initialization, which will ensure the buffer
is allocated.

Fixes: 18b3670d79ae9 ("tpm: ibmvtpm: Add support for TPM2")
Signed-off-by: David Gibson 
---

Changes from v1:
 * Fixed a formatting error in the commit message
 * Added some more detail to the commit message
 
drivers/char/tpm/tpm_ibmvtpm.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c
index 09fe45246b8cc..994385bf37c0c 100644
--- a/drivers/char/tpm/tpm_ibmvtpm.c
+++ b/drivers/char/tpm/tpm_ibmvtpm.c
@@ -683,13 +683,6 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
if (rc)
goto init_irq_cleanup;
 
-   if (!strcmp(id->compat, "IBM,vtpm20")) {
-   chip->flags |= TPM_CHIP_FLAG_TPM2;
-   rc = tpm2_get_cc_attrs_tbl(chip);
-   if (rc)
-   goto init_irq_cleanup;
-   }
-
if (!wait_event_timeout(ibmvtpm->crq_queue.wq,
ibmvtpm->rtce_buf != NULL,
HZ)) {
@@ -697,6 +690,13 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
goto init_irq_cleanup;
}
 
+   if (!strcmp(id->compat, "IBM,vtpm20")) {
+   chip->flags |= TPM_CHIP_FLAG_TPM2;
+   rc = tpm2_get_cc_attrs_tbl(chip);
+   if (rc)
+   goto init_irq_cleanup;
+   }
+
return tpm_chip_register(chip);
 init_irq_cleanup:
do {
-- 
2.26.2



[PATCH] tpm: ibmvtpm: Wait for ready buffer before probing for TPM2 attributes

2020-06-05 Thread David Gibson
The tpm2_get_cc_attrs_tbl() call will result in TPM commands being issued,
which will need the use of the internal command/response buffer.  But,
we're issuing this *before* we've waited to make sure that buffer is
allocated.

This can result in intermittent failures to probe if the hypervisor / TPM
implementation doesn't respond quickly enough.  I find it fails almost
every time with an 8 vcpu guest under KVM with software emulated TPM.

Fixes: 18b3670d79ae9 "tpm: ibmvtpm: Add support for TPM2"
Signed-off-by: David Gibson 
---
 drivers/char/tpm/tpm_ibmvtpm.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c
index 09fe45246b8c..994385bf37c0 100644
--- a/drivers/char/tpm/tpm_ibmvtpm.c
+++ b/drivers/char/tpm/tpm_ibmvtpm.c
@@ -683,13 +683,6 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
if (rc)
goto init_irq_cleanup;
 
-   if (!strcmp(id->compat, "IBM,vtpm20")) {
-   chip->flags |= TPM_CHIP_FLAG_TPM2;
-   rc = tpm2_get_cc_attrs_tbl(chip);
-   if (rc)
-   goto init_irq_cleanup;
-   }
-
if (!wait_event_timeout(ibmvtpm->crq_queue.wq,
ibmvtpm->rtce_buf != NULL,
HZ)) {
@@ -697,6 +690,13 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
goto init_irq_cleanup;
}
 
+   if (!strcmp(id->compat, "IBM,vtpm20")) {
+   chip->flags |= TPM_CHIP_FLAG_TPM2;
+   rc = tpm2_get_cc_attrs_tbl(chip);
+   if (rc)
+   goto init_irq_cleanup;
+   }
+
return tpm_chip_register(chip);
 init_irq_cleanup:
do {
-- 
2.26.2



Re: [PATCH] target/ppc: Fix mtmsr(d) L=1 variant that loses interrupts

2020-04-16 Thread David Gibson
 does not need any synchronisation */
> +if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
> +gen_io_start();
> +}
> +if (ctx->opcode & 0x0001) {
> +/* L=1 form only updates EE and RI */
>  TCGv t0 = tcg_temp_new();
> +TCGv t1 = tcg_temp_new();
>  tcg_gen_andi_tl(t0, cpu_gpr[rS(ctx->opcode)],
>  (1 << MSR_RI) | (1 << MSR_EE));
> -tcg_gen_andi_tl(cpu_msr, cpu_msr,
> +tcg_gen_andi_tl(t1, cpu_msr,
>  ~(target_ulong)((1 << MSR_RI) | (1 << MSR_EE)));
> -tcg_gen_or_tl(cpu_msr, cpu_msr, t0);
> +tcg_gen_or_tl(t1, t1, t0);
> +
> +gen_helper_store_msr(cpu_env, t1);
>  tcg_temp_free(t0);
> +tcg_temp_free(t1);
> +
>  } else {
>  TCGv msr = tcg_temp_new();
>  
> @@ -4411,9 +4423,6 @@ static void gen_mtmsr(DisasContext *ctx)
>   *  power saving mode, we will exit the loop directly from
>   *  ppc_store_msr
>   */
> -if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
> -gen_io_start();
> -}
>  gen_update_nip(ctx, ctx->base.pc_next);
>  #if defined(TARGET_PPC64)
>  tcg_gen_deposit_tl(msr, cpu_msr, cpu_gpr[rS(ctx->opcode)], 0, 32);
> @@ -4422,10 +4431,9 @@ static void gen_mtmsr(DisasContext *ctx)
>  #endif
>  gen_helper_store_msr(cpu_env, msr);
>  tcg_temp_free(msr);
> -/* Must stop the translation as machine state (may have) changed */
> -/* Note that mtmsr is not always defined as context-synchronizing */
> -gen_stop_exception(ctx);
>  }
> +/* Must stop the translation as machine state (may have) changed */
> +gen_stop_exception(ctx);
>  #endif
>  }
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson



