[PATCH v4] powerpc: Replace setup_irq() by request_irq()

2020-03-11 Thread afzal mohammed
request_irq() is preferred over setup_irq(). Invocations of setup_irq()
occur after memory allocators are ready.

Per tglx[1], setup_irq() existed in olden days when allocators were not
ready by the time early interrupts were initialized.

Hence replace setup_irq() by request_irq().

[1] https://lkml.kernel.org/r/alpine.DEB.2.20.1710191609480.1971@nanos

Signed-off-by: afzal mohammed 
---

v4:
 * pass non-NULL dev_id while requesting shared irq in mpc85xx_cds.c, as
request_irq() can fail due to sanity check, which is not done in
setup_irq()
v3:
 * Split out from tree wide series, as Thomas suggested to get it thr'
respective maintainers
 * Modify pr_err displayed in case of error
 * Re-arrange code & choose pr_err args as required to improve readability
 * Remove irrelevant parts from commit message & improve
 
v2:
 * Replace pr_err("request_irq() on %s failed" by
   pr_err("%s: request_irq() failed"
 * Commit message massage

 arch/powerpc/platforms/85xx/mpc85xx_cds.c | 11 -
 arch/powerpc/platforms/8xx/cpm1.c |  9 ++-
 arch/powerpc/platforms/8xx/m8xx_setup.c   |  9 ++-
 arch/powerpc/platforms/chrp/setup.c   | 14 ---
 arch/powerpc/platforms/powermac/pic.c | 29 +--
 arch/powerpc/platforms/powermac/smp.c | 12 --
 6 files changed, 29 insertions(+), 55 deletions(-)

diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c 
b/arch/powerpc/platforms/85xx/mpc85xx_cds.c
index 6b1436abe9b1..915ab6710b93 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c
@@ -218,12 +218,6 @@ static irqreturn_t mpc85xx_8259_cascade_action(int irq, 
void *dev_id)
 {
return IRQ_HANDLED;
 }
-
-static struct irqaction mpc85xxcds_8259_irqaction = {
-   .handler = mpc85xx_8259_cascade_action,
-   .flags = IRQF_SHARED | IRQF_NO_THREAD,
-   .name = "8259 cascade",
-};
 #endif /* PPC_I8259 */
 #endif /* CONFIG_PCI */
 
@@ -271,7 +265,10 @@ static int mpc85xx_cds_8259_attach(void)
 *  disabled when the last user of the shared IRQ line frees their
 *  interrupt.
 */
-   if ((ret = setup_irq(cascade_irq, &mpc85xxcds_8259_irqaction))) {
+   ret = request_irq(cascade_irq, mpc85xx_8259_cascade_action,
+ IRQF_SHARED | IRQF_NO_THREAD, "8259 cascade",
+ cascade_node);
+   if (ret) {
printk(KERN_ERR "Failed to setup cascade interrupt\n");
return ret;
}
diff --git a/arch/powerpc/platforms/8xx/cpm1.c 
b/arch/powerpc/platforms/8xx/cpm1.c
index a43ee7d1ff85..4db4ca2e1222 100644
--- a/arch/powerpc/platforms/8xx/cpm1.c
+++ b/arch/powerpc/platforms/8xx/cpm1.c
@@ -120,12 +120,6 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev)
return IRQ_HANDLED;
 }
 
-static struct irqaction cpm_error_irqaction = {
-   .handler = cpm_error_interrupt,
-   .flags = IRQF_NO_THREAD,
-   .name = "error",
-};
-
 static const struct irq_domain_ops cpm_pic_host_ops = {
.map = cpm_pic_host_map,
 };
@@ -187,7 +181,8 @@ unsigned int __init cpm_pic_init(void)
if (!eirq)
goto end;
 
-   if (setup_irq(eirq, &cpm_error_irqaction))
+   if (request_irq(eirq, cpm_error_interrupt, IRQF_NO_THREAD, "error",
+   NULL))
printk(KERN_ERR "Could not allocate CPM error IRQ!");
 
setbits32(&cpic_reg->cpic_cicr, CICR_IEN);
diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c 
b/arch/powerpc/platforms/8xx/m8xx_setup.c
index f1c805c8adbc..df4d57d07f9a 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -39,12 +39,6 @@ static irqreturn_t timebase_interrupt(int irq, void *dev)
return IRQ_HANDLED;
 }
 
-static struct irqaction tbint_irqaction = {
-   .handler = timebase_interrupt,
-   .flags = IRQF_NO_THREAD,
-   .name = "tbint",
-};
-
 /* per-board overridable init_internal_rtc() function. */
 void __init __attribute__ ((weak))
 init_internal_rtc(void)
@@ -157,7 +151,8 @@ void __init mpc8xx_calibrate_decr(void)
(TBSCR_TBF | TBSCR_TBE));
immr_unmap(sys_tmr2);
 
-   if (setup_irq(virq, &tbint_irqaction))
+   if (request_irq(virq, timebase_interrupt, IRQF_NO_THREAD, "tbint",
+   NULL))
panic("Could not allocate timer IRQ!");
 }
 
diff --git a/arch/powerpc/platforms/chrp/setup.c 
b/arch/powerpc/platforms/chrp/setup.c
index fcf6f2342ef4..8328cd5817b0 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -451,13 +451,6 @@ static void __init chrp_find_openpic(void)
of_node_put(np);
 }
 
-#if defined(CONFIG_VT) && defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_XMON)
-static struct irqaction xmon_irqaction = {
-   .handler = xmon_irq,
-   .name = "XMON break",
-};
-#endif
-
 static void __init chr

Re: [PATCH v4 6/8] perf/tools: Enhance JSON/metric infrastructure to handle "?"

2020-03-11 Thread kajoljain



On 3/11/20 12:04 AM, Arnaldo Carvalho de Melo wrote:
> Em Mon, Mar 09, 2020 at 11:55:50AM +0530, Kajol Jain escreveu:
>> Patch enhances current metric infrastructure to handle "?" in the metric
>> expression. The "?" can be use for parameters whose value not known while
>> creating metric events and which can be replace later at runtime to
>> the proper value. It also add flexibility to create multiple events out
>> of single metric event added in json file.
>>
>> Patch adds function 'arch_get_runtimeparam' which is a arch specific
>> function, returns the count of metric events need to be created.
>> By default it return 1.
>>
>> One loop is added in function 'metricgroup__add_metric', which create
>> multiple events at run time depend on return value of
>> 'arch_get_runtimeparam' and merge that event in 'group_list'.
>>
>> This infrastructure needed for hv_24x7 socket/chip level events.
>> "hv_24x7" chip level events needs specific chip-id to which the
>> data is requested. Function 'arch_get_runtimeparam' implemented
>> in header.c which extract number of sockets from sysfs file
>> "sockets" under "/sys/devices/hv_24x7/interface/".
>>
>> Signed-off-by: Kajol Jain 
>> ---
>>  tools/perf/arch/powerpc/util/header.c |  22 +
>>  tools/perf/util/expr.h|   1 +
>>  tools/perf/util/expr.l|  19 +++-
>>  tools/perf/util/metricgroup.c | 124 --
>>  tools/perf/util/metricgroup.h |   1 +
>>  tools/perf/util/stat-shadow.c |   8 ++
>>  6 files changed, 148 insertions(+), 27 deletions(-)
>>
>> diff --git a/tools/perf/arch/powerpc/util/header.c 
>> b/tools/perf/arch/powerpc/util/header.c
>> index 3b4cdfc5efd6..036f6b2ce202 100644
>> --- a/tools/perf/arch/powerpc/util/header.c
>> +++ b/tools/perf/arch/powerpc/util/header.c
>> @@ -7,6 +7,11 @@
>>  #include 
>>  #include 
>>  #include "header.h"
>> +#include "metricgroup.h"
>> +#include "evlist.h"
>> +#include 
>> +#include "pmu.h"
>> +#include 
>>  
>>  #define mfspr(rn)   ({unsigned long rval; \
>>   asm volatile("mfspr %0," __stringify(rn) \
>> @@ -16,6 +21,8 @@
>>  #define PVR_VER(pvr)(((pvr) >>  16) & 0x) /* Version field */
>>  #define PVR_REV(pvr)(((pvr) >>   0) & 0x) /* Revison field */
>>  
>> +#define SOCKETS_INFO_FILE_PATH "/devices/hv_24x7/interface/"
>> +
>>  int
>>  get_cpuid(char *buffer, size_t sz)
>>  {
>> @@ -44,3 +51,18 @@ get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
>>  
>>  return bufp;
>>  }
>> +
>> +int arch_get_runtimeparam(void)
>> +{
>> +int count;
>> +char path[PATH_MAX];
>> +char filename[] = "sockets";
>> +
>> +snprintf(path, PATH_MAX,
>> + SOCKETS_INFO_FILE_PATH "%s", filename);
>> +
>> +if (sysfs__read_ull(path, (unsigned long long *)&count) < 0)
>> +return 1;
>> +else
>> +return count;
> 
> Why this cast dance? We have sysfs__read_int(path, &count).
> 
> Also this is more compact:
> 
>   return sysfs__read_int(path, &count) < 0 ? 1 : count;

Hi Arnaldo,
 Yes it will be better to use like that.
> 
>> +}
>> diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
>> index 9377538f4097..d17664e628db 100644
>> --- a/tools/perf/util/expr.h
>> +++ b/tools/perf/util/expr.h
>> @@ -15,6 +15,7 @@ struct parse_ctx {
>>  struct parse_id ids[MAX_PARSE_ID];
>>  };
>>  
>> +int expr__runtimeparam;
>>  void expr__ctx_init(struct parse_ctx *ctx);
>>  void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
>>  int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr);
>> diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
>> index 1928f2a3dddc..ec4b00671f67 100644
>> --- a/tools/perf/util/expr.l
>> +++ b/tools/perf/util/expr.l
>> @@ -45,6 +45,21 @@ static char *normalize(char *str)
>>  *dst++ = '/';
>>  else if (*str == '\\')
>>  *dst++ = *++str;
>> +else if (*str == '?') {
>> +
>> +int size = snprintf(NULL, 0, "%d", expr__runtimeparam);
> 
> TIL that C99 allows for a NULL str to format and return the number of
> bytes it would write if the string was large enough... wonders never
> cease :-)
> 
>> +char * paramval = (char *)malloc(size);
> 
> No need for the cast, malloc returns void *, or has that changed? 8-)
> and please no space before the variable name.
> 

Okk, Will take care of that.

> Humm this is all complicated, why not use asprintf and have something
> like:
> 
>> +int i = 0;
>> +
>> +if(!paramval)
>> +*dst++ = '0';
>> +else {
>> +sprintf(paramval, "%d", expr__runtimeparam);
>> +while(i < size)
>> +*dst++ = paramval[i++];
>> +free(paramval);
>> +}
> 
>   cha

Re: [PATCH] powerpc/pseries: fix of_read_drc_info_cell() to point at next record

2020-03-11 Thread Michael Ellerman
Tyrel Datwyler  writes:
> On 3/10/20 10:25 AM, Nathan Lynch wrote:
>> Tyrel Datwyler  writes:
>>> The expectation is that when calling of_read_drc_info_cell()
>>> repeatedly to parse multiple drc-info records that the in/out curval
>>> parameter points at the start of the next record on return. However,
>>> the current behavior has curval still pointing at the final value of
>>> the record just parsed. The result of which is that if the
>>> ibm,drc-info property contains multiple properties the parsed value
>>> of the drc_type for any record after the first has the power_domain
>>> value of the previous record appended to the type string.
>>>
>>> Ex: observed the following 0x prepended to PHB
>>>
>>> [   69.485037] drc-info: type: \xff\xff\xff\xffPHB, prefix: PHB , 
>>> index_start: 0x2001
>>> [   69.485038] drc-info: suffix_start: 1, sequential_elems: 3072, 
>>> sequential_inc: 1
>>> [   69.485038] drc-info: power-domain: 0x, last_index: 0x2c00
>>>
>>> Fix by incrementing curval past the power_domain value to point at
>>> drc_type string of next record.
>>>
>>> Fixes: a29396653b8bf ("pseries/drc-info: Search DRC properties for CPU 
>>> indexes")
>> 
>> I have a different commit hash for that:
>> e83636ac3334 pseries/drc-info: Search DRC properties for CPU indexes
>
> Oof, looks like I grabbed the commit hash from the SLES 15 SP1 branch. You 
> have
> the correct upstream commit.
>
> Fixes: e83636ac3334 ("pseries/drc-info: Search DRC properties for CPU 
> indexes")
>
> Michael, let me know if you want me to resubmit, or if you will fixup the 
> Fixes
> tag on your end?

I can update the Fixes tag.

What's the practical effect of this bug? It seems like it should break
all the hotplug stuff, but presumably it doesn't in practice for some
reason?

It would also be *really* nice if we had some unit tests for this
parsing code, it's demonstrably very bug prone.

cheers


Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-11 Thread Srikar Dronamraju
* Michal Hocko  [2020-03-11 12:57:35]:

> On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote:
> > A Powerpc system with multiple possible nodes and with CONFIG_NUMA
> > enabled always used to have a node 0, even if node 0 does not any cpus
> > or memory attached to it. As per PAPR, node affinity of a cpu is only
> > available once its present / online. For all cpus that are possible but
> > not present, cpu_to_node() would point to node 0.
> > 
> > To ensure a cpuless, memoryless dummy node is not online, powerpc need
> > to make sure all possible but not present cpu_to_node are set to a
> > proper node.
> 
> Just curious, is this somehow related to
> http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz?
> 

The issue I am trying to fix is a known issue in Powerpc since many years.
So this surely not a problem after a75056fc1e7c (mm/memcontrol.c: allocate
shrinker_map on appropriate NUMA node"). 

I tried v5.6-rc4 + a75056fc1e7c but didnt face any issues booting the
kernel. Will work with Sachin/Abdul (reporters of the issue).


> > Cc: linuxppc-dev@lists.ozlabs.org
> > Cc: linux...@kvack.org
> > Cc: linux-ker...@vger.kernel.org
> > Cc: Michal Hocko 
> > Cc: Mel Gorman 
> > Cc: Vlastimil Babka 
> > Cc: "Kirill A. Shutemov" 
> > Cc: Christopher Lameter 
> > Cc: Michael Ellerman 
> > Cc: Andrew Morton 
> > Cc: Linus Torvalds 
> > Signed-off-by: Srikar Dronamraju 
> > ---
> >  arch/powerpc/mm/numa.c | 16 ++--
> >  1 file changed, 14 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> > index 8a399db..54dcd49 100644
> > --- a/arch/powerpc/mm/numa.c
> > +++ b/arch/powerpc/mm/numa.c
> > @@ -931,8 +931,20 @@ void __init mem_topology_setup(void)
> >  
> > reset_numa_cpu_lookup_table();
> >  
> > -   for_each_present_cpu(cpu)
> > -   numa_setup_cpu(cpu);
> > +   for_each_possible_cpu(cpu) {
> > +   /*
> > +* Powerpc with CONFIG_NUMA always used to have a node 0,
> > +* even if it was memoryless or cpuless. For all cpus that
> > +* are possible but not present, cpu_to_node() would point
> > +* to node 0. To remove a cpuless, memoryless dummy node,
> > +* powerpc need to make sure all possible but not present
> > +* cpu_to_node are set to a proper node.
> > +*/
> > +   if (cpu_present(cpu))
> > +   numa_setup_cpu(cpu);
> > +   else
> > +   set_cpu_numa_node(cpu, first_online_node);
> > +   }
> >  }
> >  
> >  void __init initmem_init(void)
> > -- 
> > 1.8.3.1
> 
> -- 
> Michal Hocko
> SUSE Labs

-- 
Thanks and Regards
Srikar Dronamraju



Re: [RFC PATCH v1] pseries/drmem: don't cache node id in drmem_lmb struct

2020-03-11 Thread Michal Suchánek
On Wed, Mar 11, 2020 at 06:08:15PM -0500, Scott Cheloha wrote:
> At memory hot-remove time we can retrieve an LMB's nid from its
> corresponding memory_block.  There is no need to store the nid
> in multiple locations.
> 
> Signed-off-by: Scott Cheloha 
> ---
> The linear search in powerpc's memory_add_physaddr_to_nid() has become a
> bottleneck at boot on systems with many LMBs.
> 
> As described in this patch here:
> 
> https://lore.kernel.org/linuxppc-dev/20200221172901.1596249-2-chel...@linux.ibm.com/
> 
> the linear search seriously cripples drmem_init().
> 
> The obvious solution (shown in that patch) is to just make the search
> in memory_add_physaddr_to_nid() faster.  An XArray seems well-suited
> to the task of mapping an address range to an LMB object.
> 
> The less obvious approach is to just call memory_add_physaddr_to_nid()
> in fewer places.
> 
> I'm not sure which approach is correct, hence the RFC.

You basically revert the below which will likely cause the very error
that was fixed there:

commit b2d3b5ee66f2a04a918cc043cec0c9ed3de58f40
Author: Nathan Fontenot 
Date:   Tue Oct 2 10:35:59 2018 -0500

powerpc/pseries: Track LMB nid instead of using device tree

When removing memory we need to remove the memory from the node
it was added to instead of looking up the node it should be in
in the device tree.

During testing we have seen scenarios where the affinity for a
LMB changes due to a partition migration or PRRN event. In these
cases the node the LMB exists in may not match the node the device
tree indicates it belongs in. This can lead to a system crash
when trying to DLPAR remove the LMB after a migration or PRRN
event. The current code looks up the node in the device tree to
remove the LMB from, the crash occurs when we try to offline this
node and it does not have any data, i.e. node_data[nid] == NULL.

Thanks

Michal


Re: [PATCH v3 23/27] powerpc/powernv/pmem: Add debug IOCTLs

2020-03-11 Thread Alastair D'Silva
On Thu, 2020-03-05 at 14:11 +1100, Andrew Donnellan wrote:
> On 21/2/20 2:27 pm, Alastair D'Silva wrote:
> > From: Alastair D'Silva 
> > 
> > These IOCTLs provide low level access to the card to aid in
> > debugging
> > controller/FPGA firmware.
> > 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >   arch/powerpc/platforms/powernv/pmem/Kconfig |   6 +
> >   arch/powerpc/platforms/powernv/pmem/ocxl.c  | 249
> > 
> >   include/uapi/nvdimm/ocxl-pmem.h |  32 +++
> >   3 files changed, 287 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pmem/Kconfig
> > b/arch/powerpc/platforms/powernv/pmem/Kconfig
> > index c5d927520920..3f44429d70c9 100644
> > --- a/arch/powerpc/platforms/powernv/pmem/Kconfig
> > +++ b/arch/powerpc/platforms/powernv/pmem/Kconfig
> > @@ -12,4 +12,10 @@ config OCXL_PMEM
> >   
> >   Select N if unsure.
> >   
> > +config OCXL_PMEM_DEBUG
> > +   bool "OpenCAPI Persistent Memory debugging"
> > +   depends on OCXL_PMEM
> > +   help
> > + Enables low level IOCTLs for OpenCAPI Persistent Memory
> > firmware development
> > +
> 
> How dangerous are these ioctls and does that need to be pointed out
> in 
> this description?

Good point, I'll elaborate.

> 
> >   endif
> > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > b/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > index e01f6f9fc180..d4ce5e9e0521 100644
> > --- a/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > @@ -1050,6 +1050,235 @@ int req_controller_health_perf(struct
> > ocxlpmem *ocxlpmem)
> >   GLOBAL_MMIO_HCI_REQ_HEALTH_PERF);
> >   }
> >   
> > +#ifdef CONFIG_OCXL_PMEM_DEBUG
> > +/**
> > + * enable_fwdebug() - Enable FW debug on the controller
> > + * @ocxlpmem: the device metadata
> > + * Return: 0 on success, negative on failure
> > + */
> > +static int enable_fwdebug(const struct ocxlpmem *ocxlpmem)
> > +{
> > +   return ocxl_global_mmio_set64(ocxlpmem->ocxl_afu,
> > GLOBAL_MMIO_HCI,
> > + OCXL_LITTLE_ENDIAN,
> > + GLOBAL_MMIO_HCI_FW_DEBUG);
> > +}
> > +
> > +/**
> > + * disable_fwdebug() - Disable FW debug on the controller
> > + * @ocxlpmem: the device metadata
> > + * Return: 0 on success, negative on failure
> > + */
> > +static int disable_fwdebug(const struct ocxlpmem *ocxlpmem)
> > +{
> > +   return ocxl_global_mmio_set64(ocxlpmem->ocxl_afu,
> > GLOBAL_MMIO_HCIC,
> > + OCXL_LITTLE_ENDIAN,
> > + GLOBAL_MMIO_HCI_FW_DEBUG);
> > +}
> > +
> > +static int ioctl_fwdebug(struct ocxlpmem *ocxlpmem,
> > +struct ioctl_ocxl_pmem_fwdebug __user
> > *uarg)
> > +{
> > +   struct ioctl_ocxl_pmem_fwdebug args;
> > +   u64 val;
> > +   int i;
> > +   int rc;
> > +
> > +   if (copy_from_user(&args, uarg, sizeof(args)))
> > +   return -EFAULT;
> > +
> > +   // Buffer size must be a multiple of 8
> > +   if ((args.buf_size & 0x07))
> > +   return -EINVAL;
> > +
> > +   if (args.buf_size > ocxlpmem->admin_command.data_size)
> > +   return -EINVAL;
> > +
> > +   mutex_lock(&ocxlpmem->admin_command.lock);
> > +
> > +   rc = enable_fwdebug(ocxlpmem);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = admin_command_request(ocxlpmem, ADMIN_COMMAND_FW_DEBUG);
> > +   if (rc)
> > +   goto out;
> > +
> > +   // Write DebugAction & FunctionCode
> > +   val = ((u64)args.debug_action << 56) | ((u64)args.function_code
> > << 40);
> > +
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + 0x08,
> > + OCXL_LITTLE_ENDIAN, val);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + 0x10,
> > + OCXL_LITTLE_ENDIAN,
> > args.debug_parameter_1);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + 0x18,
> > + OCXL_LITTLE_ENDIAN,
> > args.debug_parameter_2);
> > +   if (rc)
> > +   goto out;
> > +
> > +   for (i = 0x20; i < 0x38; i += 0x08)
> 
> Comparison should be <=, the request block ends at 0x40.
> 
> But in any case, scm_command_request() should I think already handle
> the 
> clearing of the request block?
> 
Correct.

> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + i,
> > + OCXL_LITTLE_ENDIAN, 0);
> > +   if (rc)
> > +   goto out;
> > +
> > +
> > +   // Populate admin command buffer
> > +   if (args.buf_size) {

Re: [PATCH v3 19/27] powerpc/powernv/pmem: Add an IOCTL to report controller statistics

2020-03-11 Thread Alastair D'Silva
On Thu, 2020-03-05 at 11:46 +1100, Andrew Donnellan wrote:
> On 21/2/20 2:27 pm, Alastair D'Silva wrote:
> > From: Alastair D'Silva 
> > 
> > The controller can report a number of statistics that are useful
> > in evaluating the performance and reliability of the card.
> > 
> > This patch exposes this information via an IOCTL.
> > 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >   arch/powerpc/platforms/powernv/pmem/ocxl.c | 185
> > +
> >   include/uapi/nvdimm/ocxl-pmem.h|  17 ++
> >   2 files changed, 202 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > b/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > index 2cabafe1fc58..009d4fd29e7d 100644
> > --- a/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > @@ -758,6 +758,186 @@ static int
> > ioctl_controller_dump_complete(struct ocxlpmem *ocxlpmem)
> > GLOBAL_MMIO_HCI_CONTROLLER_DUMP_COL
> > LECTED);
> >   }
> >   
> > +/**
> > + * controller_stats_header_parse() - Parse the first 64 bits of
> > the controller stats admin command response
> > + * @ocxlpmem: the device metadata
> > + * @length: out, returns the number of bytes in the response
> > (excluding the 64 bit header)
> > + */
> > +static int controller_stats_header_parse(struct ocxlpmem
> > *ocxlpmem,
> > +   u32 *length)
> > +{
> > +   int rc;
> > +   u64 val;
> > +
> > +   u16 data_identifier;
> > +   u32 data_length;
> > +
> > +   rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu,
> > +ocxlpmem-
> > >admin_command.data_offset,
> > +OCXL_LITTLE_ENDIAN, &val);
> > +   if (rc)
> > +   return rc;
> > +
> > +   data_identifier = val >> 48;
> > +   data_length = val & 0x;
> > +
> > +   if (data_identifier != 0x4353) { // 'CS'
> > +   dev_err(&ocxlpmem->dev,
> > +   "Bad data identifier for controller stats,
> > expected 'CS', got '%-.*s'\n",
> > +   2, (char *)&data_identifier);
> > +   return -EINVAL;
> 
> Same comment as earlier patches re EINVAL
> 

I don't think I've seen a comment yet on these particular blocks. Can
you suggest a better return value?

> > +   }
> > +
> > +   *length = data_length;
> > +   return 0;
> > +}
> > +
> > +static int ioctl_controller_stats(struct ocxlpmem *ocxlpmem,
> > + struct
> > ioctl_ocxl_pmem_controller_stats __user *uarg)
> > +{
> > +   struct ioctl_ocxl_pmem_controller_stats args;
> > +   u32 length;
> > +   int rc;
> > +   u64 val;
> > +
> > +   memset(&args, '\0', sizeof(args));
> > +
> > +   mutex_lock(&ocxlpmem->admin_command.lock);
> > +
> > +   rc = admin_command_request(ocxlpmem,
> > ADMIN_COMMAND_CONTROLLER_STATS);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + 0x08,
> > + OCXL_LITTLE_ENDIAN, 0);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = admin_command_execute(ocxlpmem);
> > +   if (rc)
> > +   goto out;
> > +
> > +
> > +   rc = admin_command_complete_timeout(ocxlpmem,
> > +   ADMIN_COMMAND_CONTROLLER_ST
> > ATS);
> > +   if (rc < 0) {
> > +   dev_warn(&ocxlpmem->dev, "Controller stats timed
> > out\n");
> > +   goto out;
> > +   }
> > +
> > +   rc = admin_response(ocxlpmem);
> > +   if (rc < 0)
> > +   goto out;
> > +   if (rc != STATUS_SUCCESS) {
> > +   warn_status(ocxlpmem,
> > +   "Unexpected status from controller stats",
> > rc);
> > +   goto out;
> > +   }
> > +
> > +   rc = controller_stats_header_parse(ocxlpmem, &length);
> > +   if (rc)
> > +   goto out;
> > +
> > +   if (length != 0x140)
> > +   warn_status(ocxlpmem,
> > +   "Unexpected length for controller stats
> > data, expected 0x140, got 0x%x",
> > +   length);
> 
> Might be worth a comment to explain where 0x140 comes from (it looks 
> correct from my reading of the spec)

Ok

> 
> > +
> > +   rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu,
> > +ocxlpmem-
> > >admin_command.data_offset + 0x08 + 0x08,
> > +OCXL_LITTLE_ENDIAN, &val);
> > +   if (rc)
> > +   goto out;
> > +
> > +   args.reset_count = val >> 32;
> > +   args.reset_uptime = val & 0x;
> > +
> > +   rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu,
> > +ocxlpmem-
> > >admin_command.data_offset + 0x08 + 0x10,
> > +OCXL_LITTLE_ENDIAN, &val);
> > +   if (rc)
> > +   goto out;
> > +
> > +   args.power_on_uptime = val >> 32;
> 
> We're not collecting life remaining?
> 

It looks like my implementation is out of date. I'll bring it in line
with

Re: [PATCH v3 23/27] powerpc/powernv/pmem: Add debug IOCTLs

2020-03-11 Thread Alastair D'Silva
On Wed, 2020-03-04 at 16:21 +0100, Frederic Barrat wrote:
> 
> Le 21/02/2020 à 04:27, Alastair D'Silva a écrit :
> > From: Alastair D'Silva 
> > 
> > These IOCTLs provide low level access to the card to aid in
> > debugging
> > controller/FPGA firmware.
> > 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >   arch/powerpc/platforms/powernv/pmem/Kconfig |   6 +
> >   arch/powerpc/platforms/powernv/pmem/ocxl.c  | 249
> > 
> >   include/uapi/nvdimm/ocxl-pmem.h |  32 +++
> >   3 files changed, 287 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pmem/Kconfig
> > b/arch/powerpc/platforms/powernv/pmem/Kconfig
> > index c5d927520920..3f44429d70c9 100644
> > --- a/arch/powerpc/platforms/powernv/pmem/Kconfig
> > +++ b/arch/powerpc/platforms/powernv/pmem/Kconfig
> > @@ -12,4 +12,10 @@ config OCXL_PMEM
> >   
> >   Select N if unsure.
> >   
> > +config OCXL_PMEM_DEBUG
> > +   bool "OpenCAPI Persistent Memory debugging"
> > +   depends on OCXL_PMEM
> > +   help
> > + Enables low level IOCTLs for OpenCAPI Persistent Memory
> > firmware development
> > +
> >   endif
> > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > b/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > index e01f6f9fc180..d4ce5e9e0521 100644
> > --- a/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > @@ -1050,6 +1050,235 @@ int req_controller_health_perf(struct
> > ocxlpmem *ocxlpmem)
> >   GLOBAL_MMIO_HCI_REQ_HEALTH_PERF);
> >   }
> >   
> > +#ifdef CONFIG_OCXL_PMEM_DEBUG
> > +/**
> > + * enable_fwdebug() - Enable FW debug on the controller
> > + * @ocxlpmem: the device metadata
> > + * Return: 0 on success, negative on failure
> > + */
> > +static int enable_fwdebug(const struct ocxlpmem *ocxlpmem)
> > +{
> > +   return ocxl_global_mmio_set64(ocxlpmem->ocxl_afu,
> > GLOBAL_MMIO_HCI,
> > + OCXL_LITTLE_ENDIAN,
> > + GLOBAL_MMIO_HCI_FW_DEBUG);
> > +}
> > +
> > +/**
> > + * disable_fwdebug() - Disable FW debug on the controller
> > + * @ocxlpmem: the device metadata
> > + * Return: 0 on success, negative on failure
> > + */
> > +static int disable_fwdebug(const struct ocxlpmem *ocxlpmem)
> > +{
> > +   return ocxl_global_mmio_set64(ocxlpmem->ocxl_afu,
> > GLOBAL_MMIO_HCIC,
> > + OCXL_LITTLE_ENDIAN,
> > + GLOBAL_MMIO_HCI_FW_DEBUG);
> > +}
> > +
> > +static int ioctl_fwdebug(struct ocxlpmem *ocxlpmem,
> > +struct ioctl_ocxl_pmem_fwdebug __user
> > *uarg)
> > +{
> > +   struct ioctl_ocxl_pmem_fwdebug args;
> > +   u64 val;
> > +   int i;
> > +   int rc;
> > +
> > +   if (copy_from_user(&args, uarg, sizeof(args)))
> > +   return -EFAULT;
> > +
> > +   // Buffer size must be a multiple of 8
> > +   if ((args.buf_size & 0x07))
> > +   return -EINVAL;
> > +
> > +   if (args.buf_size > ocxlpmem->admin_command.data_size)
> > +   return -EINVAL;
> > +
> > +   mutex_lock(&ocxlpmem->admin_command.lock);
> > +
> > +   rc = enable_fwdebug(ocxlpmem);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = admin_command_request(ocxlpmem, ADMIN_COMMAND_FW_DEBUG);
> > +   if (rc)
> > +   goto out;
> > +
> > +   // Write DebugAction & FunctionCode
> > +   val = ((u64)args.debug_action << 56) | ((u64)args.function_code
> > << 40);
> > +
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + 0x08,
> > + OCXL_LITTLE_ENDIAN, val);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + 0x10,
> > + OCXL_LITTLE_ENDIAN,
> > args.debug_parameter_1);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + 0x18,
> > + OCXL_LITTLE_ENDIAN,
> > args.debug_parameter_2);
> > +   if (rc)
> > +   goto out;
> > +
> > +   for (i = 0x20; i < 0x38; i += 0x08)
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + i,
> > + OCXL_LITTLE_ENDIAN, 0);
> > +   if (rc)
> > +   goto out;
> 
> rc is the for loop body. The rc test is not.
> 
Whoops :)

> 
> > +
> > +
> > +   // Populate admin command buffer
> > +   if (args.buf_size) {
> > +   for (i = 0; i < args.buf_size; i += sizeof(u64)) {
> > +   u64 val;
> > +
> > +   if (copy_from_user(&val, &args.buf[i],
> > sizeof(u64)))
> > +   return -EFAULT;
> 
> need 

Re: [PATCH v4 1/5] mm/memremap_pages: Introduce memremap_compat_align()

2020-03-11 Thread Michael Ellerman
Dan Williams  writes:
> The "sub-section memory hotplug" facility allows memremap_pages() users
> like libnvdimm to compensate for hardware platforms like x86 that have a
> section size larger than their hardware memory mapping granularity.  The
> compensation that sub-section support affords is being tolerant of
> physical memory resources shifting by units smaller (64MiB on x86) than
> the memory-hotplug section size (128 MiB). Where the platform
> physical-memory mapping granularity is limited by the number and
> capability of address-decode-registers in the memory controller.
>
> While the sub-section support allows memremap_pages() to operate on
> sub-section (2MiB) granularity, the Power architecture may still
> require 16MiB alignment on "!radix_enabled()" platforms.
>
> In order for libnvdimm to be able to detect and manage this per-arch
> limitation, introduce memremap_compat_align() as a common minimum
> alignment across all driver-facing memory-mapping interfaces, and let
> Power override it to 16MiB in the "!radix_enabled()" case.
>
> The assumption / requirement for 16MiB to be a viable
> memremap_compat_align() value is that Power does not have platforms
> where its equivalent of address-decode-registers never hardware remaps a
> persistent memory resource on smaller than 16MiB boundaries. Note that I
> tried my best to not add a new Kconfig symbol, but header include
> entanglements defeated the #ifndef memremap_compat_align design pattern
> and the need to export it defeats the __weak design pattern for arch
> overrides.
>
> Based on an initial patch by Aneesh.
>
> Link: 
> http://lore.kernel.org/r/capcyv4gbgnp95apyabcsocea50tqj9b5h__83vgngjq3oug...@mail.gmail.com
> Reported-by: Aneesh Kumar K.V 
> Reported-by: Jeff Moyer 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Reviewed-by: Aneesh Kumar K.V 
> Signed-off-by: Dan Williams 
> ---
>  arch/powerpc/Kconfig  |1 +
>  arch/powerpc/mm/ioremap.c |   21 +
>  drivers/nvdimm/pfn_devs.c |2 +-
>  include/linux/memremap.h  |8 
>  include/linux/mmzone.h|1 +
>  lib/Kconfig   |3 +++
>  mm/memremap.c |   23 +++
>  7 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 497b7d0b2d7e..e6ffe905e2b9 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -122,6 +122,7 @@ config PPC
>   select ARCH_HAS_GCOV_PROFILE_ALL
>   select ARCH_HAS_KCOV
>   select ARCH_HAS_HUGEPD  if HUGETLB_PAGE
> + select ARCH_HAS_MEMREMAP_COMPAT_ALIGN
>   select ARCH_HAS_MMIOWB  if PPC64
>   select ARCH_HAS_PHYS_TO_DMA
>   select ARCH_HAS_PMEM_API
> diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
> index fc669643ce6a..b1a0aebe8c48 100644
> --- a/arch/powerpc/mm/ioremap.c
> +++ b/arch/powerpc/mm/ioremap.c
> @@ -2,6 +2,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  
> @@ -97,3 +98,23 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t 
> offset, unsigned long size,
>  
>   return NULL;
>  }
> +
> +#ifdef CONFIG_ZONE_DEVICE
> +/*
> + * Override the generic version in mm/memremap.c.
> + *
> + * With hash translation, the direct-map range is mapped with just one
> + * page size selected by htab_init_page_sizes(). Consult
> + * mmu_psize_defs[] to determine the minimum page size alignment.
> +*/
> +unsigned long memremap_compat_align(void)
> +{
> + unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
> +
> + if (radix_enabled())
> + return SUBSECTION_SIZE;
> + return max(SUBSECTION_SIZE, 1UL << shift);
> +
> +}
> +EXPORT_SYMBOL_GPL(memremap_compat_align);
> +#endif

LGTM.

Acked-by: Michael Ellerman  (powerpc)

cheers


Re: [PATCH v3 19/27] powerpc/powernv/pmem: Add an IOCTL to report controller statistics

2020-03-11 Thread Alastair D'Silva
On Wed, 2020-03-04 at 10:25 +0100, Frederic Barrat wrote:
> 
> Le 21/02/2020 à 04:27, Alastair D'Silva a écrit :
> > From: Alastair D'Silva 
> > 
> > The controller can report a number of statistics that are useful
> > in evaluating the performance and reliability of the card.
> > 
> > This patch exposes this information via an IOCTL.
> > 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >   arch/powerpc/platforms/powernv/pmem/ocxl.c | 185
> > +
> >   include/uapi/nvdimm/ocxl-pmem.h|  17 ++
> >   2 files changed, 202 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > b/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > index 2cabafe1fc58..009d4fd29e7d 100644
> > --- a/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl.c
> > @@ -758,6 +758,186 @@ static int
> > ioctl_controller_dump_complete(struct ocxlpmem *ocxlpmem)
> > GLOBAL_MMIO_HCI_CONTROLLER_DUMP_COL
> > LECTED);
> >   }
> >   
> > +/**
> > + * controller_stats_header_parse() - Parse the first 64 bits of
> > the controller stats admin command response
> > + * @ocxlpmem: the device metadata
> > + * @length: out, returns the number of bytes in the response
> > (excluding the 64 bit header)
> > + */
> > +static int controller_stats_header_parse(struct ocxlpmem
> > *ocxlpmem,
> > +   u32 *length)
> > +{
> > +   int rc;
> > +   u64 val;
> > +
> 
> unexpected empty line
> 

Ok

> 
> > +   u16 data_identifier;
> > +   u32 data_length;
> > +
> > +   rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu,
> > +ocxlpmem-
> > >admin_command.data_offset,
> > +OCXL_LITTLE_ENDIAN, &val);
> > +   if (rc)
> > +   return rc;
> > +
> > +   data_identifier = val >> 48;
> > +   data_length = val & 0x;
> > +
> > +   if (data_identifier != 0x4353) { // 'CS'
> > +   dev_err(&ocxlpmem->dev,
> > +   "Bad data identifier for controller stats,
> > expected 'CS', got '%-.*s'\n",
> > +   2, (char *)&data_identifier);
> 
> 
> Wow, I'm clueless what that string format looks like :-)
> 2 arguments? Did you check the kernel string formatter does what you
> want?
> You may consider unifying the format though, the error log patch uses
> a 
> simpler (better?) format for a similar message.
> 

Sorry, force of habit from my old job where we dealt with a lot of
variable length, non-NULL terminated buffers. FYI - it takes the string
length from the first argument.

I'll change it to a fixed length string like the others :)

> 
> 
> > +   return -EINVAL;
> > +   }
> > +
> > +   *length = data_length;
> > +   return 0;
> > +}
> > +
> > +static int ioctl_controller_stats(struct ocxlpmem *ocxlpmem,
> > + struct
> > ioctl_ocxl_pmem_controller_stats __user *uarg)
> > +{
> > +   struct ioctl_ocxl_pmem_controller_stats args;
> > +   u32 length;
> > +   int rc;
> > +   u64 val;
> > +
> > +   memset(&args, '\0', sizeof(args));
> > +
> > +   mutex_lock(&ocxlpmem->admin_command.lock);
> > +
> > +   rc = admin_command_request(ocxlpmem,
> > ADMIN_COMMAND_CONTROLLER_STATS);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu,
> > + ocxlpmem-
> > >admin_command.request_offset + 0x08,
> > + OCXL_LITTLE_ENDIAN, 0);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = admin_command_execute(ocxlpmem);
> > +   if (rc)
> > +   goto out;
> > +
> > +
> > +   rc = admin_command_complete_timeout(ocxlpmem,
> > +   ADMIN_COMMAND_CONTROLLER_ST
> > ATS);
> > +   if (rc < 0) {
> > +   dev_warn(&ocxlpmem->dev, "Controller stats timed
> > out\n");
> > +   goto out;
> > +   }
> > +
> > +   rc = admin_response(ocxlpmem);
> > +   if (rc < 0)
> > +   goto out;
> > +   if (rc != STATUS_SUCCESS) {
> > +   warn_status(ocxlpmem,
> > +   "Unexpected status from controller stats",
> > rc);
> > +   goto out;
> > +   }
> 
> All those ioctls commands follow the same pattern:
> 1. admin_command_request()
> 2. optionnaly, set some mmio registers specific to the command
> 3. admin_command_execute()
> 4. admin_command_complete_timeout()
> 5. admin_response()
> 
> By swapping 1 and 2, we could then factorize steps 1, 3, 4 and 5 in
> a 
> function and simplify/shorten the code each time a command is called.
> 
> Regarding step 2 (and that's true for all similar patches), a
> comment 
> about what the mmio tuning does would help and avoid looking up the 
> spec. Looking up the spec during the review is expected, but it will 
> ease reading the code 6 months from now.
> 
> 

I'll rework this and add a wrapper in the Admin Commands patch.

> 
> > +
> > +   rc = controller_stats_header_parse(ocxlpmem, &length);
> > +   if (rc)
> > +  

[RFC PATCH v1] pseries/drmem: don't cache node id in drmem_lmb struct

2020-03-11 Thread Scott Cheloha
At memory hot-remove time we can retrieve an LMB's nid from its
corresponding memory_block.  There is no need to store the nid
in multiple locations.

Signed-off-by: Scott Cheloha 
---
The linear search in powerpc's memory_add_physaddr_to_nid() has become a
bottleneck at boot on systems with many LMBs.

As described in this patch here:

https://lore.kernel.org/linuxppc-dev/20200221172901.1596249-2-chel...@linux.ibm.com/

the linear search seriously cripples drmem_init().

The obvious solution (shown in that patch) is to just make the search
in memory_add_physaddr_to_nid() faster.  An XArray seems well-suited
to the task of mapping an address range to an LMB object.

The less obvious approach is to just call memory_add_physaddr_to_nid()
in fewer places.

I'm not sure which approach is correct, hence the RFC.

The attached patch is an example of how we could just eliminate the
memory_add_physaddr_to_nid() calls during drmem_init().  Unless this
is somehow an abuse of the memory_block I think retrieving the nid in
this way deduplicates memoization.

There is another spot where we don't *need* to search by address to
find the node id.  In dlpar_memory_add() we're calling lmb_set_nid(),
which is just memory_add_physaddr_to_nid().  But we could easily bypass
the search and call of_drconf_to_nid_single() directly with the LMB we
already have a pointer to.

Then again, I see callers like the xen balloon driver and probe_store()
that seemingly need to do an address-to-nid lookup.

So, does memory_add_physaddr_to_nid() need to be fast?  Should I bite
the bullet and make it fast so we can leverage it without slowdown?
Or is calling it in fewer places an acceptable approach?

 arch/powerpc/include/asm/drmem.h  | 21 ---
 arch/powerpc/mm/drmem.c   |  6 +-
 .../platforms/pseries/hotplug-memory.c| 19 ++---
 3 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 3d76e1c388c2..4e7b6b70e366 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -13,9 +13,6 @@ struct drmem_lmb {
u32 drc_index;
u32 aa_index;
u32 flags;
-#ifdef CONFIG_MEMORY_HOTPLUG
-   int nid;
-#endif
 };
 
 struct drmem_lmb_info {
@@ -103,22 +100,4 @@ static inline void 
invalidate_lmb_associativity_index(struct drmem_lmb *lmb)
lmb->aa_index = 0x;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static inline void lmb_set_nid(struct drmem_lmb *lmb)
-{
-   lmb->nid = memory_add_physaddr_to_nid(lmb->base_addr);
-}
-static inline void lmb_clear_nid(struct drmem_lmb *lmb)
-{
-   lmb->nid = -1;
-}
-#else
-static inline void lmb_set_nid(struct drmem_lmb *lmb)
-{
-}
-static inline void lmb_clear_nid(struct drmem_lmb *lmb)
-{
-}
-#endif
-
 #endif /* _ASM_POWERPC_LMB_H */
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 59327cefbc6a..873fcfc7b875 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -362,10 +362,8 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop)
if (!drmem_info->lmbs)
return;
 
-   for_each_drmem_lmb(lmb) {
+   for_each_drmem_lmb(lmb)
read_drconf_v1_cell(lmb, &prop);
-   lmb_set_nid(lmb);
-   }
 }
 
 static void __init init_drmem_v2_lmbs(const __be32 *prop)
@@ -410,8 +408,6 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
 
lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
-
-   lmb_set_nid(lmb);
}
}
 }
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index a4d40a3ceea3..f6d4236286cf 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -376,25 +376,29 @@ static int dlpar_add_lmb(struct drmem_lmb *);
 
 static int dlpar_remove_lmb(struct drmem_lmb *lmb)
 {
+   struct memory_block *mem_block;
unsigned long block_sz;
int rc;
 
if (!lmb_is_removable(lmb))
return -EINVAL;
 
+   mem_block = lmb_to_memblock(lmb);
+   if (mem_block == NULL)
+   return -EINVAL;
+
rc = dlpar_offline_lmb(lmb);
if (rc)
return rc;
 
block_sz = pseries_memory_block_size();
 
-   __remove_memory(lmb->nid, lmb->base_addr, block_sz);
+   __remove_memory(mem_block->nid, lmb->base_addr, block_sz);
 
/* Update memory regions for memory remove */
memblock_remove(lmb->base_addr, block_sz);
 
invalidate_lmb_associativity_index(lmb);
-   lmb_clear_nid(lmb);
lmb->flags &= ~DRCONF_MEM_ASSIGNED;
 
return 0;
@@ -651,7 +655,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
u32 drc_index)
 static int dlpar_add_lmb(struct drmem_lmb *lmb)
 {
  

Re: [PATCH v3] ima: add a new CONFIG for loading arch-specific policies

2020-03-11 Thread Philipp Rudo
On Sun,  8 Mar 2020 20:57:51 -0400
Nayna Jain  wrote:

> From: Nayna Jain 
> 
> Every time a new architecture defines the IMA architecture specific
> functions - arch_ima_get_secureboot() and arch_ima_get_policy(), the IMA
> include file needs to be updated. To avoid this "noise", this patch
> defines a new IMA Kconfig IMA_SECURE_AND_OR_TRUSTED_BOOT option, allowing
> the different architectures to select it.
> 
> Suggested-by: Linus Torvalds 
> Signed-off-by: Nayna Jain 
> Acked-by: Ard Biesheuvel 
> Cc: Ard Biesheuvel 
> Cc: Philipp Rudo 
> Cc: Michael Ellerman 
> ---
> v3:
> * Removes CONFIG_IMA dependency. Thanks Ard.
> * Updated the patch with improvements suggested by Michael. It now uses
> "imply" instead of "select". Thanks Michael.
> * Replaced the CONFIG_IMA in x86 and s390 with new config, else it was
> resulting in redefinition when the IMA_SECURE_AND_OR_TRUSTED_BOOT
> is not enabled. Thanks to Mimi for identifying the problem.
> * Removed "#ifdef EFI" check in the arch/x86/Makefile for compiling
> ima_arch.c file.
> * Ard, Thanks for your Acked-by. I have changed the arch/x86/Makefile in
> this version. Can you please review again and confirm ?
> * Rudo, Thanks for your review. I have changed arch/s390/Makefile as well.
> Can you also please review again ?

Looks good to me

for the s390 part
Acked-by: Philipp Rudo 

Thanks
Philipp

> 
> v2:
> * Fixed the issue identified by Mimi. Thanks Mimi, Ard, Heiko and Michael for
> discussing the fix.
> 
>  arch/powerpc/Kconfig   | 1 +
>  arch/s390/Kconfig  | 1 +
>  arch/s390/kernel/Makefile  | 2 +-
>  arch/x86/Kconfig   | 1 +
>  arch/x86/kernel/Makefile   | 4 +---
>  include/linux/ima.h| 3 +--
>  security/integrity/ima/Kconfig | 7 +++
>  7 files changed, 13 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 497b7d0b2d7e..5b9f1cba2a44 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -979,6 +979,7 @@ config PPC_SECURE_BOOT
>   bool
>   depends on PPC_POWERNV
>   depends on IMA_ARCH_POLICY
> + imply IMA_SECURE_AND_OR_TRUSTED_BOOT
>   help
> Systems with firmware secure boot enabled need to define security
> policies to extend secure boot to the OS. This config allows a user
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 8abe77536d9d..59c216af6264 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -195,6 +195,7 @@ config S390
>   select ARCH_HAS_FORCE_DMA_UNENCRYPTED
>   select SWIOTLB
>   select GENERIC_ALLOCATOR
> + imply IMA_SECURE_AND_OR_TRUSTED_BOOT
>  
>  
>  config SCHED_OMIT_FRAME_POINTER
> diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
> index 2b1203cf7be6..578a6fa82ea4 100644
> --- a/arch/s390/kernel/Makefile
> +++ b/arch/s390/kernel/Makefile
> @@ -70,7 +70,7 @@ obj-$(CONFIG_JUMP_LABEL)+= jump_label.o
>  obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
>  obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o
>  
> -obj-$(CONFIG_IMA)+= ima_arch.o
> +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
>  
>  obj-$(CONFIG_PERF_EVENTS)+= perf_event.o perf_cpum_cf_common.o
>  obj-$(CONFIG_PERF_EVENTS)+= perf_cpum_cf.o perf_cpum_sf.o
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index beea77046f9b..dcf5b1729f7c 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -230,6 +230,7 @@ config X86
>   select VIRT_TO_BUS
>   select X86_FEATURE_NAMESif PROC_FS
>   select PROC_PID_ARCH_STATUS if PROC_FS
> + imply IMA_SECURE_AND_OR_TRUSTED_BOOTif EFI
>  
>  config INSTRUCTION_DECODER
>   def_bool y
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 9b294c13809a..cfef37a27def 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -154,6 +154,4 @@ ifeq ($(CONFIG_X86_64),y)
>   obj-y   += vsmp_64.o
>  endif
>  
> -ifdef CONFIG_EFI
> -obj-$(CONFIG_IMA)+= ima_arch.o
> -endif
> +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
> diff --git a/include/linux/ima.h b/include/linux/ima.h
> index 1659217e9b60..aefe758f4466 100644
> --- a/include/linux/ima.h
> +++ b/include/linux/ima.h
> @@ -30,8 +30,7 @@ extern void ima_kexec_cmdline(const void *buf, int size);
>  extern void ima_add_kexec_buffer(struct kimage *image);
>  #endif
>  
> -#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390) \
> - || defined(CONFIG_PPC_SECURE_BOOT)
> +#ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT
>  extern bool arch_ima_get_secureboot(void);
>  extern const char * const *arch_get_ima_policy(void);
>  #else
> diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig
> index 3f3ee4e2eb0d..edde88dbe576 100644
> --- a/security/integrity/ima/Kconfig
> +++ b/security/integrity/ima/Kconfig
> @@ -327,3 +327,10 @@ config IMA_QUEUE_EAR

Re: [PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type

2020-03-11 Thread David Hildenbrand
On 11.03.20 17:55, Vitaly Kuznetsov wrote:
> David Hildenbrand  writes:
> 
>> For now, distributions implement advanced udev rules to essentially
>> - Don't online any hotplugged memory (s390x)
>> - Online all memory to ZONE_NORMAL (e.g., most virt environments like
>>   hyperv)
>> - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken
>>   care of (e.g., bare metal, special virt environments)
>>
>> In summary: All memory is usually onlined the same way, however, the
>> kernel always has to ask userspace to come up with the same answer.
>> E.g., HyperV always waits for a memory block to get onlined before
>> continuing, otherwise it might end up adding memory faster than
>> hotplugging it, which can result in strange OOM situations.
>>
>> Let's allow to specify a default online_type, not just "online" and
>> "offline". This allows distributions to configure the default online_type
>> when booting up and be done with it.
>>
>> We can now specify "offline", "online", "online_movable" and
>> "online_kernel" via
>> - "memhp_default_state=" on the kernel cmdline
>> - /sys/devices/systemn/memory/auto_online_blocks
>> just like we are able to specify for a single memory block via
>> /sys/devices/systemn/memory/memoryX/state
>>
> 
> Thank you for picking this up! 
> 
> It's been awhile since I've added CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
> but I vaguely recall one problem: memory hotplug may happen *very* early
> (just because some memory is presented to a VM as hotplug memory, it is
> not in e820). It happens way before we launch userspace (including
> udev). The question is -- which ZONE will this memory be assigned too?

If it's added via add_memory() ("hot/cold plugged memory") like ACPI
DIMMs not part of e820, Hyper-V balloon added memory, XEN balloon added
memory, s390x standby memory etc. the memory will be onlined as
configured via memhp_default_online_type. Assume that one is set to
"offline".

*If* userspace changes memhp_default_online_type (as in my script in the
cover letter), userspace has to online all memory that has been added
before userspace was active itself (again, as done in my script).

Memory not added via add_memory() is considered "initial memory" and not
as hot/cold plugged memory.

Same handling as for now using udev rules. (once userspace is up, udev
rules for all early added memory is triggered as well)

> 
> 'memhp_default_state=' resolves the issue but nobody likes additional
> kernel parameters for anything but
> debug. CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE was supposed to help, but it
> is binary and distro-wide (so *all* deployments will get the same
> default and as you validly stated we want it differently).
> 
> We could've added something like your example onlining script to the
> kernel itself but this is likely going to be hard to sell: "policies
> belong to userspace!" will likely be the answer. 

Exactly my thought.

-- 
Thanks,

David / dhildenb



Re: [PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type

2020-03-11 Thread Vitaly Kuznetsov
David Hildenbrand  writes:

> For now, distributions implement advanced udev rules to essentially
> - Don't online any hotplugged memory (s390x)
> - Online all memory to ZONE_NORMAL (e.g., most virt environments like
>   hyperv)
> - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken
>   care of (e.g., bare metal, special virt environments)
>
> In summary: All memory is usually onlined the same way, however, the
> kernel always has to ask userspace to come up with the same answer.
> E.g., HyperV always waits for a memory block to get onlined before
> continuing, otherwise it might end up adding memory faster than
> hotplugging it, which can result in strange OOM situations.
>
> Let's allow to specify a default online_type, not just "online" and
> "offline". This allows distributions to configure the default online_type
> when booting up and be done with it.
>
> We can now specify "offline", "online", "online_movable" and
> "online_kernel" via
> - "memhp_default_state=" on the kernel cmdline
> - /sys/devices/systemn/memory/auto_online_blocks
> just like we are able to specify for a single memory block via
> /sys/devices/systemn/memory/memoryX/state
>

Thank you for picking this up! 

It's been awhile since I've added CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
but I vaguely recall one problem: memory hotplug may happen *very* early
(just because some memory is presented to a VM as hotplug memory, it is
not in e820). It happens way before we launch userspace (including
udev). The question is -- which ZONE will this memory be assigned too?

'memhp_default_state=' resolves the issue but nobody likes additional
kernel parameters for anything but
debug. CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE was supposed to help, but it
is binary and distro-wide (so *all* deployments will get the same
default and as you validly stated we want it differently).

We could've added something like your example onlining script to the
kernel itself but this is likely going to be hard to sell: "policies
belong to userspace!" will likely be the answer. 

So if we don't want to start the endless discussions (again), your
proposal is 'good enough'.


> Cc: Greg Kroah-Hartman 
> Cc: Andrew Morton 
> Cc: Michal Hocko 
> Cc: Oscar Salvador 
> Cc: "Rafael J. Wysocki" 
> Cc: Baoquan He 
> Cc: Wei Yang 
> Signed-off-by: David Hildenbrand 
> ---
>  drivers/base/memory.c  | 11 +--
>  include/linux/memory_hotplug.h |  2 ++
>  mm/memory_hotplug.c|  8 
>  3 files changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 8d3e16dab69f..2b09b68b9f78 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = {
>   [MMOP_ONLINE_MOVABLE] = "online_movable",
>  };
>  
> -static int memhp_online_type_from_str(const char *str)
> +int memhp_online_type_from_str(const char *str)
>  {
>   int i;
>  
> @@ -394,13 +394,12 @@ static ssize_t auto_online_blocks_store(struct device 
> *dev,
>   struct device_attribute *attr,
>   const char *buf, size_t count)
>  {
> - if (sysfs_streq(buf, "online"))
> - memhp_default_online_type = MMOP_ONLINE;
> - else if (sysfs_streq(buf, "offline"))
> - memhp_default_online_type = MMOP_OFFLINE;
> - else
> + const int online_type = memhp_online_type_from_str(buf);
> +
> + if (online_type < 0)
>   return -EINVAL;
>  
> + memhp_default_online_type = online_type;
>   return count;
>  }
>  
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index c6e090b34c4b..ef55115320fb 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size,
>   struct mhp_restrictions *restrictions);
>  extern u64 max_mem_size;
>  
> +extern int memhp_online_type_from_str(const char *str);
> +
>  /* Default online_type (MMOP_*) when new memory blocks are added. */
>  extern int memhp_default_online_type;
>  /* If movable_node boot option specified */
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 01443c70aa27..4a96273eafa7 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -75,10 +75,10 @@ EXPORT_SYMBOL_GPL(memhp_default_online_type);
>  
>  static int __init setup_memhp_default_state(char *str)
>  {
> - if (!strcmp(str, "online"))
> - memhp_default_online_type = MMOP_ONLINE;
> - else if (!strcmp(str, "offline"))
> - memhp_default_online_type = MMOP_OFFLINE;
> + const int online_type = memhp_online_type_from_str(str);
> +
> + if (online_type >= 0)
> + memhp_default_online_type = online_type;
>  
>   return 1;
>  }

-- 
Vitaly



Re: [RFC 00/11] perf: Enhancing perf to export processor hazard information

2020-03-11 Thread Ravi Bangoria

Hi Kim,

On 3/6/20 3:36 AM, Kim Phillips wrote:

On 3/3/20 3:55 AM, Kim Phillips wrote:

On 3/2/20 2:21 PM, Stephane Eranian wrote:

On Mon, Mar 2, 2020 at 2:13 AM Peter Zijlstra  wrote:


On Mon, Mar 02, 2020 at 10:53:44AM +0530, Ravi Bangoria wrote:

Modern processors export such hazard data in Performance
Monitoring Unit (PMU) registers. Ex, 'Sampled Instruction Event
Register' on IBM PowerPC[1][2] and 'Instruction-Based Sampling' on
AMD[3] provides similar information.

Implementation detail:

A new sample_type called PERF_SAMPLE_PIPELINE_HAZ is introduced.
If it's set, kernel converts arch specific hazard information
into generic format:

    struct perf_pipeline_haz_data {
   /* Instruction/Opcode type: Load, Store, Branch  */
   __u8    itype;
   /* Instruction Cache source */
   __u8    icache;
   /* Instruction suffered hazard in pipeline stage */
   __u8    hazard_stage;
   /* Hazard reason */
   __u8    hazard_reason;
   /* Instruction suffered stall in pipeline stage */
   __u8    stall_stage;
   /* Stall reason */
   __u8    stall_reason;
   __u16   pad;
    };


Kim, does this format indeed work for AMD IBS?


It's not really 1:1, we don't have these separations of stages
and reasons, for example: we have missed in L2 cache, for example.
So IBS output is flatter, with more cycle latency figures than
IBM's AFAICT.


AMD IBS captures pipeline latency data incase Fetch sampling like the
Fetch latency, tag to retire latency, completion to retire latency and
so on. Yes, Ops sampling do provide more data on load/store centric
information. But it also captures more detailed data for Branch instructions.
And we also looked at ARM SPE, which also captures more details pipeline
data and latency information.


Personally, I don't like the term hazard. This is too IBM Power
specific. We need to find a better term, maybe stall or penalty.


Right, IBS doesn't have a filter to only count stalled or otherwise
bad events.  IBS' PPR descriptions has one occurrence of the
word stall, and no penalty.  The way I read IBS is it's just
reporting more sample data than just the precise IP: things like
hits, misses, cycle latencies, addresses, types, etc., so words
like 'extended', or the 'auxiliary' already used today even
are more appropriate for IBS, although I'm the last person to
bikeshed.


We are thinking of using "pipeline" word instead of Hazard.


Hm, the word 'pipeline' occurs 0 times in IBS documentation.


NP. We thought pipeline is generic hw term so we proposed "pipeline"
word. We are open to term which can be generic enough.



I realize there are a couple of core pipeline-specific pieces
of information coming out of it, but the vast majority
are addresses, latencies of various components in the memory
hierarchy, and various component hit/miss bits.


Yes. we should capture core pipeline specific details. For example,
IBS generates Branch unit information(IbsOpData1) and Icahce related
data(IbsFetchCtl) which is something that shouldn't be extended as
part of perf-mem, IMO.



What's needed here is a vendor-specific extended
sample information that all these technologies gather,
of which things like e.g., 'L1 TLB cycle latency' we
all should have in common.


Yes. We will include fields to capture the latency cycles (like Issue
latency, Instruction completion latency etc..) along with other pipeline
details in the proposed structure.



I'm not sure why a new PERF_SAMPLE_PIPELINE_HAZ is needed
either.  Can we use PERF_SAMPLE_AUX instead?


We took a look at PERF_SAMPLE_AUX. IIUC, PERF_SAMPLE_AUX is intended when
large volume of data needs to be captured as part of perf.data without
frequent PMIs. But proposed type is to address the capture of pipeline
information on each sample using PMI at periodic intervals. Hence proposing
PERF_SAMPLE_PIPELINE_HAZ.


 Take a look at
commit 98dcf14d7f9c "perf tools: Add kernel AUX area sampling
definitions".  The sample identifier can be used to determine
which vendor's sampling IP's data is in it, and events can
be recorded just by copying the content of the SIER, etc.
registers, and then events get synthesized from the aux
sample at report/inject/annotate etc. time.  This allows
for less sample recording overhead, and moves all the vendor
specific decoding and common event conversions for userspace
to figure out.


When AUX buffer data is structured, tool side changes added to present the
pipeline data can be re-used.




Also worth considering is the support of ARM SPE (Statistical
Profiling Extension) which is their version of IBS.
Whatever gets added need to cover all three with no limitations.


I thought Intel's various LBR, PEBS, and PT supported providing
similar sample data in perf already, like with perf mem/c2c?


perf-mem is more of data centric in my opinion. It is more towards
memory profiling. So proposal here is to expose pipeline related
det

Re: [PATCH v3] ima: add a new CONFIG for loading arch-specific policies

2020-03-11 Thread Mimi Zohar
On Sun, 2020-03-08 at 20:57 -0400, Nayna Jain wrote:
> From: Nayna Jain 
> 
> Every time a new architecture defines the IMA architecture specific
> functions - arch_ima_get_secureboot() and arch_ima_get_policy(), the IMA
> include file needs to be updated. To avoid this "noise", this patch
> defines a new IMA Kconfig IMA_SECURE_AND_OR_TRUSTED_BOOT option, allowing
> the different architectures to select it.
> 
> Suggested-by: Linus Torvalds 
> Signed-off-by: Nayna Jain 
> Acked-by: Ard Biesheuvel 
> Cc: Philipp Rudo 
> Cc: Michael Ellerman 

Thanks, Michael for the suggestion of using "imply".  Seems to be
working nicely.  Thanks, Nayna.  I pushed this patch out to next-
integrity-testing.  Could we get some tags on this version of the
patch?

thanks,

Mimi



Re: [PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type

2020-03-11 Thread David Hildenbrand
On 11.03.20 15:26, Wei Yang wrote:
> On Wed, Mar 11, 2020 at 01:30:26PM +0100, David Hildenbrand wrote:
>> For now, distributions implement advanced udev rules to essentially
>> - Don't online any hotplugged memory (s390x)
>> - Online all memory to ZONE_NORMAL (e.g., most virt environments like
>>  hyperv)
>> - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken
>>  care of (e.g., bare metal, special virt environments)
>>
>> In summary: All memory is usually onlined the same way, however, the
>> kernel always has to ask userspace to come up with the same answer.
>> E.g., HyperV always waits for a memory block to get onlined before
>> continuing, otherwise it might end up adding memory faster than
>> hotplugging it, which can result in strange OOM situations.
>>
>> Let's allow to specify a default online_type, not just "online" and
>> "offline". This allows distributions to configure the default online_type
>> when booting up and be done with it.
>>
>> We can now specify "offline", "online", "online_movable" and
>> "online_kernel" via
>> - "memhp_default_state=" on the kernel cmdline
>> - /sys/devices/systemn/memory/auto_online_blocks
>> just like we are able to specify for a single memory block via
>> /sys/devices/systemn/memory/memoryX/state
>>
>> Cc: Greg Kroah-Hartman 
>> Cc: Andrew Morton 
>> Cc: Michal Hocko 
>> Cc: Oscar Salvador 
>> Cc: "Rafael J. Wysocki" 
>> Cc: Baoquan He 
>> Cc: Wei Yang 
>> Signed-off-by: David Hildenbrand 
> 
> Ok, I got the reason to leave the change on string compare here.

Thanks for your *very fast* review! :)


-- 
Thanks,

David / dhildenb



Re: [PATCH] hwmon: (ibmpowernv) Use scnprintf() for avoiding potential buffer overflow

2020-03-11 Thread Guenter Roeck
On Wed, Mar 11, 2020 at 08:39:44AM +0100, Takashi Iwai wrote:
> Since snprintf() returns the would-be-output size instead of the
> actual output size, the succeeding calls may go beyond the given
> buffer limit.  Fix it by replacing with scnprintf().
> 
> Signed-off-by: Takashi Iwai 

Applied to hwmon-next.

Thanks,
Guenter

> ---
>  drivers/hwmon/ibmpowernv.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c
> index 0e525cfbdfc5..a750647e66a4 100644
> --- a/drivers/hwmon/ibmpowernv.c
> +++ b/drivers/hwmon/ibmpowernv.c
> @@ -186,7 +186,7 @@ static void make_sensor_label(struct device_node *np,
>   u32 id;
>   size_t n;
>  
> - n = snprintf(sdata->label, sizeof(sdata->label), "%s", label);
> + n = scnprintf(sdata->label, sizeof(sdata->label), "%s", label);
>  
>   /*
>* Core temp pretty print
> @@ -199,11 +199,11 @@ static void make_sensor_label(struct device_node *np,
>* The digital thermal sensors are associated
>* with a core.
>*/
> - n += snprintf(sdata->label + n,
> + n += scnprintf(sdata->label + n,
> sizeof(sdata->label) - n, " %d",
> cpuid);
>   else
> - n += snprintf(sdata->label + n,
> + n += scnprintf(sdata->label + n,
> sizeof(sdata->label) - n, " phy%d", id);
>   }
>  
> @@ -211,7 +211,7 @@ static void make_sensor_label(struct device_node *np,
>* Membuffer pretty print
>*/
>   if (!of_property_read_u32(np, "ibm,chip-id", &id))
> - n += snprintf(sdata->label + n, sizeof(sdata->label) - n,
> + n += scnprintf(sdata->label + n, sizeof(sdata->label) - n,
> " %d", id & 0x);
>  }
>  
> -- 
> 2.16.4
> 


Re: [PATCH v1 3/5] drivers/base/memory: store mapping between MMOP_* and string in an array

2020-03-11 Thread Wei Yang
On Wed, Mar 11, 2020 at 02:20:02PM +, Wei Yang wrote:
>On Wed, Mar 11, 2020 at 01:30:24PM +0100, David Hildenbrand wrote:
>>Let's use a simple array which we can reuse soon. While at it, move the
>>string->mmop conversion out of the device hotplug lock.
>>
>>Cc: Greg Kroah-Hartman 
>>Cc: Andrew Morton 
>>Cc: Michal Hocko 
>>Cc: Oscar Salvador 
>>Cc: "Rafael J. Wysocki" 
>>Cc: Baoquan He 
>>Cc: Wei Yang 
>>Signed-off-by: David Hildenbrand 

Ok, I got the reason.

Reviewed-by: Wei Yang 

>>---
>> drivers/base/memory.c | 38 +++---
>> 1 file changed, 23 insertions(+), 15 deletions(-)
>>
>>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>>index e7e77cafef80..8a7f29c0bf97 100644
>>--- a/drivers/base/memory.c
>>+++ b/drivers/base/memory.c
>>@@ -28,6 +28,24 @@
>> 
>> #define MEMORY_CLASS_NAME"memory"
>> 
>>+static const char *const online_type_to_str[] = {
>>+ [MMOP_OFFLINE] = "offline",
>>+ [MMOP_ONLINE] = "online",
>>+ [MMOP_ONLINE_KERNEL] = "online_kernel",
>>+ [MMOP_ONLINE_MOVABLE] = "online_movable",
>>+};
>>+
>>+static int memhp_online_type_from_str(const char *str)
>>+{
>>+ int i;
>>+
>>+ for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
>>+ if (sysfs_streq(str, online_type_to_str[i]))
>>+ return i;
>>+ }
>>+ return -EINVAL;
>>+}
>>+
>> #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
>> 
>> static int sections_per_block;
>>@@ -236,26 +254,17 @@ static int memory_subsys_offline(struct device *dev)
>> static ssize_t state_store(struct device *dev, struct device_attribute *attr,
>> const char *buf, size_t count)
>> {
>>+ const int online_type = memhp_online_type_from_str(buf);
>
>In your following patch, you did the same conversion. Is it possible to merge
>them into this one?
>
>>  struct memory_block *mem = to_memory_block(dev);
>>- int ret, online_type;
>>+ int ret;
>>+
>>+ if (online_type < 0)
>>+ return -EINVAL;
>> 
>>  ret = lock_device_hotplug_sysfs();
>>  if (ret)
>>  return ret;
>> 
>>- if (sysfs_streq(buf, "online_kernel"))
>>- online_type = MMOP_ONLINE_KERNEL;
>>- else if (sysfs_streq(buf, "online_movable"))
>>- online_type = MMOP_ONLINE_MOVABLE;
>>- else if (sysfs_streq(buf, "online"))
>>- online_type = MMOP_ONLINE;
>>- else if (sysfs_streq(buf, "offline"))
>>- online_type = MMOP_OFFLINE;
>>- else {
>>- ret = -EINVAL;
>>- goto err;
>>- }
>>-
>>  switch (online_type) {
>>  case MMOP_ONLINE_KERNEL:
>>  case MMOP_ONLINE_MOVABLE:
>>@@ -271,7 +280,6 @@ static ssize_t state_store(struct device *dev, struct 
>>device_attribute *attr,
>>  ret = -EINVAL; /* should never happen */
>>  }
>> 
>>-err:
>>  unlock_device_hotplug();
>> 
>>  if (ret < 0)
>>-- 
>>2.24.1
>
>-- 
>Wei Yang
>Help you, Help me

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type

2020-03-11 Thread Wei Yang
On Wed, Mar 11, 2020 at 01:30:26PM +0100, David Hildenbrand wrote:
>For now, distributions implement advanced udev rules to essentially
>- Don't online any hotplugged memory (s390x)
>- Online all memory to ZONE_NORMAL (e.g., most virt environments like
>  hyperv)
>- Online all memory to ZONE_MOVABLE in case the zone imbalance is taken
>  care of (e.g., bare metal, special virt environments)
>
>In summary: All memory is usually onlined the same way, however, the
>kernel always has to ask userspace to come up with the same answer.
>E.g., HyperV always waits for a memory block to get onlined before
>continuing, otherwise it might end up adding memory faster than
>hotplugging it, which can result in strange OOM situations.
>
>Let's allow to specify a default online_type, not just "online" and
>"offline". This allows distributions to configure the default online_type
>when booting up and be done with it.
>
>We can now specify "offline", "online", "online_movable" and
>"online_kernel" via
>- "memhp_default_state=" on the kernel cmdline
>- /sys/devices/systemn/memory/auto_online_blocks
>just like we are able to specify for a single memory block via
>/sys/devices/systemn/memory/memoryX/state
>
>Cc: Greg Kroah-Hartman 
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: "Rafael J. Wysocki" 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Signed-off-by: David Hildenbrand 

Ok, I got the reason to leave the change on string compare here.

Reviewed-by: Wei Yang 

>---
> drivers/base/memory.c  | 11 +--
> include/linux/memory_hotplug.h |  2 ++
> mm/memory_hotplug.c|  8 
> 3 files changed, 11 insertions(+), 10 deletions(-)
>
>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>index 8d3e16dab69f..2b09b68b9f78 100644
>--- a/drivers/base/memory.c
>+++ b/drivers/base/memory.c
>@@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = {
>   [MMOP_ONLINE_MOVABLE] = "online_movable",
> };
> 
>-static int memhp_online_type_from_str(const char *str)
>+int memhp_online_type_from_str(const char *str)
> {
>   int i;
> 
>@@ -394,13 +394,12 @@ static ssize_t auto_online_blocks_store(struct device 
>*dev,
>   struct device_attribute *attr,
>   const char *buf, size_t count)
> {
>-  if (sysfs_streq(buf, "online"))
>-  memhp_default_online_type = MMOP_ONLINE;
>-  else if (sysfs_streq(buf, "offline"))
>-  memhp_default_online_type = MMOP_OFFLINE;
>-  else
>+  const int online_type = memhp_online_type_from_str(buf);
>+
>+  if (online_type < 0)
>   return -EINVAL;
> 
>+  memhp_default_online_type = online_type;
>   return count;
> }
> 
>diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>index c6e090b34c4b..ef55115320fb 100644
>--- a/include/linux/memory_hotplug.h
>+++ b/include/linux/memory_hotplug.h
>@@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size,
>   struct mhp_restrictions *restrictions);
> extern u64 max_mem_size;
> 
>+extern int memhp_online_type_from_str(const char *str);
>+
> /* Default online_type (MMOP_*) when new memory blocks are added. */
> extern int memhp_default_online_type;
> /* If movable_node boot option specified */
>diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>index 01443c70aa27..4a96273eafa7 100644
>--- a/mm/memory_hotplug.c
>+++ b/mm/memory_hotplug.c
>@@ -75,10 +75,10 @@ EXPORT_SYMBOL_GPL(memhp_default_online_type);
> 
> static int __init setup_memhp_default_state(char *str)
> {
>-  if (!strcmp(str, "online"))
>-  memhp_default_online_type = MMOP_ONLINE;
>-  else if (!strcmp(str, "offline"))
>-  memhp_default_online_type = MMOP_OFFLINE;
>+  const int online_type = memhp_online_type_from_str(str);
>+
>+  if (online_type >= 0)
>+  memhp_default_online_type = online_type;
> 
>   return 1;
> }
>-- 
>2.24.1

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 4/5] mm/memory_hotplug: convert memhp_auto_online to store an online_type

2020-03-11 Thread Wei Yang
On Wed, Mar 11, 2020 at 01:30:25PM +0100, David Hildenbrand wrote:
>... and rename it to memhp_default_online_type. This is a preparation
>for more detailed default online behavior.
>
>Cc: Greg Kroah-Hartman 
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: "Rafael J. Wysocki" 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Cc: Benjamin Herrenschmidt 
>Cc: Paul Mackerras 
>Cc: Michael Ellerman 
>Cc: "K. Y. Srinivasan" 
>Cc: Haiyang Zhang 
>Cc: Stephen Hemminger 
>Cc: Wei Liu 
>Cc: Thomas Gleixner 
>Cc: linuxppc-dev@lists.ozlabs.org
>Cc: linux-hyp...@vger.kernel.org
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> arch/powerpc/platforms/powernv/memtrace.c |  2 +-
> drivers/base/memory.c | 10 --
> drivers/hv/hv_balloon.c   |  2 +-
> include/linux/memory_hotplug.h|  3 ++-
> mm/memory_hotplug.c   | 13 +++--
> 5 files changed, 15 insertions(+), 15 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
>b/arch/powerpc/platforms/powernv/memtrace.c
>index d6d64f8718e6..e15a600cfa4d 100644
>--- a/arch/powerpc/platforms/powernv/memtrace.c
>+++ b/arch/powerpc/platforms/powernv/memtrace.c
>@@ -235,7 +235,7 @@ static int memtrace_online(void)
>* If kernel isn't compiled with the auto online option
>* we need to online the memory ourselves.
>*/
>-  if (!memhp_auto_online) {
>+  if (memhp_default_online_type == MMOP_OFFLINE) {
>   lock_device_hotplug();
>   walk_memory_blocks(ent->start, ent->size, NULL,
>  online_mem_block);
>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>index 8a7f29c0bf97..8d3e16dab69f 100644
>--- a/drivers/base/memory.c
>+++ b/drivers/base/memory.c
>@@ -386,10 +386,8 @@ static DEVICE_ATTR_RO(block_size_bytes);
> static ssize_t auto_online_blocks_show(struct device *dev,
>  struct device_attribute *attr, char *buf)
> {
>-  if (memhp_auto_online)
>-  return sprintf(buf, "online\n");
>-  else
>-  return sprintf(buf, "offline\n");
>+  return sprintf(buf, "%s\n",
>+ online_type_to_str[memhp_default_online_type]);
> }
> 
> static ssize_t auto_online_blocks_store(struct device *dev,
>@@ -397,9 +395,9 @@ static ssize_t auto_online_blocks_store(struct device *dev,
>   const char *buf, size_t count)
> {
>   if (sysfs_streq(buf, "online"))
>-  memhp_auto_online = true;
>+  memhp_default_online_type = MMOP_ONLINE;
>   else if (sysfs_streq(buf, "offline"))
>-  memhp_auto_online = false;
>+  memhp_default_online_type = MMOP_OFFLINE;
>   else
>   return -EINVAL;
> 
>diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
>index a02ce43d778d..3b90fd12e0c5 100644
>--- a/drivers/hv/hv_balloon.c
>+++ b/drivers/hv/hv_balloon.c
>@@ -727,7 +727,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned 
>long size,
>   spin_unlock_irqrestore(&dm_device.ha_lock, flags);
> 
>   init_completion(&dm_device.ol_waitevent);
>-  dm_device.ha_waiting = !memhp_auto_online;
>+  dm_device.ha_waiting = memhp_default_online_type == 
>MMOP_OFFLINE;
> 
>   nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
>   ret = add_memory(nid, PFN_PHYS((start_pfn)),
>diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>index c2e06ed5e0e9..c6e090b34c4b 100644
>--- a/include/linux/memory_hotplug.h
>+++ b/include/linux/memory_hotplug.h
>@@ -117,7 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size,
>   struct mhp_restrictions *restrictions);
> extern u64 max_mem_size;
> 
>-extern bool memhp_auto_online;
>+/* Default online_type (MMOP_*) when new memory blocks are added. */
>+extern int memhp_default_online_type;
> /* If movable_node boot option specified */
> extern bool movable_node_enabled;
> static inline bool movable_node_is_enabled(void)
>diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>index 1a00b5a37ef6..01443c70aa27 100644
>--- a/mm/memory_hotplug.c
>+++ b/mm/memory_hotplug.c
>@@ -67,18 +67,18 @@ void put_online_mems(void)
> bool movable_node_enabled = false;
> 
> #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
>-bool memhp_auto_online;
>+int memhp_default_online_type = MMOP_OFFLINE;
> #else
>-bool memhp_auto_online = true;
>+int memhp_default_online_type = MMOP_ONLINE;
> #endif
>-EXPORT_SYMBOL_GPL(memhp_auto_online);
>+EXPORT_SYMBOL_GPL(memhp_default_online_type);
> 
> static int __init setup_memhp_default_state(char *str)
> {
>   if (!strcmp(str, "online"))
>-  memhp_auto_online = true;
>+  memhp_default_online_type = MMOP_ONLINE;
>   else if (!strcmp(str, "offline"))
>-  memh

Re: [PATCH v1 3/5] drivers/base/memory: store mapping between MMOP_* and string in an array

2020-03-11 Thread Wei Yang
On Wed, Mar 11, 2020 at 01:30:24PM +0100, David Hildenbrand wrote:
>Let's use a simple array which we can reuse soon. While at it, move the
>string->mmop conversion out of the device hotplug lock.
>
>Cc: Greg Kroah-Hartman 
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: "Rafael J. Wysocki" 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Signed-off-by: David Hildenbrand 
>---
> drivers/base/memory.c | 38 +++---
> 1 file changed, 23 insertions(+), 15 deletions(-)
>
>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>index e7e77cafef80..8a7f29c0bf97 100644
>--- a/drivers/base/memory.c
>+++ b/drivers/base/memory.c
>@@ -28,6 +28,24 @@
> 
> #define MEMORY_CLASS_NAME "memory"
> 
>+static const char *const online_type_to_str[] = {
>+  [MMOP_OFFLINE] = "offline",
>+  [MMOP_ONLINE] = "online",
>+  [MMOP_ONLINE_KERNEL] = "online_kernel",
>+  [MMOP_ONLINE_MOVABLE] = "online_movable",
>+};
>+
>+static int memhp_online_type_from_str(const char *str)
>+{
>+  int i;
>+
>+  for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
>+  if (sysfs_streq(str, online_type_to_str[i]))
>+  return i;
>+  }
>+  return -EINVAL;
>+}
>+
> #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
> 
> static int sections_per_block;
>@@ -236,26 +254,17 @@ static int memory_subsys_offline(struct device *dev)
> static ssize_t state_store(struct device *dev, struct device_attribute *attr,
>  const char *buf, size_t count)
> {
>+  const int online_type = memhp_online_type_from_str(buf);

In your following patch, you did the same conversion. Is it possible to merge
them into this one?

>   struct memory_block *mem = to_memory_block(dev);
>-  int ret, online_type;
>+  int ret;
>+
>+  if (online_type < 0)
>+  return -EINVAL;
> 
>   ret = lock_device_hotplug_sysfs();
>   if (ret)
>   return ret;
> 
>-  if (sysfs_streq(buf, "online_kernel"))
>-  online_type = MMOP_ONLINE_KERNEL;
>-  else if (sysfs_streq(buf, "online_movable"))
>-  online_type = MMOP_ONLINE_MOVABLE;
>-  else if (sysfs_streq(buf, "online"))
>-  online_type = MMOP_ONLINE;
>-  else if (sysfs_streq(buf, "offline"))
>-  online_type = MMOP_OFFLINE;
>-  else {
>-  ret = -EINVAL;
>-  goto err;
>-  }
>-
>   switch (online_type) {
>   case MMOP_ONLINE_KERNEL:
>   case MMOP_ONLINE_MOVABLE:
>@@ -271,7 +280,6 @@ static ssize_t state_store(struct device *dev, struct 
>device_attribute *attr,
>   ret = -EINVAL; /* should never happen */
>   }
> 
>-err:
>   unlock_device_hotplug();
> 
>   if (ret < 0)
>-- 
>2.24.1

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 2/5] drivers/base/memory: map MMOP_OFFLINE to 0

2020-03-11 Thread Wei Yang
On Wed, Mar 11, 2020 at 01:30:23PM +0100, David Hildenbrand wrote:
>I have no idea why we have to start at -1. Just treat 0 as the special
>case. Clarify a comment (which was wrong, when we come via
>device_online() the first time, the online_type would have been 0 /
>MEM_ONLINE). The default is now always MMOP_OFFLINE.
>
>This is a preparation to use the online_type as an array index.
>
>Cc: Greg Kroah-Hartman 
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: "Rafael J. Wysocki" 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/base/memory.c  | 11 ---
> include/linux/memory_hotplug.h |  2 +-
> 2 files changed, 5 insertions(+), 8 deletions(-)
>
>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>index 8c5ce42c0fc3..e7e77cafef80 100644
>--- a/drivers/base/memory.c
>+++ b/drivers/base/memory.c
>@@ -211,17 +211,14 @@ static int memory_subsys_online(struct device *dev)
>   return 0;
> 
>   /*
>-   * If we are called from state_store(), online_type will be
>-   * set >= 0 Otherwise we were called from the device online
>-   * attribute and need to set the online_type.
>+   * When called via device_online() without configuring the online_type,
>+   * we want to default to MMOP_ONLINE.
>*/
>-  if (mem->online_type < 0)
>+  if (mem->online_type == MMOP_OFFLINE)
>   mem->online_type = MMOP_ONLINE;
> 
>   ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
>-
>-  /* clear online_type */
>-  mem->online_type = -1;
>+  mem->online_type = MMOP_OFFLINE;
> 
>   return ret;
> }
>diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>index 261dbf010d5d..c2e06ed5e0e9 100644
>--- a/include/linux/memory_hotplug.h
>+++ b/include/linux/memory_hotplug.h
>@@ -48,7 +48,7 @@ enum {
> /* Types for control the zone type of onlined and offlined memory */
> enum {
>   /* Offline the memory. */
>-  MMOP_OFFLINE = -1,
>+  MMOP_OFFLINE = 0,
>   /* Online the memory. Zone depends, see default_zone_for_pfn(). */
>   MMOP_ONLINE,
>   /* Online the memory to ZONE_NORMAL. */
>-- 
>2.24.1

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 1/5] drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE

2020-03-11 Thread Wei Yang
On Wed, Mar 11, 2020 at 01:30:22PM +0100, David Hildenbrand wrote:
>The name is misleading. Let's just name it like the online_type name we
>expose to user space ("online").
>
>Add some documentation to the types.
>
>Cc: Greg Kroah-Hartman 
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: "Rafael J. Wysocki" 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/base/memory.c  | 9 +
> include/linux/memory_hotplug.h | 6 +-
> 2 files changed, 10 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>index 6448c9ece2cb..8c5ce42c0fc3 100644
>--- a/drivers/base/memory.c
>+++ b/drivers/base/memory.c
>@@ -216,7 +216,7 @@ static int memory_subsys_online(struct device *dev)
>* attribute and need to set the online_type.
>*/
>   if (mem->online_type < 0)
>-  mem->online_type = MMOP_ONLINE_KEEP;
>+  mem->online_type = MMOP_ONLINE;
> 
>   ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
> 
>@@ -251,7 +251,7 @@ static ssize_t state_store(struct device *dev, struct 
>device_attribute *attr,
>   else if (sysfs_streq(buf, "online_movable"))
>   online_type = MMOP_ONLINE_MOVABLE;
>   else if (sysfs_streq(buf, "online"))
>-  online_type = MMOP_ONLINE_KEEP;
>+  online_type = MMOP_ONLINE;
>   else if (sysfs_streq(buf, "offline"))
>   online_type = MMOP_OFFLINE;
>   else {
>@@ -262,7 +262,7 @@ static ssize_t state_store(struct device *dev, struct 
>device_attribute *attr,
>   switch (online_type) {
>   case MMOP_ONLINE_KERNEL:
>   case MMOP_ONLINE_MOVABLE:
>-  case MMOP_ONLINE_KEEP:
>+  case MMOP_ONLINE:
>   /* mem->online_type is protected by device_hotplug_lock */
>   mem->online_type = online_type;
>   ret = device_online(&mem->dev);
>@@ -342,7 +342,8 @@ static ssize_t valid_zones_show(struct device *dev,
>   }
> 
>   nid = mem->nid;
>-  default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, 
>nr_pages);
>+  default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
>+nr_pages);
>   strcat(buf, default_zone->name);
> 
>   print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
>diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>index f4d59155f3d4..261dbf010d5d 100644
>--- a/include/linux/memory_hotplug.h
>+++ b/include/linux/memory_hotplug.h
>@@ -47,9 +47,13 @@ enum {
> 
> /* Types for control the zone type of onlined and offlined memory */
> enum {
>+  /* Offline the memory. */
>   MMOP_OFFLINE = -1,
>-  MMOP_ONLINE_KEEP,
>+  /* Online the memory. Zone depends, see default_zone_for_pfn(). */
>+  MMOP_ONLINE,
>+  /* Online the memory to ZONE_NORMAL. */
>   MMOP_ONLINE_KERNEL,
>+  /* Online the memory to ZONE_MOVABLE. */
>   MMOP_ONLINE_MOVABLE,
> };
> 
>-- 
>2.24.1

-- 
Wei Yang
Help you, Help me


[PATCH] ocxl: control via sysfs whether the FPGA is reloaded on a link reset

2020-03-11 Thread Philippe Bergheaud
Some opencapi FPGA images allow to control if the FPGA should be reloaded
on the next adapter reset. If it is supported, the image specifies it
through a Vendor Specific DVSEC in the config space of function 0.

This patch adds an interface to sysfs to control that behavior, if possible.

Signed-off-by: Philippe Bergheaud 
---
 Documentation/ABI/testing/sysfs-class-ocxl | 10 
 drivers/misc/ocxl/config.c | 59 +-
 drivers/misc/ocxl/ocxl_internal.h  |  6 +++
 drivers/misc/ocxl/sysfs.c  | 35 +
 include/misc/ocxl-config.h |  1 +
 5 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/Documentation/ABI/testing/sysfs-class-ocxl 
b/Documentation/ABI/testing/sysfs-class-ocxl
index b5b1fa197592..b9ea671d5805 100644
--- a/Documentation/ABI/testing/sysfs-class-ocxl
+++ b/Documentation/ABI/testing/sysfs-class-ocxl
@@ -33,3 +33,13 @@ Date:January 2018
 Contact:   linuxppc-dev@lists.ozlabs.org
 Description:   read/write
Give access the global mmio area for the AFU
+
+What:  /sys/class/ocxl//reload_on_reset
+Date:  February 2020
+Contact:   linuxppc-dev@lists.ozlabs.org
+Description:   read/write
+   Control whether the FPGA is reloaded on a link reset
+   0   Do not reload FPGA image from flash
+   1   Reload FPGA image from flash
+   unavailable
+   The device does not support this capability
diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
index c8e19bfb5ef9..3488463c1640 100644
--- a/drivers/misc/ocxl/config.c
+++ b/drivers/misc/ocxl/config.c
@@ -71,6 +71,20 @@ static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 
afu_idx)
return 0;
 }
 
+/**
+ * get_function_0() - Find a related PCI device (function 0)
+ * @device: PCI device to match
+ *
+ * Returns a pointer to the related device, or null if not found
+ */
+static struct pci_dev *get_function_0(struct pci_dev *dev)
+{
+   unsigned int devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+
+   return pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
+  dev->bus->number, devfn);
+}
+
 static void read_pasid(struct pci_dev *dev, struct ocxl_fn_config *fn)
 {
u16 val;
@@ -159,7 +173,7 @@ static int read_dvsec_afu_info(struct pci_dev *dev, struct 
ocxl_fn_config *fn)
 static int read_dvsec_vendor(struct pci_dev *dev)
 {
int pos;
-   u32 cfg, tlx, dlx;
+   u32 cfg, tlx, dlx, reset_reload;
 
/*
 * vendor specific DVSEC is optional
@@ -178,11 +192,54 @@ static int read_dvsec_vendor(struct pci_dev *dev)
pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_CFG_VERS, &cfg);
pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_TLX_VERS, &tlx);
pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_DLX_VERS, &dlx);
+   pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, 
&reset_reload);
 
dev_dbg(&dev->dev, "Vendor specific DVSEC:\n");
dev_dbg(&dev->dev, "  CFG version = 0x%x\n", cfg);
dev_dbg(&dev->dev, "  TLX version = 0x%x\n", tlx);
dev_dbg(&dev->dev, "  DLX version = 0x%x\n", dlx);
+   dev_dbg(&dev->dev, "  ResetReload = 0x%x\n", reset_reload);
+   return 0;
+}
+
+int ocxl_config_get_reset_reload(struct pci_dev *dev, int *val)
+{
+   int reset_reload = -1;
+   int pos = 0;
+   struct pci_dev *dev0 = get_function_0(dev);
+
+   if (dev0)
+   pos = find_dvsec(dev0, OCXL_DVSEC_VENDOR_ID);
+
+   if (pos)
+   pci_read_config_dword(dev0,
+ pos + OCXL_DVSEC_VENDOR_RESET_RELOAD,
+ &reset_reload);
+   if (reset_reload == -1)
+   return reset_reload;
+
+   *val = reset_reload & BIT(0);
+   return 0;
+}
+
+int ocxl_config_set_reset_reload(struct pci_dev *dev, int val)
+{
+   int reset_reload = -1;
+   int pos = 0;
+   struct pci_dev *dev0 = get_function_0(dev);
+
+   if (dev0)
+   pos = find_dvsec(dev0, OCXL_DVSEC_VENDOR_ID);
+
+   if (pos)
+   pci_read_config_dword(dev0,
+ pos + OCXL_DVSEC_VENDOR_RESET_RELOAD,
+ &reset_reload);
+   if (reset_reload == -1)
+   return reset_reload;
+
+   val &= BIT(0);
+   pci_write_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, val);
return 0;
 }
 
diff --git a/drivers/misc/ocxl/ocxl_internal.h 
b/drivers/misc/ocxl/ocxl_internal.h
index 345bf843a38e..af9a84aeee6f 100644
--- a/drivers/misc/ocxl/ocxl_internal.h
+++ b/drivers/misc/ocxl/ocxl_internal.h
@@ -112,6 +112,12 @@ void ocxl_actag_afu_free(struct ocxl_fn *fn, u32 start, 
u32 size);
  */
 int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count);
 
+/*
+ * Con

[PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type

2020-03-11 Thread David Hildenbrand
For now, distributions implement advanced udev rules to essentially
- Don't online any hotplugged memory (s390x)
- Online all memory to ZONE_NORMAL (e.g., most virt environments like
  hyperv)
- Online all memory to ZONE_MOVABLE in case the zone imbalance is taken
  care of (e.g., bare metal, special virt environments)

In summary: All memory is usually onlined the same way, however, the
kernel always has to ask userspace to come up with the same answer.
E.g., HyperV always waits for a memory block to get onlined before
continuing, otherwise it might end up adding memory faster than
hotplugging it, which can result in strange OOM situations.

Let's allow to specify a default online_type, not just "online" and
"offline". This allows distributions to configure the default online_type
when booting up and be done with it.

We can now specify "offline", "online", "online_movable" and
"online_kernel" via
- "memhp_default_state=" on the kernel cmdline
- /sys/devices/systemn/memory/auto_online_blocks
just like we are able to specify for a single memory block via
/sys/devices/systemn/memory/memoryX/state

Cc: Greg Kroah-Hartman 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: "Rafael J. Wysocki" 
Cc: Baoquan He 
Cc: Wei Yang 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 11 +--
 include/linux/memory_hotplug.h |  2 ++
 mm/memory_hotplug.c|  8 
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 8d3e16dab69f..2b09b68b9f78 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = {
[MMOP_ONLINE_MOVABLE] = "online_movable",
 };
 
-static int memhp_online_type_from_str(const char *str)
+int memhp_online_type_from_str(const char *str)
 {
int i;
 
@@ -394,13 +394,12 @@ static ssize_t auto_online_blocks_store(struct device 
*dev,
struct device_attribute *attr,
const char *buf, size_t count)
 {
-   if (sysfs_streq(buf, "online"))
-   memhp_default_online_type = MMOP_ONLINE;
-   else if (sysfs_streq(buf, "offline"))
-   memhp_default_online_type = MMOP_OFFLINE;
-   else
+   const int online_type = memhp_online_type_from_str(buf);
+
+   if (online_type < 0)
return -EINVAL;
 
+   memhp_default_online_type = online_type;
return count;
 }
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c6e090b34c4b..ef55115320fb 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size,
struct mhp_restrictions *restrictions);
 extern u64 max_mem_size;
 
+extern int memhp_online_type_from_str(const char *str);
+
 /* Default online_type (MMOP_*) when new memory blocks are added. */
 extern int memhp_default_online_type;
 /* If movable_node boot option specified */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 01443c70aa27..4a96273eafa7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -75,10 +75,10 @@ EXPORT_SYMBOL_GPL(memhp_default_online_type);
 
 static int __init setup_memhp_default_state(char *str)
 {
-   if (!strcmp(str, "online"))
-   memhp_default_online_type = MMOP_ONLINE;
-   else if (!strcmp(str, "offline"))
-   memhp_default_online_type = MMOP_OFFLINE;
+   const int online_type = memhp_online_type_from_str(str);
+
+   if (online_type >= 0)
+   memhp_default_online_type = online_type;
 
return 1;
 }
-- 
2.24.1



[PATCH v1 4/5] mm/memory_hotplug: convert memhp_auto_online to store an online_type

2020-03-11 Thread David Hildenbrand
... and rename it to memhp_default_online_type. This is a preparation
for more detailed default online behavior.

Cc: Greg Kroah-Hartman 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: "Rafael J. Wysocki" 
Cc: Baoquan He 
Cc: Wei Yang 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Wei Liu 
Cc: Thomas Gleixner 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-hyp...@vger.kernel.org
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c |  2 +-
 drivers/base/memory.c | 10 --
 drivers/hv/hv_balloon.c   |  2 +-
 include/linux/memory_hotplug.h|  3 ++-
 mm/memory_hotplug.c   | 13 +++--
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index d6d64f8718e6..e15a600cfa4d 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -235,7 +235,7 @@ static int memtrace_online(void)
 * If kernel isn't compiled with the auto online option
 * we need to online the memory ourselves.
 */
-   if (!memhp_auto_online) {
+   if (memhp_default_online_type == MMOP_OFFLINE) {
lock_device_hotplug();
walk_memory_blocks(ent->start, ent->size, NULL,
   online_mem_block);
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 8a7f29c0bf97..8d3e16dab69f 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -386,10 +386,8 @@ static DEVICE_ATTR_RO(block_size_bytes);
 static ssize_t auto_online_blocks_show(struct device *dev,
   struct device_attribute *attr, char *buf)
 {
-   if (memhp_auto_online)
-   return sprintf(buf, "online\n");
-   else
-   return sprintf(buf, "offline\n");
+   return sprintf(buf, "%s\n",
+  online_type_to_str[memhp_default_online_type]);
 }
 
 static ssize_t auto_online_blocks_store(struct device *dev,
@@ -397,9 +395,9 @@ static ssize_t auto_online_blocks_store(struct device *dev,
const char *buf, size_t count)
 {
if (sysfs_streq(buf, "online"))
-   memhp_auto_online = true;
+   memhp_default_online_type = MMOP_ONLINE;
else if (sysfs_streq(buf, "offline"))
-   memhp_auto_online = false;
+   memhp_default_online_type = MMOP_OFFLINE;
else
return -EINVAL;
 
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index a02ce43d778d..3b90fd12e0c5 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -727,7 +727,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned 
long size,
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
 
init_completion(&dm_device.ol_waitevent);
-   dm_device.ha_waiting = !memhp_auto_online;
+   dm_device.ha_waiting = memhp_default_online_type == 
MMOP_OFFLINE;
 
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c2e06ed5e0e9..c6e090b34c4b 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -117,7 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size,
struct mhp_restrictions *restrictions);
 extern u64 max_mem_size;
 
-extern bool memhp_auto_online;
+/* Default online_type (MMOP_*) when new memory blocks are added. */
+extern int memhp_default_online_type;
 /* If movable_node boot option specified */
 extern bool movable_node_enabled;
 static inline bool movable_node_is_enabled(void)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1a00b5a37ef6..01443c70aa27 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,18 +67,18 @@ void put_online_mems(void)
 bool movable_node_enabled = false;
 
 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-bool memhp_auto_online;
+int memhp_default_online_type = MMOP_OFFLINE;
 #else
-bool memhp_auto_online = true;
+int memhp_default_online_type = MMOP_ONLINE;
 #endif
-EXPORT_SYMBOL_GPL(memhp_auto_online);
+EXPORT_SYMBOL_GPL(memhp_default_online_type);
 
 static int __init setup_memhp_default_state(char *str)
 {
if (!strcmp(str, "online"))
-   memhp_auto_online = true;
+   memhp_default_online_type = MMOP_ONLINE;
else if (!strcmp(str, "offline"))
-   memhp_auto_online = false;
+   memhp_default_online_type = MMOP_OFFLINE;
 
return 1;
 }
@@ -991,6 +991,7 @@ static int check_hotplug_memory_range(u64 start, u64 size

[PATCH v1 0/5] mm/memory_hotplug: allow to specify a default online_type

2020-03-11 Thread David Hildenbrand
Distributions nowadays use udev rules ([1] [2]) to specify if and
how to online hotplugged memory. The rules seem to get more complex with
many special cases. Due to the various special cases,
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE cannot be used. All memory hotplug
is handled via udev rules.

Everytime we hotplug memory, the udev rule will come to the same
conclusion. Especially Hyper-V (but also soon virtio-mem) add a lot of
memory in separate memory blocks and wait for memory to get onlined by user
space before continuing to add more memory blocks (to not add memory faster
than it is getting onlined). This of course slows down the whole memory
hotplug process.

To make the job of distributions easier and to avoid udev rules that get
more and more complicated, let's extend the mechanism provided by
- /sys/devices/system/memory/auto_online_blocks
- "memhp_default_state=" on the kernel cmdline
to be able to specify also "online_movable" as well as "online_kernel"

=== Example /usr/libexec/config-memhotplug ===

#!/bin/bash

VIRT=`systemd-detect-virt --vm`
ARCH=`uname -p`

sense_virtio_mem() {
  if [ -d "/sys/bus/virtio/drivers/virtio_mem/" ]; then
DEVICES=`find /sys/bus/virtio/drivers/virtio_mem/ -maxdepth 1 -type l | wc 
-l`
if [ $DEVICES != "0" ]; then
return 0
fi
  fi
  return 1
}

if [ ! -e "/sys/devices/system/memory/auto_online_blocks" ]; then
  echo "Memory hotplug configuration support missing in the kernel"
  exit 1
fi

if grep "memhp_default_state=" /proc/cmdline > /dev/null; then
  echo "Memory hotplug configuration overridden in kernel cmdline 
(memhp_default_state=)"
  exit 1
fi

if [ $VIRT == "microsoft" ]; then
  echo "Detected Hyper-V on $ARCH"
  # Hyper-V wants all memory in ZONE_NORMAL
  ONLINE_TYPE="online_kernel"
elif sense_virtio_mem; then
  echo "Detected virtio-mem on $ARCH"
  # virtio-mem wants all memory in ZONE_NORMAL
  ONLINE_TYPE="online_kernel"
elif [ $ARCH == "s390x" ] || [ $ARCH == "s390" ]; then
  echo "Detected $ARCH"
  # standby memory should not be onlined automatically
  ONLINE_TYPE="offline"
elif [ $ARCH == "ppc64" ] || [ $ARCH == "ppc64le" ]; then
  echo "Detected" $ARCH
  # PPC64 is handled via a user space daemon AFAIK
  ONLINE_TYPE="offline"
elif [ $VIRT == "none" ]; then
  echo "Detected bare-metal on $ARCH"
  # Bare metal users expect hotplugged memory to be unpluggable. We assume
  # that ZONE imbalances on such enterpise servers cannot happen and is
  # properly documented
  ONLINE_TYPE="online_movable"
else
  # TODO: Hypervisors that want to unplug DIMMs and can guarantee that ZONE
  # imbalances won't happen
  echo "Detected $VIRT on $ARCH"
  # Usually, ballooning is used in virtual environments, so memory should go to
  # ZONE_NORMAL. However, sometimes "movable_node" is relevant.
  ONLINE_TYPE="online"
fi

echo "Selected online_type:" $ONLINE_TYPE

# Configure what to do with memory that will be hotplugged in the future
echo $ONLINE_TYPE 2>/dev/null > /sys/devices/system/memory/auto_online_blocks
if [ $? != "0" ]; then
  echo "Memory hotplug cannot be configured (e.g., old kernel or missing 
permissions)"
  # A backup udev rule should handle old kernels if necessary
  exit 1
fi

# Process all already pluggedd blocks (e.g., DIMMs, but also Hyper-V or 
virtio-mem)
if [ $ONLINE_TYPE != "offline" ]; then
  for MEMORY in /sys/devices/system/memory/memory*; do
STATE=`cat $MEMORY/state`
if [ $STATE == "offline" ]; then
echo $ONLINE_TYPE > $MEMORY/state
fi
  done
fi


=== Example /usr/lib/systemd/system/config-memhotplug.service ===

[Unit]
Description=Configure memory hotplug behavior
DefaultDependencies=no
Conflicts=shutdown.target
Before=sysinit.target shutdown.target
After=systemd-modules-load.service
ConditionPathExists=|/sys/devices/system/memory/auto_online_blocks

[Service]
ExecStart=/usr/libexec/config-memhotplug
Type=oneshot
TimeoutSec=0
RemainAfterExit=yes

[Install]
WantedBy=sysinit.target


=== Example modification to the 40-redhat.rules [2] ===

diff --git a/40-redhat.rules b/40-redhat.rules-new
index 2c690e5..168fd03 100644
--- a/40-redhat.rules
+++ b/40-redhat.rules-new
@@ -6,6 +6,9 @@ SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", 
ATTR{online}=="0", ATTR{online}
 # Memory hotadd request
 SUBSYSTEM!="memory", GOTO="memory_hotplug_end"
 ACTION!="add", GOTO="memory_hotplug_end"
+# memory hotplug behavior configured
+PROGRAM=="grep online /sys/devices/system/memory/auto_online_blocks", 
GOTO="memory_hotplug_end"
+
 PROGRAM="/bin/uname -p", RESULT=="s390*", GOTO="memory_hotplug_end"

 ENV{.state}="online"

===


[1] https://github.com/lnykryn/systemd-rhel/pull/281
[2] https://github.com/lnykryn/systemd-rhel/blob/staging/rules/40-redhat.rules

Cc: Vitaly Kuznetsov 
Cc: Yumei Huang 
Cc: Igor Mammedov 
Cc: Baoquan He 
Cc: Eduardo Habkost 
Cc: Milan Zamazal 

David Hildenbrand (5):
  drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE
  drivers/base/memory: map MMOP_OFFLINE to 0
  drivers/base/memory: store ma

[PATCH v1 2/5] drivers/base/memory: map MMOP_OFFLINE to 0

2020-03-11 Thread David Hildenbrand
I have no idea why we have to start at -1. Just treat 0 as the special
case. Clarify a comment (which was wrong, when we come via
device_online() the first time, the online_type would have been 0 /
MEM_ONLINE). The default is now always MMOP_OFFLINE.

This is a preparation to use the online_type as an array index.

Cc: Greg Kroah-Hartman 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: "Rafael J. Wysocki" 
Cc: Baoquan He 
Cc: Wei Yang 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 11 ---
 include/linux/memory_hotplug.h |  2 +-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 8c5ce42c0fc3..e7e77cafef80 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -211,17 +211,14 @@ static int memory_subsys_online(struct device *dev)
return 0;
 
/*
-* If we are called from state_store(), online_type will be
-* set >= 0 Otherwise we were called from the device online
-* attribute and need to set the online_type.
+* When called via device_online() without configuring the online_type,
+* we want to default to MMOP_ONLINE.
 */
-   if (mem->online_type < 0)
+   if (mem->online_type == MMOP_OFFLINE)
mem->online_type = MMOP_ONLINE;
 
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
-
-   /* clear online_type */
-   mem->online_type = -1;
+   mem->online_type = MMOP_OFFLINE;
 
return ret;
 }
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 261dbf010d5d..c2e06ed5e0e9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -48,7 +48,7 @@ enum {
 /* Types for control the zone type of onlined and offlined memory */
 enum {
/* Offline the memory. */
-   MMOP_OFFLINE = -1,
+   MMOP_OFFLINE = 0,
/* Online the memory. Zone depends, see default_zone_for_pfn(). */
MMOP_ONLINE,
/* Online the memory to ZONE_NORMAL. */
-- 
2.24.1



[PATCH v1 3/5] drivers/base/memory: store mapping between MMOP_* and string in an array

2020-03-11 Thread David Hildenbrand
Let's use a simple array which we can reuse soon. While at it, move the
string->mmop conversion out of the device hotplug lock.

Cc: Greg Kroah-Hartman 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: "Rafael J. Wysocki" 
Cc: Baoquan He 
Cc: Wei Yang 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c | 38 +++---
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index e7e77cafef80..8a7f29c0bf97 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -28,6 +28,24 @@
 
 #define MEMORY_CLASS_NAME  "memory"
 
+static const char *const online_type_to_str[] = {
+   [MMOP_OFFLINE] = "offline",
+   [MMOP_ONLINE] = "online",
+   [MMOP_ONLINE_KERNEL] = "online_kernel",
+   [MMOP_ONLINE_MOVABLE] = "online_movable",
+};
+
+static int memhp_online_type_from_str(const char *str)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
+   if (sysfs_streq(str, online_type_to_str[i]))
+   return i;
+   }
+   return -EINVAL;
+}
+
 #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
 
 static int sections_per_block;
@@ -236,26 +254,17 @@ static int memory_subsys_offline(struct device *dev)
 static ssize_t state_store(struct device *dev, struct device_attribute *attr,
   const char *buf, size_t count)
 {
+   const int online_type = memhp_online_type_from_str(buf);
struct memory_block *mem = to_memory_block(dev);
-   int ret, online_type;
+   int ret;
+
+   if (online_type < 0)
+   return -EINVAL;
 
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
 
-   if (sysfs_streq(buf, "online_kernel"))
-   online_type = MMOP_ONLINE_KERNEL;
-   else if (sysfs_streq(buf, "online_movable"))
-   online_type = MMOP_ONLINE_MOVABLE;
-   else if (sysfs_streq(buf, "online"))
-   online_type = MMOP_ONLINE;
-   else if (sysfs_streq(buf, "offline"))
-   online_type = MMOP_OFFLINE;
-   else {
-   ret = -EINVAL;
-   goto err;
-   }
-
switch (online_type) {
case MMOP_ONLINE_KERNEL:
case MMOP_ONLINE_MOVABLE:
@@ -271,7 +280,6 @@ static ssize_t state_store(struct device *dev, struct 
device_attribute *attr,
ret = -EINVAL; /* should never happen */
}
 
-err:
unlock_device_hotplug();
 
if (ret < 0)
-- 
2.24.1



[PATCH v1 1/5] drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE

2020-03-11 Thread David Hildenbrand
The name is misleading. Let's just name it like the online_type name we
expose to user space ("online").

Add some documentation to the types.

Cc: Greg Kroah-Hartman 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: "Rafael J. Wysocki" 
Cc: Baoquan He 
Cc: Wei Yang 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 9 +
 include/linux/memory_hotplug.h | 6 +-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 6448c9ece2cb..8c5ce42c0fc3 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -216,7 +216,7 @@ static int memory_subsys_online(struct device *dev)
 * attribute and need to set the online_type.
 */
if (mem->online_type < 0)
-   mem->online_type = MMOP_ONLINE_KEEP;
+   mem->online_type = MMOP_ONLINE;
 
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
 
@@ -251,7 +251,7 @@ static ssize_t state_store(struct device *dev, struct 
device_attribute *attr,
else if (sysfs_streq(buf, "online_movable"))
online_type = MMOP_ONLINE_MOVABLE;
else if (sysfs_streq(buf, "online"))
-   online_type = MMOP_ONLINE_KEEP;
+   online_type = MMOP_ONLINE;
else if (sysfs_streq(buf, "offline"))
online_type = MMOP_OFFLINE;
else {
@@ -262,7 +262,7 @@ static ssize_t state_store(struct device *dev, struct 
device_attribute *attr,
switch (online_type) {
case MMOP_ONLINE_KERNEL:
case MMOP_ONLINE_MOVABLE:
-   case MMOP_ONLINE_KEEP:
+   case MMOP_ONLINE:
/* mem->online_type is protected by device_hotplug_lock */
mem->online_type = online_type;
ret = device_online(&mem->dev);
@@ -342,7 +342,8 @@ static ssize_t valid_zones_show(struct device *dev,
}
 
nid = mem->nid;
-   default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, 
nr_pages);
+   default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
+ nr_pages);
strcat(buf, default_zone->name);
 
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f4d59155f3d4..261dbf010d5d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -47,9 +47,13 @@ enum {
 
 /* Types for control the zone type of onlined and offlined memory */
 enum {
+   /* Offline the memory. */
MMOP_OFFLINE = -1,
-   MMOP_ONLINE_KEEP,
+   /* Online the memory. Zone depends, see default_zone_for_pfn(). */
+   MMOP_ONLINE,
+   /* Online the memory to ZONE_NORMAL. */
MMOP_ONLINE_KERNEL,
+   /* Online the memory to ZONE_MOVABLE. */
MMOP_ONLINE_MOVABLE,
 };
 
-- 
2.24.1



[PATCH 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline

2020-03-11 Thread Srikar Dronamraju
Currently Linux kernel with CONFIG_NUMA on a system with multiple
possible nodes, marks node 0 as online at boot.  However in practice,
there are systems which have node 0 as memoryless and cpuless.

This can cause numa_balancing to be enabled on systems with only one node
with memory and CPUs. The existence of this dummy node which is cpuless and
memoryless node can confuse users/scripts looking at output of lscpu /
numactl.

Lets stop assuming that Node 0 is always online.

v5.6-rc4
 available: 2 nodes (0,2)
 node 0 cpus:
 node 0 size: 0 MB
 node 0 free: 0 MB
 node 2 cpus: 0 1 2 3 4 5 6 7
 node 2 size: 32625 MB
 node 2 free: 31490 MB
 node distances:
 node   0   2
   0:  10  20
   2:  20  10

proc and sys files
--
 /sys/devices/system/node/online:0,2
 /proc/sys/kernel/numa_balancing:1
 /sys/devices/system/node/has_cpu:   2
 /sys/devices/system/node/has_memory:2
 /sys/devices/system/node/has_normal_memory: 2
 /sys/devices/system/node/possible:  0-31

v5.6-rc4 + patch
--
 available: 1 nodes (2)
 node 2 cpus: 0 1 2 3 4 5 6 7
 node 2 size: 32625 MB
 node 2 free: 31487 MB
 node distances:
 node   2
   2:  10

proc and sys files
--
/sys/devices/system/node/online:2
/proc/sys/kernel/numa_balancing:0
/sys/devices/system/node/has_cpu:   2
/sys/devices/system/node/has_memory:2
/sys/devices/system/node/has_normal_memory: 2
/sys/devices/system/node/possible:  0-31

Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org
Cc: Michal Hocko 
Cc: Mel Gorman 
Cc: Vlastimil Babka 
Cc: "Kirill A. Shutemov" 
Cc: Christopher Lameter 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Signed-off-by: Srikar Dronamraju 
---
 mm/page_alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c4eb75..68e635f4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -116,8 +116,10 @@ struct pcpu_drain {
  */
 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
[N_POSSIBLE] = NODE_MASK_ALL,
+#ifdef CONFIG_NUMA
+   [N_ONLINE] = NODE_MASK_NONE,
+#else
[N_ONLINE] = { { [0] = 1UL } },
-#ifndef CONFIG_NUMA
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
 #ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
-- 
1.8.3.1



Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-11 Thread Michal Hocko
On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote:
> A Powerpc system with multiple possible nodes and with CONFIG_NUMA
> enabled always used to have a node 0, even if node 0 does not any cpus
> or memory attached to it. As per PAPR, node affinity of a cpu is only
> available once its present / online. For all cpus that are possible but
> not present, cpu_to_node() would point to node 0.
> 
> To ensure a cpuless, memoryless dummy node is not online, powerpc need
> to make sure all possible but not present cpu_to_node are set to a
> proper node.

Just curious, is this somehow related to
http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz?

> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux...@kvack.org
> Cc: linux-ker...@vger.kernel.org
> Cc: Michal Hocko 
> Cc: Mel Gorman 
> Cc: Vlastimil Babka 
> Cc: "Kirill A. Shutemov" 
> Cc: Christopher Lameter 
> Cc: Michael Ellerman 
> Cc: Andrew Morton 
> Cc: Linus Torvalds 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/mm/numa.c | 16 ++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 8a399db..54dcd49 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -931,8 +931,20 @@ void __init mem_topology_setup(void)
>  
>   reset_numa_cpu_lookup_table();
>  
> - for_each_present_cpu(cpu)
> - numa_setup_cpu(cpu);
> + for_each_possible_cpu(cpu) {
> + /*
> +  * Powerpc with CONFIG_NUMA always used to have a node 0,
> +  * even if it was memoryless or cpuless. For all cpus that
> +  * are possible but not present, cpu_to_node() would point
> +  * to node 0. To remove a cpuless, memoryless dummy node,
> +  * powerpc need to make sure all possible but not present
> +  * cpu_to_node are set to a proper node.
> +  */
> + if (cpu_present(cpu))
> + numa_setup_cpu(cpu);
> + else
> + set_cpu_numa_node(cpu, first_online_node);
> + }
>  }
>  
>  void __init initmem_init(void)
> -- 
> 1.8.3.1

-- 
Michal Hocko
SUSE Labs


[PATCH 2/3] powerpc/numa: Prefer node id queried from vphn

2020-03-11 Thread Srikar Dronamraju
Node id queried from the static device tree may not
be correct. For example: it may always show 0 on a shared processor.
Hence prefer the node id queried from vphn and fallback on the device tree
based node id if vphn query fails.

Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org
Cc: Michal Hocko 
Cc: Mel Gorman 
Cc: Vlastimil Babka 
Cc: "Kirill A. Shutemov" 
Cc: Christopher Lameter 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/mm/numa.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 54dcd49..8735fed 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -719,20 +719,20 @@ static int __init parse_numa_properties(void)
 */
for_each_present_cpu(i) {
struct device_node *cpu;
-   int nid;
-
-   cpu = of_get_cpu_node(i, NULL);
-   BUG_ON(!cpu);
-   nid = of_node_to_nid_single(cpu);
-   of_node_put(cpu);
+   int nid = vphn_get_nid(i);
 
/*
 * Don't fall back to default_nid yet -- we will plug
 * cpus into nodes once the memory scan has discovered
 * the topology.
 */
-   if (nid < 0)
-   continue;
+   if (nid == NUMA_NO_NODE) {
+   cpu = of_get_cpu_node(i, NULL);
+   if (cpu) {
+   nid = of_node_to_nid_single(cpu);
+   of_node_put(cpu);
+   }
+   }
node_set_online(nid);
}
 
-- 
1.8.3.1



[PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-11 Thread Srikar Dronamraju
A Powerpc system with multiple possible nodes and with CONFIG_NUMA
enabled always used to have a node 0, even if node 0 does not any cpus
or memory attached to it. As per PAPR, node affinity of a cpu is only
available once its present / online. For all cpus that are possible but
not present, cpu_to_node() would point to node 0.

To ensure a cpuless, memoryless dummy node is not online, powerpc need
to make sure all possible but not present cpu_to_node are set to a
proper node.

Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org
Cc: Michal Hocko 
Cc: Mel Gorman 
Cc: Vlastimil Babka 
Cc: "Kirill A. Shutemov" 
Cc: Christopher Lameter 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/mm/numa.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8a399db..54dcd49 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -931,8 +931,20 @@ void __init mem_topology_setup(void)
 
reset_numa_cpu_lookup_table();
 
-   for_each_present_cpu(cpu)
-   numa_setup_cpu(cpu);
+   for_each_possible_cpu(cpu) {
+   /*
+* Powerpc with CONFIG_NUMA always used to have a node 0,
+* even if it was memoryless or cpuless. For all cpus that
+* are possible but not present, cpu_to_node() would point
+* to node 0. To remove a cpuless, memoryless dummy node,
+* powerpc need to make sure all possible but not present
+* cpu_to_node are set to a proper node.
+*/
+   if (cpu_present(cpu))
+   numa_setup_cpu(cpu);
+   else
+   set_cpu_numa_node(cpu, first_online_node);
+   }
 }
 
 void __init initmem_init(void)
-- 
1.8.3.1



[PATCH 0/3] Offline memoryless cpuless node 0

2020-03-11 Thread Srikar Dronamraju
Linux kernel configured with CONFIG_NUMA on a system with multiple
possible nodes, marks node 0 as online at boot. However in practice,
there are systems which have node 0 as memoryless and cpuless.

This can cause
1. numa_balancing to be enabled on systems with only one online node.
2. Existence of dummy (cpuless and memoryless) node which can confuse
users/scripts looking at output of lscpu / numactl.

This patchset wants to correct this anomaly.

This should only affect systems that have CONFIG_MEMORYLESS_NODES.
Currently there are only 2 architectures ia64 and powerpc that have this
config.

v5.6-rc4
 available: 2 nodes (0,2)
 node 0 cpus:
 node 0 size: 0 MB
 node 0 free: 0 MB
 node 2 cpus: 0 1 2 3 4 5 6 7
 node 2 size: 32625 MB
 node 2 free: 31490 MB
 node distances:
 node   0   2
   0:  10  20
   2:  20  10

proc and sys files
--
 /sys/devices/system/node/online:0,2
 /proc/sys/kernel/numa_balancing:1
 /sys/devices/system/node/has_cpu:   2
 /sys/devices/system/node/has_memory:2
 /sys/devices/system/node/has_normal_memory: 2
 /sys/devices/system/node/possible:  0-31

v5.6-rc4 + patches
--
 available: 1 nodes (2)
 node 2 cpus: 0 1 2 3 4 5 6 7
 node 2 size: 32625 MB
 node 2 free: 31487 MB
 node distances:
 node   2
   2:  10

proc and sys files
--
/sys/devices/system/node/online:2
/proc/sys/kernel/numa_balancing:0
/sys/devices/system/node/has_cpu:   2
/sys/devices/system/node/has_memory:2
/sys/devices/system/node/has_normal_memory: 2
/sys/devices/system/node/possible:  0-31

Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org
Cc: Michal Hocko 
Cc: Mel Gorman 
Cc: Vlastimil Babka 
Cc: "Kirill A. Shutemov" 
Cc: Christopher Lameter 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: Linus Torvalds 

Srikar Dronamraju (3):
  powerpc/numa: Set numa_node for all possible cpus
  powerpc/numa: Prefer node id queried from vphn
  mm/page_alloc: Keep memoryless cpuless node 0 offline

 arch/powerpc/mm/numa.c | 32 ++--
 mm/page_alloc.c|  4 +++-
 2 files changed, 25 insertions(+), 11 deletions(-)

-- 
1.8.3.1



[PATCH v2] powerpc test_emulate_step: fix DS operand in ld encoding to appropriate value

2020-03-11 Thread Balamuruhan S
ld instruction should have 14 bit immediate field (DS) concatenated with
0b00 on the right, encode it accordingly. Introduce macro `IMM_DS()`
to encode DS form instructions with 14 bit immediate field.

Fixes: 4ceae137bdab ("powerpc: emulate_step() tests for load/store 
instructions")
Reviewed-by: Sandipan Das 
Signed-off-by: Balamuruhan S 
---
 arch/powerpc/lib/test_emulate_step.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
---
changes in v2:
-
* squash the commits as per Christophe's review comment

diff --git a/arch/powerpc/lib/test_emulate_step.c 
b/arch/powerpc/lib/test_emulate_step.c
index 42347067739c..007292a1ad01 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -13,19 +13,20 @@
 #include 
 
 #define IMM_L(i)   ((uintptr_t)(i) & 0x)
+#define IMM_DS(i)  ((uintptr_t)(i) & 0xfffc)
 
 /*
  * Defined with TEST_ prefix so it does not conflict with other
  * definitions.
  */
 #define TEST_LD(r, base, i)(PPC_INST_LD | ___PPC_RT(r) |   \
-   ___PPC_RA(base) | IMM_L(i))
+   ___PPC_RA(base) | IMM_DS(i))
 #define TEST_LWZ(r, base, i)   (PPC_INST_LWZ | ___PPC_RT(r) |  \
___PPC_RA(base) | IMM_L(i))
 #define TEST_LWZX(t, a, b) (PPC_INST_LWZX | ___PPC_RT(t) | \
___PPC_RA(a) | ___PPC_RB(b))
 #define TEST_STD(r, base, i)   (PPC_INST_STD | ___PPC_RS(r) |  \
-   ___PPC_RA(base) | ((i) & 0xfffc))
+   ___PPC_RA(base) | IMM_DS(i))
 #define TEST_LDARX(t, a, b, eh)(PPC_INST_LDARX | ___PPC_RT(t) |
\
___PPC_RA(a) | ___PPC_RB(b) |   \
__PPC_EH(eh))

base-commit: 5aa19adac1f3152a5fd3b865a1ab46bb845d3696
-- 
2.24.1



[PATCH v3 6/6] pseries/sysfs: Minimise IPI noise while reading [idle_][s]purr

2020-03-11 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently purr, spurr, idle_purr, idle_spurr are exposed for every CPU
via the sysfs interface
/sys/devices/system/cpu/cpuX/[idle_][s]purr. Each sysfs read currently
generates an IPI to obtain the desired value from the target CPU X.
Since these aforementioned sysfs files are typically read one after
another, we end up generating 4 IPIs per CPU in a short duration.

In order to minimize the IPI noise, this patch caches the values of
all the four entities whenever one of them is read. If subsequently
any of these are read within the next 10ms, the cached value is
returned. With this, we will generate at most one IPI every 10ms for
every CPU.

Test-results: While reading the four sysfs files back-to-back for a
given CPU every second for 100 seconds.

Without the patch:
 16 [XICS 2 Edge IPI] = 422 times
 DBL [Doorbell interrupts] = 13 times
 Total : 435 IPIs.

With the patch:
  16 [XICS 2 Edge IPI] = 111 times
  DBL [Doorbell interrupts] = 17 times
  Total : 128 IPIs.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/sysfs.c | 109 
 1 file changed, 90 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c9ddb83..db8fc90 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -586,8 +586,6 @@ void ppc_enable_pmcs(void)
  * SPRs which are not related to PMU.
  */
 #ifdef CONFIG_PPC64
-SYSFS_SPRSETUP(purr, SPRN_PURR);
-SYSFS_SPRSETUP(spurr, SPRN_SPURR);
 SYSFS_SPRSETUP(pir, SPRN_PIR);
 SYSFS_SPRSETUP(tscr, SPRN_TSCR);
 
@@ -596,8 +594,6 @@ void ppc_enable_pmcs(void)
   enable write when needed with a separate function.
   Lets be conservative and default to pseries.
 */
-static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
-static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
 static DEVICE_ATTR(pir, 0400, show_pir, NULL);
 static DEVICE_ATTR(tscr, 0600, show_tscr, store_tscr);
 #endif /* CONFIG_PPC64 */
@@ -761,39 +757,114 @@ static void create_svm_file(void)
 }
 #endif /* CONFIG_PPC_SVM */
 
-static void read_idle_purr(void *val)
+/*
+ * The duration (in ms) from the last IPI to the target CPU until
+ * which a cached value of purr, spurr, idle_purr, idle_spurr can be
+ * reported to the user on a corresponding sysfs file read. Beyond
+ * this duration, fresh values need to be obtained by sending IPIs to
+ * the target CPU when the sysfs files are read.
+ */
+static unsigned long util_stats_staleness_tolerance_ms = 10;
+struct util_acct_stats {
+   u64 latest_purr;
+   u64 latest_spurr;
+   u64 latest_idle_purr;
+   u64 latest_idle_spurr;
+   unsigned long last_update_jiffies;
+};
+
+DEFINE_PER_CPU(struct util_acct_stats, util_acct_stats);
+
+static void update_util_acct_stats(void *ptr)
 {
-   u64 *ret = val;
+   struct util_acct_stats *stats = ptr;
 
-   *ret = read_this_idle_purr();
+   stats->latest_purr = mfspr(SPRN_PURR);
+   stats->latest_spurr = mfspr(SPRN_SPURR);
+   stats->latest_idle_purr = read_this_idle_purr();
+   stats->latest_idle_spurr = read_this_idle_spurr();
+   stats->last_update_jiffies = jiffies;
 }
 
-static ssize_t idle_purr_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+struct util_acct_stats *get_util_stats_ptr(int cpu)
+{
+   struct util_acct_stats *stats = per_cpu_ptr(&util_acct_stats, cpu);
+   unsigned long delta_jiffies;
+
+   delta_jiffies = jiffies - stats->last_update_jiffies;
+
+   /*
+* If we have a recent enough data, reuse that instead of
+* sending an IPI.
+*/
+   if (jiffies_to_msecs(delta_jiffies) < util_stats_staleness_tolerance_ms)
+   return stats;
+
+   smp_call_function_single(cpu, update_util_acct_stats, stats, 1);
+   return stats;
+}
+
+static ssize_t show_purr(struct device *dev,
+struct device_attribute *attr, char *buf)
 {
struct cpu *cpu = container_of(dev, struct cpu, dev);
-   u64 val;
+   struct util_acct_stats *stats;
 
-   smp_call_function_single(cpu->dev.id, read_idle_purr, &val, 1);
-   return sprintf(buf, "%llx\n", val);
+   stats = get_util_stats_ptr(cpu->dev.id);
+   return sprintf(buf, "%llx\n", stats->latest_purr);
 }
-static DEVICE_ATTR(idle_purr, 0400, idle_purr_show, NULL);
 
-static void read_idle_spurr(void *val)
+static void write_purr(void *val)
 {
-   u64 *ret = val;
+   mtspr(SPRN_PURR, *(unsigned long *)val);
+}
 
-   *ret = read_this_idle_spurr();
+static ssize_t __used store_purr(struct device *dev,
+struct device_attribute *attr,
+const char *buf, size_t count)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+   unsigned long val;
+   int ret = kstrtoul(buf, 16, 

[PATCH v3 0/6] Track and expose idle PURR and SPURR ticks

2020-03-11 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Hi,

This is the third version of the patches to track and expose idle PURR
and SPURR ticks. These patches are required by tools such as lparstat
to compute system utilization for capacity planning purposes.

The previous versions can be found here:
v2: https://lkml.org/lkml/2020/2/21/21
v1: https://lore.kernel.org/patchwork/cover/1159341/

They key changes from v2 are:

- The prolog and epilog functions have been named
  pseries_idle_prolog() and pseries_idle_epilog() respectively to
  indicate their pseries specific nature.

- Fixed the Documentation for
  /sys/devices/system/cpu/cpuX/idle_spurr as pointed out by
  Nathan Lynch.

- Introduces a patch (Patch 6/6) to send an IPI in order to read
  and cache the values of purr, spurr, idle_purr and idle_spurr of
  the target CPU when any one of them is read via sysfs. These
  cached values will be presented if any of these sysfs are read
  within the next 10ms. If these sysfs files are read after 10ms
  from the earlier IPI, a fresh IPI is issued to read and cache
  the values again. This minimizes the number of IPIs required to
  be sent when these values are read back-to-back via the sysfs
  interface.

Test-results: While reading the four sysfs files back-to-back for a
given CPU every second for 100 seconds.

Without patch 6/6 (Without caching): 
 16 [XICS 2 Edge IPI] = 422 times
 DBL [Doorbell interrupts] = 13 times
 Total : 435 IPIs.

With patch 6/6 (With caching):
  16 [XICS 2 Edge IPI] = 111 times
  DBL [Doorbell interrupts] = 17 times
  Total : 128 IPIs.
Motivation:
===
On PSeries LPARs, the data centers planners desire a more accurate
view of system utilization per resource such as CPU to plan the system
capacity requirements better. Such accuracy can be obtained by reading
PURR/SPURR registers for CPU resource utilization.

Tools such as lparstat which are used to compute the utilization need
to know [S]PURR ticks when the cpu was busy or idle. The [S]PURR
counters are already exposed through sysfs.  We already account for
PURR ticks when we go to idle so that we can update the VPA area. This
patchset extends support to account for SPURR ticks when idle, and
expose both via per-cpu sysfs files.

These patches are required for enhancement to the lparstat utility
that compute the CPU utilization based on PURR and SPURR which can be
found here :
https://groups.google.com/forum/#!topic/powerpc-utils-devel/fYRo69xO9r4


With the patches, when lparstat is run on a LPAR running CPU-Hogs,
=
sudo ./src/lparstat -E 1 3
System Configuration
type=Dedicated mode=Capped smt=8 lcpu=2 mem=4834176 kB cpus=0 ent=2.00 
---Actual--- -Normalized-
%busy  %idle   Frequency %busy  %idle
-- --  - -- --
1  99.99   0.00  3.35GHz[111%] 110.99   0.00
2 100.00   0.00  3.35GHz[111%] 111.00   0.00
3 100.00   0.00  3.35GHz[111%] 111.00   0.00
=

When lparstat is run on an LPAR that is idle,
=
$ sudo ./src/lparstat -E 1 3
System Configuration
type=Dedicated mode=Capped smt=8 lcpu=2 mem=4834176 kB cpus=0 ent=2.00 
---Actual--- -Normalized-
%busy  %idle   Frequency %busy  %idle
-- --  - -- --
1   0.71  99.30  2.18GHz[ 72%]   0.53  71.48
2   0.56  99.44  2.11GHz[ 70%]   0.43  69.57
3   0.54  99.46  2.11GHz[ 70%]   0.43  69.57
=
 

Gautham R. Shenoy (6):
  powerpc: Move idle_loop_prolog()/epilog() functions to header file
  powerpc/idle: Add accessor function to always read latest idle PURR
  powerpc/pseries: Account for SPURR ticks on idle CPUs
  powerpc/sysfs: Show idle_purr and idle_spurr for every CPU
  Documentation: Document sysfs interfaces purr, spurr, idle_purr, idle_spurr
  pseries/sysfs: Minimise IPI noise while reading [idle_][s]purr

 Documentation/ABI/testing/sysfs-devices-system-cpu |  39 ++
 arch/powerpc/include/asm/idle.h|  89 ++
 arch/powerpc/kernel/sysfs.c| 133 +++--
 arch/powerpc/platforms/pseries/setup.c |   8 +-
 drivers/cpuidle/cpuidle-pseries.c  |  39 ++
 5 files changed, 267 insertions(+), 41 deletions(-)
 create mode 100644 arch/powerpc/include/asm/idle.h

-- 
1.9.4



[PATCH v3 1/6] powerpc: Move idle_loop_prolog()/epilog() functions to header file

2020-03-11 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently prior to entering an idle state on a Linux Guest, the
pseries cpuidle driver implement an idle_loop_prolog() and
idle_loop_epilog() functions which ensure that idle_purr is correctly
computed, and the hypervisor is informed that the CPU cycles have been
donated.

These prolog and epilog functions are also required in the default
idle call, i.e pseries_lpar_idle(). Hence move these accessor
functions to a common header file and call them from
pseries_lpar_idle(). Since the existing header files such as
asm/processor.h have enough clutter, create a new header file
asm/idle.h. Finally rename idle_loop_prolog() and idle_loop_epilog()
to pseries_idle_prolog() and pseries_idle_epilog() as they are only
relavent for on pseries guests.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/include/asm/idle.h| 28 ++
 arch/powerpc/platforms/pseries/setup.c |  7 +--
 drivers/cpuidle/cpuidle-pseries.c  | 36 +++---
 3 files changed, 40 insertions(+), 31 deletions(-)
 create mode 100644 arch/powerpc/include/asm/idle.h

diff --git a/arch/powerpc/include/asm/idle.h b/arch/powerpc/include/asm/idle.h
new file mode 100644
index 000..e838ea5
--- /dev/null
+++ b/arch/powerpc/include/asm/idle.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_POWERPC_IDLE_H
+#define _ASM_POWERPC_IDLE_H
+#include 
+
+static inline void pseries_idle_prolog(unsigned long *in_purr)
+{
+   ppc64_runlatch_off();
+   *in_purr = mfspr(SPRN_PURR);
+   /*
+* Indicate to the HV that we are idle. Now would be
+* a good time to find other work to dispatch.
+*/
+   get_lppaca()->idle = 1;
+}
+
+static inline void pseries_idle_epilog(unsigned long in_purr)
+{
+   u64 wait_cycles;
+
+   wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
+   wait_cycles += mfspr(SPRN_PURR) - in_purr;
+   get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
+   get_lppaca()->idle = 0;
+
+   ppc64_runlatch_on();
+}
+#endif
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 0c8421d..2f53e6b 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -319,6 +320,8 @@ static int alloc_dispatch_log_kmem_cache(void)
 
 static void pseries_lpar_idle(void)
 {
+   unsigned long in_purr;
+
/*
 * Default handler to go into low thread priority and possibly
 * low power mode by ceding processor to hypervisor
@@ -328,7 +331,7 @@ static void pseries_lpar_idle(void)
return;
 
/* Indicate to hypervisor that we are idle. */
-   get_lppaca()->idle = 1;
+   pseries_idle_prolog(&in_purr);
 
/*
 * Yield the processor to the hypervisor.  We return if
@@ -339,7 +342,7 @@ static void pseries_lpar_idle(void)
 */
cede_processor();
 
-   get_lppaca()->idle = 0;
+   pseries_idle_epilog(in_purr);
 }
 
 /*
diff --git a/drivers/cpuidle/cpuidle-pseries.c 
b/drivers/cpuidle/cpuidle-pseries.c
index 74c2479..46d5e05 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 struct cpuidle_driver pseries_idle_driver = {
@@ -31,29 +32,6 @@ struct cpuidle_driver pseries_idle_driver = {
 static u64 snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
-static inline void idle_loop_prolog(unsigned long *in_purr)
-{
-   ppc64_runlatch_off();
-   *in_purr = mfspr(SPRN_PURR);
-   /*
-* Indicate to the HV that we are idle. Now would be
-* a good time to find other work to dispatch.
-*/
-   get_lppaca()->idle = 1;
-}
-
-static inline void idle_loop_epilog(unsigned long in_purr)
-{
-   u64 wait_cycles;
-
-   wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
-   wait_cycles += mfspr(SPRN_PURR) - in_purr;
-   get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
-   get_lppaca()->idle = 0;
-
-   ppc64_runlatch_on();
-}
-
 static int snooze_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
@@ -63,7 +41,7 @@ static int snooze_loop(struct cpuidle_device *dev,
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
-   idle_loop_prolog(&in_purr);
+   pseries_idle_prolog(&in_purr);
local_irq_enable();
snooze_exit_time = get_tb() + snooze_timeout;
 
@@ -87,7 +65,7 @@ static int snooze_loop(struct cpuidle_device *dev,
 
local_irq_disable();
 
-   idle_loop_epilog(in_purr);
+   pseries_idle_epilog(in_purr);
 
return index;
 }
@@ -115,7 +93,7 @@ static int dedicated_cede_loop(struct cpuidle_device *dev,
 {
unsigned long in_purr;
 

[PATCH v3 5/6] Documentation: Document sysfs interfaces purr, spurr, idle_purr, idle_spurr

2020-03-11 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Add documentation for the following sysfs interfaces:
/sys/devices/system/cpu/cpuX/purr
/sys/devices/system/cpu/cpuX/spurr
/sys/devices/system/cpu/cpuX/idle_purr
/sys/devices/system/cpu/cpuX/idle_spurr

Signed-off-by: Gautham R. Shenoy 
---
 Documentation/ABI/testing/sysfs-devices-system-cpu | 39 ++
 1 file changed, 39 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu 
b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 2e0e3b4..bc07677 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -580,3 +580,42 @@ Description:   Secure Virtual Machine
If 1, it means the system is using the Protected Execution
Facility in POWER9 and newer processors. i.e., it is a Secure
Virtual Machine.
+
+What:  /sys/devices/system/cpu/cpuX/purr
+Date:  Apr 2005
+Contact:   Linux for PowerPC mailing list 
+Description:   PURR ticks for this CPU since the system boot.
+
+   The Processor Utilization Resources Register (PURR) is
+   a 64-bit counter which provides an estimate of the
+   resources used by the CPU thread. The contents of this
+   register increases monotonically. This sysfs interface
+   exposes the number of PURR ticks for cpuX.
+
+What:  /sys/devices/system/cpu/cpuX/spurr
+Date:  Dec 2006
+Contact:   Linux for PowerPC mailing list 
+Description:   SPURR ticks for this CPU since the system boot.
+
+   The Scaled Processor Utilization Resources Register
+   (SPURR) is a 64-bit counter that provides a frequency
+   invariant estimate of the resources used by the CPU
+   thread. The contents of this register increases
+   monotonically. This sysfs interface exposes the number
+   of SPURR ticks for cpuX.
+
+What:  /sys/devices/system/cpu/cpuX/idle_purr
+Date:  Mar 2020
+Contact:   Linux for PowerPC mailing list 
+Description:   PURR ticks for cpuX when it was idle.
+
+   This sysfs interface exposes the number of PURR ticks
+   for cpuX when it was idle.
+
+What:  /sys/devices/system/cpu/cpuX/idle_spurr
+Date:  Mar 2020
+Contact:   Linux for PowerPC mailing list 
+Description:   SPURR ticks for cpuX when it was idle.
+
+   This sysfs interface exposes the number of SPURR ticks
+   for cpuX when it was idle.
-- 
1.9.4



[PATCH v3 3/6] powerpc/pseries: Account for SPURR ticks on idle CPUs

2020-03-11 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On Pseries LPARs, to calculate utilization, we need to know the
[S]PURR ticks when the CPUs were busy or idle.

Via pseries_idle_prolog(), pseries_idle_epilog(), we track the idle
PURR ticks in the VPA variable "wait_state_cycles". This patch extends
the support to account for the idle SPURR ticks. It also provides an
accessor function to accurately reads idle SPURR ticks.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/include/asm/idle.h| 33 +
 arch/powerpc/platforms/pseries/setup.c |  2 ++
 2 files changed, 35 insertions(+)

diff --git a/arch/powerpc/include/asm/idle.h b/arch/powerpc/include/asm/idle.h
index 7552823..a375589 100644
--- a/arch/powerpc/include/asm/idle.h
+++ b/arch/powerpc/include/asm/idle.h
@@ -3,13 +3,20 @@
 #define _ASM_POWERPC_IDLE_H
 #include 
 
+DECLARE_PER_CPU(u64, idle_spurr_cycles);
 DECLARE_PER_CPU(u64, idle_entry_purr_snap);
+DECLARE_PER_CPU(u64, idle_entry_spurr_snap);
 
 static inline void snapshot_purr_idle_entry(void)
 {
*this_cpu_ptr(&idle_entry_purr_snap) = mfspr(SPRN_PURR);
 }
 
+static inline void snapshot_spurr_idle_entry(void)
+{
+   *this_cpu_ptr(&idle_entry_spurr_snap) = mfspr(SPRN_SPURR);
+}
+
 static inline void update_idle_purr_accounting(void)
 {
u64 wait_cycles;
@@ -20,10 +27,19 @@ static inline void update_idle_purr_accounting(void)
get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
 }
 
+static inline void update_idle_spurr_accounting(void)
+{
+   u64 *idle_spurr_cycles_ptr = this_cpu_ptr(&idle_spurr_cycles);
+   u64 in_spurr = *this_cpu_ptr(&idle_entry_spurr_snap);
+
+   *idle_spurr_cycles_ptr += mfspr(SPRN_SPURR) - in_spurr;
+}
+
 static inline void pseries_idle_prolog(void)
 {
ppc64_runlatch_off();
snapshot_purr_idle_entry();
+   snapshot_spurr_idle_entry();
/*
 * Indicate to the HV that we are idle. Now would be
 * a good time to find other work to dispatch.
@@ -34,6 +50,7 @@ static inline void pseries_idle_prolog(void)
 static inline void pseries_idle_epilog(void)
 {
update_idle_purr_accounting();
+   update_idle_spurr_accounting();
get_lppaca()->idle = 0;
ppc64_runlatch_on();
 }
@@ -53,4 +70,20 @@ static inline u64 read_this_idle_purr(void)
 
return be64_to_cpu(get_lppaca()->wait_state_cycles);
 }
+
+static inline u64 read_this_idle_spurr(void)
+{
+   /*
+* If we are reading from an idle context, update the
+* idle-spurr cycles corresponding to the last idle period.
+* Since the idle context is not yet over, take a fresh
+* snapshot of the idle-spurr.
+*/
+   if (get_lppaca()->idle == 1) {
+   update_idle_spurr_accounting();
+   snapshot_spurr_idle_entry();
+   }
+
+   return *this_cpu_ptr(&idle_spurr_cycles);
+}
 #endif
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 4905c96..1b55e80 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -318,7 +318,9 @@ static int alloc_dispatch_log_kmem_cache(void)
 }
 machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache);
 
+DEFINE_PER_CPU(u64, idle_spurr_cycles);
 DEFINE_PER_CPU(u64, idle_entry_purr_snap);
+DEFINE_PER_CPU(u64, idle_entry_spurr_snap);
 static void pseries_lpar_idle(void)
 {
/*
-- 
1.9.4



[PATCH v3 2/6] powerpc/idle: Add accessor function to always read latest idle PURR

2020-03-11 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently when CPU goes idle, we take a snapshot of PURR via
pseries_idle_prolog() which is used at the CPU idle exit to compute
the idle PURR cycles via the function pseries_idle_epilog().  Thus,
the value of idle PURR cycle thus read before pseries_idle_prolog() and
after pseries_idle_epilog() is always correct.

However, if we were to read the idle PURR cycles from an interrupt
context between pseries_idle_prolog() and pseries_idle_epilog() (this will
be done in a future patch), then, the value of the idle PURR thus read
will not include the cycles spent in the most recent idle period.

This patch addresses the issue by providing accessor function to read
the idle PURR such such that it includes the cycles spent in the most
recent idle period, if we read it between pseries_idle_prolog() and
pseries_idle_epilog(). In order to achieve it, the patch saves the
snapshot of PURR in pseries_idle_prolog() in a per-cpu variable,
instead of on the stack, so that it can be accessed from an interrupt
context.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/include/asm/idle.h| 46 +++---
 arch/powerpc/platforms/pseries/setup.c |  7 +++---
 drivers/cpuidle/cpuidle-pseries.c  | 15 +--
 3 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/idle.h b/arch/powerpc/include/asm/idle.h
index e838ea5..7552823 100644
--- a/arch/powerpc/include/asm/idle.h
+++ b/arch/powerpc/include/asm/idle.h
@@ -3,10 +3,27 @@
 #define _ASM_POWERPC_IDLE_H
 #include 
 
-static inline void pseries_idle_prolog(unsigned long *in_purr)
+DECLARE_PER_CPU(u64, idle_entry_purr_snap);
+
+static inline void snapshot_purr_idle_entry(void)
+{
+   *this_cpu_ptr(&idle_entry_purr_snap) = mfspr(SPRN_PURR);
+}
+
+static inline void update_idle_purr_accounting(void)
+{
+   u64 wait_cycles;
+   u64 in_purr = *this_cpu_ptr(&idle_entry_purr_snap);
+
+   wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
+   wait_cycles += mfspr(SPRN_PURR) - in_purr;
+   get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
+}
+
+static inline void pseries_idle_prolog(void)
 {
ppc64_runlatch_off();
-   *in_purr = mfspr(SPRN_PURR);
+   snapshot_purr_idle_entry();
/*
 * Indicate to the HV that we are idle. Now would be
 * a good time to find other work to dispatch.
@@ -14,15 +31,26 @@ static inline void pseries_idle_prolog(unsigned long 
*in_purr)
get_lppaca()->idle = 1;
 }
 
-static inline void pseries_idle_epilog(unsigned long in_purr)
+static inline void pseries_idle_epilog(void)
 {
-   u64 wait_cycles;
-
-   wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
-   wait_cycles += mfspr(SPRN_PURR) - in_purr;
-   get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
+   update_idle_purr_accounting();
get_lppaca()->idle = 0;
-
ppc64_runlatch_on();
 }
+
+static inline u64 read_this_idle_purr(void)
+{
+   /*
+* If we are reading from an idle context, update the
+* idle-purr cycles corresponding to the last idle period.
+* Since the idle context is not yet over, take a fresh
+* snapshot of the idle-purr.
+*/
+   if (unlikely(get_lppaca()->idle == 1)) {
+   update_idle_purr_accounting();
+   snapshot_purr_idle_entry();
+   }
+
+   return be64_to_cpu(get_lppaca()->wait_state_cycles);
+}
 #endif
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 2f53e6b..4905c96 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -318,10 +318,9 @@ static int alloc_dispatch_log_kmem_cache(void)
 }
 machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache);
 
+DEFINE_PER_CPU(u64, idle_entry_purr_snap);
 static void pseries_lpar_idle(void)
 {
-   unsigned long in_purr;
-
/*
 * Default handler to go into low thread priority and possibly
 * low power mode by ceding processor to hypervisor
@@ -331,7 +330,7 @@ static void pseries_lpar_idle(void)
return;
 
/* Indicate to hypervisor that we are idle. */
-   pseries_idle_prolog(&in_purr);
+   pseries_idle_prolog();
 
/*
 * Yield the processor to the hypervisor.  We return if
@@ -342,7 +341,7 @@ static void pseries_lpar_idle(void)
 */
cede_processor();
 
-   pseries_idle_epilog(in_purr);
+   pseries_idle_epilog();
 }
 
 /*
diff --git a/drivers/cpuidle/cpuidle-pseries.c 
b/drivers/cpuidle/cpuidle-pseries.c
index 46d5e05..6513ef2 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -36,12 +36,11 @@ static int snooze_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
 {
-   unsigned long in_purr;
u64 snooze_exit_time;
 
set_

[PATCH v3 4/6] powerpc/sysfs: Show idle_purr and idle_spurr for every CPU

2020-03-11 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On Pseries LPARs, to calculate utilization, we need to know the
[S]PURR ticks when the CPUs were busy or idle.

The total PURR and SPURR ticks are already exposed via the per-cpu
sysfs files "purr" and "spurr". This patch adds support for exposing
the idle PURR and SPURR ticks via new per-cpu sysfs files named
"idle_purr" and "idle_spurr".

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/sysfs.c | 54 ++---
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 479c706..c9ddb83 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "cacheinfo.h"
@@ -760,6 +761,42 @@ static void create_svm_file(void)
 }
 #endif /* CONFIG_PPC_SVM */
 
+static void read_idle_purr(void *val)
+{
+   u64 *ret = val;
+
+   *ret = read_this_idle_purr();
+}
+
+static ssize_t idle_purr_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+   u64 val;
+
+   smp_call_function_single(cpu->dev.id, read_idle_purr, &val, 1);
+   return sprintf(buf, "%llx\n", val);
+}
+static DEVICE_ATTR(idle_purr, 0400, idle_purr_show, NULL);
+
+static void read_idle_spurr(void *val)
+{
+   u64 *ret = val;
+
+   *ret = read_this_idle_spurr();
+}
+
+static ssize_t idle_spurr_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+   u64 val;
+
+   smp_call_function_single(cpu->dev.id, read_idle_spurr, &val, 1);
+   return sprintf(buf, "%llx\n", val);
+}
+static DEVICE_ATTR(idle_spurr, 0400, idle_spurr_show, NULL);
+
 static int register_cpu_online(unsigned int cpu)
 {
struct cpu *c = &per_cpu(cpu_devices, cpu);
@@ -823,10 +860,15 @@ static int register_cpu_online(unsigned int cpu)
if (!firmware_has_feature(FW_FEATURE_LPAR))
add_write_permission_dev_attr(&dev_attr_purr);
device_create_file(s, &dev_attr_purr);
+   if (firmware_has_feature(FW_FEATURE_LPAR))
+   device_create_file(s, &dev_attr_idle_purr);
}
 
-   if (cpu_has_feature(CPU_FTR_SPURR))
+   if (cpu_has_feature(CPU_FTR_SPURR)) {
device_create_file(s, &dev_attr_spurr);
+   if (firmware_has_feature(FW_FEATURE_LPAR))
+   device_create_file(s, &dev_attr_idle_spurr);
+   }
 
if (cpu_has_feature(CPU_FTR_DSCR))
device_create_file(s, &dev_attr_dscr);
@@ -910,11 +952,17 @@ static int unregister_cpu_online(unsigned int cpu)
device_remove_file(s, &dev_attr_mmcra);
 #endif /* CONFIG_PMU_SYSFS */
 
-   if (cpu_has_feature(CPU_FTR_PURR))
+   if (cpu_has_feature(CPU_FTR_PURR)) {
device_remove_file(s, &dev_attr_purr);
+   if (firmware_has_feature(FW_FEATURE_LPAR))
+   device_remove_file(s, &dev_attr_idle_purr);
+   }
 
-   if (cpu_has_feature(CPU_FTR_SPURR))
+   if (cpu_has_feature(CPU_FTR_SPURR)) {
device_remove_file(s, &dev_attr_spurr);
+   if (firmware_has_feature(FW_FEATURE_LPAR))
+   device_remove_file(s, &dev_attr_idle_spurr);
+   }
 
if (cpu_has_feature(CPU_FTR_DSCR))
device_remove_file(s, &dev_attr_dscr);
-- 
1.9.4



[PATCH V16] mm/debug: Add tests validating architecture page table helpers

2020-03-11 Thread Anshuman Khandual
This adds tests which will validate architecture page table helpers and
other accessors in their compliance with expected generic MM semantics.
This will help various architectures in validating changes to existing
page table helpers or addition of new ones.

This test covers basic page table entry transformations including but not
limited to old, young, dirty, clean, write, write protect etc at various
level along with populating intermediate entries with next page table page
and validating them.

Test page table pages are allocated from system memory with required size
and alignments. The mapped pfns at page table levels are derived from a
real pfn representing a valid kernel text symbol. This test gets called
inside kernel_init() right after async_synchronize_full().

This test gets built and run when CONFIG_DEBUG_VM_PGTABLE is selected. Any
architecture, which is willing to subscribe this test will need to select
ARCH_HAS_DEBUG_VM_PGTABLE. For now this is limited to arc, arm64, x86, s390
and powerpc platforms where the test is known to build and run successfully
Going forward, other architectures too can subscribe the test after fixing
any build or runtime problems with their page table helpers. Meanwhile for
better platform coverage, the test can also be enabled with CONFIG_EXPERT
even without ARCH_HAS_DEBUG_VM_PGTABLE.

Folks interested in making sure that a given platform's page table helpers
conform to expected generic MM semantics should enable the above config
which will just trigger this test during boot. Any non conformity here will
be reported as an warning which would need to be fixed. This test will help
catch any changes to the agreed upon semantics expected from generic MM and
enable platforms to accommodate it thereafter.

Cc: Andrew Morton 
Cc: Mike Rapoport 
Cc: Vineet Gupta 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Kirill A. Shutemov 
Cc: Paul Walmsley 
Cc: Palmer Dabbelt 
Cc: linux-snps-...@lists.infradead.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s...@vger.kernel.org
Cc: linux-ri...@lists.infradead.org
Cc: x...@kernel.org
Cc: linux-a...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org

Suggested-by: Catalin Marinas 
Reviewed-by: Ingo Molnar 
Tested-by: Gerald Schaefer  # s390
Tested-by: Christophe Leroy# ppc32
Signed-off-by: Qian Cai 
Signed-off-by: Andrew Morton 
Signed-off-by: Christophe Leroy 
Signed-off-by: Anshuman Khandual 
---
This adds a test validation for architecture exported page table helpers.
Patch adds basic transformation tests at various levels of the page table.

This test was originally suggested by Catalin during arm64 THP migration
RFC discussion earlier. Going forward it can include more specific tests
with respect to various generic MM functions like THP, HugeTLB etc and
platform specific tests.

https://lore.kernel.org/linux-mm/20190628102003.ga56...@arrakis.emea.arm.com/

Needs to be applied on linux V5.6-rc5

Changes in V16:

- Replaced WRITE_ONCE() with set_pte_at() with a new barrier() in 
pte_clear_tests() per Qian
- Enabled all powerpc platforms and updated the feature list

Changes in V15: (https://patchwork.kernel.org/patch/11422803/)

- Replaced __pa() with __pa_symbol() 
(https://patchwork.kernel.org/patch/11407715/) 
- Replaced pte_alloc_map() with pte_alloc_map_lock() per Qian
- Replaced pte_unmap() with pte_unmap_unlock() per Qian
- Added address to pte_clear_tests() and passed it down till pte_clear() per 
Qian

Changes in V14: 
(https://patchwork.kernel.org/project/linux-mm/list/?series=242305)

- Disabled DEBUG_VM_PGTABLE for IA64 and ARM (32 Bit) per Andrew and Christophe
- Updated DEBUG_VM_PGTABLE documentation wrt EXPERT and disabled platforms
- Updated RANDOM_[OR|NZ]VALUE open encodings with GENMASK() per Catalin
- Updated s390 constraint bits from 12 to 4 (S390_MASK_BITS) per Gerald
- Updated in-code documentation for RANDOM_ORVALUE per Gerald
- Updated pxx_basic_tests() to use invert functions first per Catalin
- Dropped ARCH_HAS_4LEVEL_HACK check from pud_basic_tests()
- Replaced __ARCH_HAS_[4|5]LEVEL_HACK with __PAGETABLE_[PUD|P4D]_FOLDED per 
Catalin
- Trimmed the CC list on the commit message per Catalin

Changes in V13: 
(https://patchwork.kernel.org/project/linux-mm/list/?series=237125)

- Subscribed s390 platform and updated debug-vm-pgtable/arch-support.txt per 
Gerald
- Dropped keyword 'extern' from debug_vm_pgtable() declaration per Christophe
- Moved debug_vm_pgtable() declarations to  per Christophe
- Moved debug_vm_pgtable() call site into kernel_init() per Christophe
- Changed CONFIG_DEBUG_VM_PGTABLE rules per Christophe
- Updated commit to include new supported platforms and changed config selection

Changes in V12: 
(https://patchwork.kernel.org/project/linux

Re: [PATCH -next 017/491] CELL BROADBAND ENGINE ARCHITECTURE: Use fallthrough;

2020-03-11 Thread Arnd Bergmann
On Wed, Mar 11, 2020 at 6:07 AM Joe Perches  wrote:
>
> Convert the various uses of fallthrough comments to fallthrough;
>
> Done via script
> Link: 
> https://lore.kernel.org/lkml/b56602fcf79f849e733e7b521bb0e17895d390fa.1582230379.git.joe.com/
>
> Signed-off-by: Joe Perches 

Acked-by: Arnd Bergmann 


[PATCH] hwmon: (ibmpowernv) Use scnprintf() for avoiding potential buffer overflow

2020-03-11 Thread Takashi Iwai
Since snprintf() returns the would-be-output size instead of the
actual output size, the succeeding calls may go beyond the given
buffer limit.  Fix it by replacing with scnprintf().

Signed-off-by: Takashi Iwai 
---
 drivers/hwmon/ibmpowernv.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c
index 0e525cfbdfc5..a750647e66a4 100644
--- a/drivers/hwmon/ibmpowernv.c
+++ b/drivers/hwmon/ibmpowernv.c
@@ -186,7 +186,7 @@ static void make_sensor_label(struct device_node *np,
u32 id;
size_t n;
 
-   n = snprintf(sdata->label, sizeof(sdata->label), "%s", label);
+   n = scnprintf(sdata->label, sizeof(sdata->label), "%s", label);
 
/*
 * Core temp pretty print
@@ -199,11 +199,11 @@ static void make_sensor_label(struct device_node *np,
 * The digital thermal sensors are associated
 * with a core.
 */
-   n += snprintf(sdata->label + n,
+   n += scnprintf(sdata->label + n,
  sizeof(sdata->label) - n, " %d",
  cpuid);
else
-   n += snprintf(sdata->label + n,
+   n += scnprintf(sdata->label + n,
  sizeof(sdata->label) - n, " phy%d", id);
}
 
@@ -211,7 +211,7 @@ static void make_sensor_label(struct device_node *np,
 * Membuffer pretty print
 */
if (!of_property_read_u32(np, "ibm,chip-id", &id))
-   n += snprintf(sdata->label + n, sizeof(sdata->label) - n,
+   n += scnprintf(sdata->label + n, sizeof(sdata->label) - n,
  " %d", id & 0x);
 }
 
-- 
2.16.4



Re: [PATCH 1/2] powerpc test_emulate_step: fix DS operand in ld encoding to appropriate value

2020-03-11 Thread Balamuruhan S
On Wed, 2020-03-11 at 08:01 +0100, Christophe Leroy wrote:
> 
> Le 11/03/2020 à 07:14, Balamuruhan S a écrit :
> > ld instruction should have 14 bit immediate field (DS) concatenated
> > with
> > 0b00 on the right, encode it accordingly.
> > 
> > Fixes: 4ceae137bdab ("powerpc: emulate_step() tests for load/store
> > instructions")
> > Reviewed-by: Sandipan Das 
> > Signed-off-by: Balamuruhan S 
> > ---
> >   arch/powerpc/lib/test_emulate_step.c | 2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/lib/test_emulate_step.c
> > b/arch/powerpc/lib/test_emulate_step.c
> > index 42347067739c..51c254fd15b5 100644
> > --- a/arch/powerpc/lib/test_emulate_step.c
> > +++ b/arch/powerpc/lib/test_emulate_step.c
> > @@ -19,7 +19,7 @@
> >* definitions.
> >*/
> >   #define TEST_LD(r, base, i)   (PPC_INST_LD | ___PPC_RT(r) |   
> > \
> > -   ___PPC_RA(base) | IMM_L(i))
> > +   ___PPC_RA(base) | ((i) &
> > 0xfffc))
> 
> I think you should squash patch 1 and 2 together. Or at least you
> should
> put the new IMM_DS macro in patch 1 and use it instead of open
> coding.

sure, I will make the changes as suggested.

-- Bala

> 
> >   #define TEST_LWZ(r, base, i)  (PPC_INST_LWZ | ___PPC_RT(r) |  
> > \
> > ___PPC_RA(base) | IMM_L(i))
> >   #define TEST_LWZX(t, a, b)(PPC_INST_LWZX | ___PPC_RT(t) |
> > \
> > 
> > base-commit: 5aa19adac1f3152a5fd3b865a1ab46bb845d3696
> > 
> 
> Christophe



Re: [PATCH 1/2] powerpc test_emulate_step: fix DS operand in ld encoding to appropriate value

2020-03-11 Thread Christophe Leroy




Le 11/03/2020 à 07:14, Balamuruhan S a écrit :

ld instruction should have 14 bit immediate field (DS) concatenated with
0b00 on the right, encode it accordingly.

Fixes: 4ceae137bdab ("powerpc: emulate_step() tests for load/store 
instructions")
Reviewed-by: Sandipan Das 
Signed-off-by: Balamuruhan S 
---
  arch/powerpc/lib/test_emulate_step.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/test_emulate_step.c 
b/arch/powerpc/lib/test_emulate_step.c
index 42347067739c..51c254fd15b5 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -19,7 +19,7 @@
   * definitions.
   */
  #define TEST_LD(r, base, i)   (PPC_INST_LD | ___PPC_RT(r) |   \
-   ___PPC_RA(base) | IMM_L(i))
+   ___PPC_RA(base) | ((i) & 0xfffc))


I think you should squash patch 1 and 2 together. Or at least you should
put the new IMM_DS macro in patch 1 and use it instead of open coding.


  #define TEST_LWZ(r, base, i)  (PPC_INST_LWZ | ___PPC_RT(r) |  \
___PPC_RA(base) | IMM_L(i))
  #define TEST_LWZX(t, a, b)(PPC_INST_LWZX | ___PPC_RT(t) | \

base-commit: 5aa19adac1f3152a5fd3b865a1ab46bb845d3696



Christophe