[PATCH v4] powerpc: Replace setup_irq() by request_irq()
request_irq() is preferred over setup_irq(). Invocations of setup_irq() occur after memory allocators are ready. Per tglx[1], setup_irq() existed in olden days when allocators were not ready by the time early interrupts were initialized. Hence replace setup_irq() by request_irq(). [1] https://lkml.kernel.org/r/alpine.DEB.2.20.1710191609480.1971@nanos Signed-off-by: afzal mohammed --- v4: * pass non-NULL dev_id while requesting shared irq in mpc85xx_cds.c, as request_irq() can fail due to sanity check, which is not done in setup_irq() v3: * Split out from tree wide series, as Thomas suggested to get it thr' respective maintainers * Modify pr_err displayed in case of error * Re-arrange code & choose pr_err args as required to improve readability * Remove irrelevant parts from commit message & improve v2: * Replace pr_err("request_irq() on %s failed" by pr_err("%s: request_irq() failed" * Commit message massage arch/powerpc/platforms/85xx/mpc85xx_cds.c | 11 - arch/powerpc/platforms/8xx/cpm1.c | 9 ++- arch/powerpc/platforms/8xx/m8xx_setup.c | 9 ++- arch/powerpc/platforms/chrp/setup.c | 14 --- arch/powerpc/platforms/powermac/pic.c | 29 +-- arch/powerpc/platforms/powermac/smp.c | 12 -- 6 files changed, 29 insertions(+), 55 deletions(-) diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c index 6b1436abe9b1..915ab6710b93 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -218,12 +218,6 @@ static irqreturn_t mpc85xx_8259_cascade_action(int irq, void *dev_id) { return IRQ_HANDLED; } - -static struct irqaction mpc85xxcds_8259_irqaction = { - .handler = mpc85xx_8259_cascade_action, - .flags = IRQF_SHARED | IRQF_NO_THREAD, - .name = "8259 cascade", -}; #endif /* PPC_I8259 */ #endif /* CONFIG_PCI */ @@ -271,7 +265,10 @@ static int mpc85xx_cds_8259_attach(void) * disabled when the last user of the shared IRQ line frees their * interrupt. */ - if ((ret = setup_irq(cascade_irq, &mpc85xxcds_8259_irqaction))) { + ret = request_irq(cascade_irq, mpc85xx_8259_cascade_action, + IRQF_SHARED | IRQF_NO_THREAD, "8259 cascade", + cascade_node); + if (ret) { printk(KERN_ERR "Failed to setup cascade interrupt\n"); return ret; } diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c index a43ee7d1ff85..4db4ca2e1222 100644 --- a/arch/powerpc/platforms/8xx/cpm1.c +++ b/arch/powerpc/platforms/8xx/cpm1.c @@ -120,12 +120,6 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev) return IRQ_HANDLED; } -static struct irqaction cpm_error_irqaction = { - .handler = cpm_error_interrupt, - .flags = IRQF_NO_THREAD, - .name = "error", -}; - static const struct irq_domain_ops cpm_pic_host_ops = { .map = cpm_pic_host_map, }; @@ -187,7 +181,8 @@ unsigned int __init cpm_pic_init(void) if (!eirq) goto end; - if (setup_irq(eirq, &cpm_error_irqaction)) + if (request_irq(eirq, cpm_error_interrupt, IRQF_NO_THREAD, "error", + NULL)) printk(KERN_ERR "Could not allocate CPM error IRQ!"); setbits32(&cpic_reg->cpic_cicr, CICR_IEN); diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index f1c805c8adbc..df4d57d07f9a 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -39,12 +39,6 @@ static irqreturn_t timebase_interrupt(int irq, void *dev) return IRQ_HANDLED; } -static struct irqaction tbint_irqaction = { - .handler = timebase_interrupt, - .flags = IRQF_NO_THREAD, - .name = "tbint", -}; - /* per-board overridable init_internal_rtc() function. */ void __init __attribute__ ((weak)) init_internal_rtc(void) @@ -157,7 +151,8 @@ void __init mpc8xx_calibrate_decr(void) (TBSCR_TBF | TBSCR_TBE)); immr_unmap(sys_tmr2); - if (setup_irq(virq, &tbint_irqaction)) + if (request_irq(virq, timebase_interrupt, IRQF_NO_THREAD, "tbint", + NULL)) panic("Could not allocate timer IRQ!"); } diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index fcf6f2342ef4..8328cd5817b0 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -451,13 +451,6 @@ static void __init chrp_find_openpic(void) of_node_put(np); } -#if defined(CONFIG_VT) && defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_XMON) -static struct irqaction xmon_irqaction = { - .handler = xmon_irq, - .name = "XMON break", -}; -#endif - static void __init chr
Re: [PATCH v4 6/8] perf/tools: Enhance JSON/metric infrastructure to handle "?"
On 3/11/20 12:04 AM, Arnaldo Carvalho de Melo wrote: > Em Mon, Mar 09, 2020 at 11:55:50AM +0530, Kajol Jain escreveu: >> Patch enhances current metric infrastructure to handle "?" in the metric >> expression. The "?" can be use for parameters whose value not known while >> creating metric events and which can be replace later at runtime to >> the proper value. It also add flexibility to create multiple events out >> of single metric event added in json file. >> >> Patch adds function 'arch_get_runtimeparam' which is a arch specific >> function, returns the count of metric events need to be created. >> By default it return 1. >> >> One loop is added in function 'metricgroup__add_metric', which create >> multiple events at run time depend on return value of >> 'arch_get_runtimeparam' and merge that event in 'group_list'. >> >> This infrastructure needed for hv_24x7 socket/chip level events. >> "hv_24x7" chip level events needs specific chip-id to which the >> data is requested. Function 'arch_get_runtimeparam' implemented >> in header.c which extract number of sockets from sysfs file >> "sockets" under "/sys/devices/hv_24x7/interface/". >> >> Signed-off-by: Kajol Jain >> --- >> tools/perf/arch/powerpc/util/header.c | 22 + >> tools/perf/util/expr.h| 1 + >> tools/perf/util/expr.l| 19 +++- >> tools/perf/util/metricgroup.c | 124 -- >> tools/perf/util/metricgroup.h | 1 + >> tools/perf/util/stat-shadow.c | 8 ++ >> 6 files changed, 148 insertions(+), 27 deletions(-) >> >> diff --git a/tools/perf/arch/powerpc/util/header.c >> b/tools/perf/arch/powerpc/util/header.c >> index 3b4cdfc5efd6..036f6b2ce202 100644 >> --- a/tools/perf/arch/powerpc/util/header.c >> +++ b/tools/perf/arch/powerpc/util/header.c >> @@ -7,6 +7,11 @@ >> #include >> #include >> #include "header.h" >> +#include "metricgroup.h" >> +#include "evlist.h" >> +#include >> +#include "pmu.h" >> +#include >> >> #define mfspr(rn) ({unsigned long rval; \ >> asm volatile("mfspr %0," __stringify(rn) \ >> @@ -16,6 +21,8 @@ >> #define PVR_VER(pvr)(((pvr) >> 16) & 0x) /* Version field */ >> #define PVR_REV(pvr)(((pvr) >> 0) & 0x) /* Revison field */ >> >> +#define SOCKETS_INFO_FILE_PATH "/devices/hv_24x7/interface/" >> + >> int >> get_cpuid(char *buffer, size_t sz) >> { >> @@ -44,3 +51,18 @@ get_cpuid_str(struct perf_pmu *pmu __maybe_unused) >> >> return bufp; >> } >> + >> +int arch_get_runtimeparam(void) >> +{ >> +int count; >> +char path[PATH_MAX]; >> +char filename[] = "sockets"; >> + >> +snprintf(path, PATH_MAX, >> + SOCKETS_INFO_FILE_PATH "%s", filename); >> + >> +if (sysfs__read_ull(path, (unsigned long long *)&count) < 0) >> +return 1; >> +else >> +return count; > > Why this cast dance? We have sysfs__read_int(path, &count). > > Also this is more compact: > > return sysfs__read_int(path, &count) < 0 ? 1 : count; Hi Arnaldo, Yes it will be better to use like that. > >> +} >> diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h >> index 9377538f4097..d17664e628db 100644 >> --- a/tools/perf/util/expr.h >> +++ b/tools/perf/util/expr.h >> @@ -15,6 +15,7 @@ struct parse_ctx { >> struct parse_id ids[MAX_PARSE_ID]; >> }; >> >> +int expr__runtimeparam; >> void expr__ctx_init(struct parse_ctx *ctx); >> void expr__add_id(struct parse_ctx *ctx, const char *id, double val); >> int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr); >> diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l >> index 1928f2a3dddc..ec4b00671f67 100644 >> --- a/tools/perf/util/expr.l >> +++ b/tools/perf/util/expr.l >> @@ -45,6 +45,21 @@ static char *normalize(char *str) >> *dst++ = '/'; >> else if (*str == '\\') >> *dst++ = *++str; >> +else if (*str == '?') { >> + >> +int size = snprintf(NULL, 0, "%d", expr__runtimeparam); > > TIL that C99 allows for a NULL str to format and return the number of > bytes it would write if the string was large enough... wonders never > cease :-) > >> +char * paramval = (char *)malloc(size); > > No need for the cast, malloc returns void *, or has that changed? 8-) > and please no space before the variable name. > Okk, Will take care of that. > Humm this is all complicated, why not use asprintf and have something > like: > >> +int i = 0; >> + >> +if(!paramval) >> +*dst++ = '0'; >> +else { >> +sprintf(paramval, "%d", expr__runtimeparam); >> +while(i < size) >> +*dst++ = paramval[i++]; >> +free(paramval); >> +} > > cha
Re: [PATCH] powerpc/pseries: fix of_read_drc_info_cell() to point at next record
Tyrel Datwyler writes: > On 3/10/20 10:25 AM, Nathan Lynch wrote: >> Tyrel Datwyler writes: >>> The expectation is that when calling of_read_drc_info_cell() >>> repeatedly to parse multiple drc-info records that the in/out curval >>> parameter points at the start of the next record on return. However, >>> the current behavior has curval still pointing at the final value of >>> the record just parsed. The result of which is that if the >>> ibm,drc-info property contains multiple properties the parsed value >>> of the drc_type for any record after the first has the power_domain >>> value of the previous record appended to the type string. >>> >>> Ex: observed the following 0x prepended to PHB >>> >>> [ 69.485037] drc-info: type: \xff\xff\xff\xffPHB, prefix: PHB , >>> index_start: 0x2001 >>> [ 69.485038] drc-info: suffix_start: 1, sequential_elems: 3072, >>> sequential_inc: 1 >>> [ 69.485038] drc-info: power-domain: 0x, last_index: 0x2c00 >>> >>> Fix by incrementing curval past the power_domain value to point at >>> drc_type string of next record. >>> >>> Fixes: a29396653b8bf ("pseries/drc-info: Search DRC properties for CPU >>> indexes") >> >> I have a different commit hash for that: >> e83636ac3334 pseries/drc-info: Search DRC properties for CPU indexes > > Oof, looks like I grabbed the commit hash from the SLES 15 SP1 branch. You > have > the correct upstream commit. > > Fixes: e83636ac3334 ("pseries/drc-info: Search DRC properties for CPU > indexes") > > Michael, let me know if you want me to resubmit, or if you will fixup the > Fixes > tag on your end? I can update the Fixes tag. What's the practical effect of this bug? It seems like it should break all the hotplug stuff, but presumably it doesn't in practice for some reason? It would also be *really* nice if we had some unit tests for this parsing code, it's demonstrably very bug prone. cheers
Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus
* Michal Hocko [2020-03-11 12:57:35]: > On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote: > > A Powerpc system with multiple possible nodes and with CONFIG_NUMA > > enabled always used to have a node 0, even if node 0 does not any cpus > > or memory attached to it. As per PAPR, node affinity of a cpu is only > > available once its present / online. For all cpus that are possible but > > not present, cpu_to_node() would point to node 0. > > > > To ensure a cpuless, memoryless dummy node is not online, powerpc need > > to make sure all possible but not present cpu_to_node are set to a > > proper node. > > Just curious, is this somehow related to > http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz? > The issue I am trying to fix is a known issue in Powerpc since many years. So this surely not a problem after a75056fc1e7c (mm/memcontrol.c: allocate shrinker_map on appropriate NUMA node"). I tried v5.6-rc4 + a75056fc1e7c but didnt face any issues booting the kernel. Will work with Sachin/Abdul (reporters of the issue). > > Cc: linuxppc-dev@lists.ozlabs.org > > Cc: linux...@kvack.org > > Cc: linux-ker...@vger.kernel.org > > Cc: Michal Hocko > > Cc: Mel Gorman > > Cc: Vlastimil Babka > > Cc: "Kirill A. Shutemov" > > Cc: Christopher Lameter > > Cc: Michael Ellerman > > Cc: Andrew Morton > > Cc: Linus Torvalds > > Signed-off-by: Srikar Dronamraju > > --- > > arch/powerpc/mm/numa.c | 16 ++-- > > 1 file changed, 14 insertions(+), 2 deletions(-) > > > > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c > > index 8a399db..54dcd49 100644 > > --- a/arch/powerpc/mm/numa.c > > +++ b/arch/powerpc/mm/numa.c > > @@ -931,8 +931,20 @@ void __init mem_topology_setup(void) > > > > reset_numa_cpu_lookup_table(); > > > > - for_each_present_cpu(cpu) > > - numa_setup_cpu(cpu); > > + for_each_possible_cpu(cpu) { > > + /* > > +* Powerpc with CONFIG_NUMA always used to have a node 0, > > +* even if it was memoryless or cpuless. For all cpus that > > +* are possible but not present, cpu_to_node() would point > > +* to node 0. To remove a cpuless, memoryless dummy node, > > +* powerpc need to make sure all possible but not present > > +* cpu_to_node are set to a proper node. > > +*/ > > + if (cpu_present(cpu)) > > + numa_setup_cpu(cpu); > > + else > > + set_cpu_numa_node(cpu, first_online_node); > > + } > > } > > > > void __init initmem_init(void) > > -- > > 1.8.3.1 > > -- > Michal Hocko > SUSE Labs -- Thanks and Regards Srikar Dronamraju
Re: [RFC PATCH v1] pseries/drmem: don't cache node id in drmem_lmb struct
On Wed, Mar 11, 2020 at 06:08:15PM -0500, Scott Cheloha wrote: > At memory hot-remove time we can retrieve an LMB's nid from its > corresponding memory_block. There is no need to store the nid > in multiple locations. > > Signed-off-by: Scott Cheloha > --- > The linear search in powerpc's memory_add_physaddr_to_nid() has become a > bottleneck at boot on systems with many LMBs. > > As described in this patch here: > > https://lore.kernel.org/linuxppc-dev/20200221172901.1596249-2-chel...@linux.ibm.com/ > > the linear search seriously cripples drmem_init(). > > The obvious solution (shown in that patch) is to just make the search > in memory_add_physaddr_to_nid() faster. An XArray seems well-suited > to the task of mapping an address range to an LMB object. > > The less obvious approach is to just call memory_add_physaddr_to_nid() > in fewer places. > > I'm not sure which approach is correct, hence the RFC. You basically revert the below which will likely cause the very error that was fixed there: commit b2d3b5ee66f2a04a918cc043cec0c9ed3de58f40 Author: Nathan Fontenot Date: Tue Oct 2 10:35:59 2018 -0500 powerpc/pseries: Track LMB nid instead of using device tree When removing memory we need to remove the memory from the node it was added to instead of looking up the node it should be in in the device tree. During testing we have seen scenarios where the affinity for a LMB changes due to a partition migration or PRRN event. In these cases the node the LMB exists in may not match the node the device tree indicates it belongs in. This can lead to a system crash when trying to DLPAR remove the LMB after a migration or PRRN event. The current code looks up the node in the device tree to remove the LMB from, the crash occurs when we try to offline this node and it does not have any data, i.e. node_data[nid] == NULL. Thanks Michal
Re: [PATCH v3 23/27] powerpc/powernv/pmem: Add debug IOCTLs
On Thu, 2020-03-05 at 14:11 +1100, Andrew Donnellan wrote: > On 21/2/20 2:27 pm, Alastair D'Silva wrote: > > From: Alastair D'Silva > > > > These IOCTLs provide low level access to the card to aid in > > debugging > > controller/FPGA firmware. > > > > Signed-off-by: Alastair D'Silva > > --- > > arch/powerpc/platforms/powernv/pmem/Kconfig | 6 + > > arch/powerpc/platforms/powernv/pmem/ocxl.c | 249 > > > > include/uapi/nvdimm/ocxl-pmem.h | 32 +++ > > 3 files changed, 287 insertions(+) > > > > diff --git a/arch/powerpc/platforms/powernv/pmem/Kconfig > > b/arch/powerpc/platforms/powernv/pmem/Kconfig > > index c5d927520920..3f44429d70c9 100644 > > --- a/arch/powerpc/platforms/powernv/pmem/Kconfig > > +++ b/arch/powerpc/platforms/powernv/pmem/Kconfig > > @@ -12,4 +12,10 @@ config OCXL_PMEM > > > > Select N if unsure. > > > > +config OCXL_PMEM_DEBUG > > + bool "OpenCAPI Persistent Memory debugging" > > + depends on OCXL_PMEM > > + help > > + Enables low level IOCTLs for OpenCAPI Persistent Memory > > firmware development > > + > > How dangerous are these ioctls and does that need to be pointed out > in > this description? Good point, I'll elaborate. > > > endif > > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl.c > > b/arch/powerpc/platforms/powernv/pmem/ocxl.c > > index e01f6f9fc180..d4ce5e9e0521 100644 > > --- a/arch/powerpc/platforms/powernv/pmem/ocxl.c > > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl.c > > @@ -1050,6 +1050,235 @@ int req_controller_health_perf(struct > > ocxlpmem *ocxlpmem) > > GLOBAL_MMIO_HCI_REQ_HEALTH_PERF); > > } > > > > +#ifdef CONFIG_OCXL_PMEM_DEBUG > > +/** > > + * enable_fwdebug() - Enable FW debug on the controller > > + * @ocxlpmem: the device metadata > > + * Return: 0 on success, negative on failure > > + */ > > +static int enable_fwdebug(const struct ocxlpmem *ocxlpmem) > > +{ > > + return ocxl_global_mmio_set64(ocxlpmem->ocxl_afu, > > GLOBAL_MMIO_HCI, > > + OCXL_LITTLE_ENDIAN, > > + GLOBAL_MMIO_HCI_FW_DEBUG); > > +} > > + > > +/** > > + * disable_fwdebug() - Disable FW debug on the controller > > + * @ocxlpmem: the device metadata > > + * Return: 0 on success, negative on failure > > + */ > > +static int disable_fwdebug(const struct ocxlpmem *ocxlpmem) > > +{ > > + return ocxl_global_mmio_set64(ocxlpmem->ocxl_afu, > > GLOBAL_MMIO_HCIC, > > + OCXL_LITTLE_ENDIAN, > > + GLOBAL_MMIO_HCI_FW_DEBUG); > > +} > > + > > +static int ioctl_fwdebug(struct ocxlpmem *ocxlpmem, > > +struct ioctl_ocxl_pmem_fwdebug __user > > *uarg) > > +{ > > + struct ioctl_ocxl_pmem_fwdebug args; > > + u64 val; > > + int i; > > + int rc; > > + > > + if (copy_from_user(&args, uarg, sizeof(args))) > > + return -EFAULT; > > + > > + // Buffer size must be a multiple of 8 > > + if ((args.buf_size & 0x07)) > > + return -EINVAL; > > + > > + if (args.buf_size > ocxlpmem->admin_command.data_size) > > + return -EINVAL; > > + > > + mutex_lock(&ocxlpmem->admin_command.lock); > > + > > + rc = enable_fwdebug(ocxlpmem); > > + if (rc) > > + goto out; > > + > > + rc = admin_command_request(ocxlpmem, ADMIN_COMMAND_FW_DEBUG); > > + if (rc) > > + goto out; > > + > > + // Write DebugAction & FunctionCode > > + val = ((u64)args.debug_action << 56) | ((u64)args.function_code > > << 40); > > + > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + 0x08, > > + OCXL_LITTLE_ENDIAN, val); > > + if (rc) > > + goto out; > > + > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + 0x10, > > + OCXL_LITTLE_ENDIAN, > > args.debug_parameter_1); > > + if (rc) > > + goto out; > > + > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + 0x18, > > + OCXL_LITTLE_ENDIAN, > > args.debug_parameter_2); > > + if (rc) > > + goto out; > > + > > + for (i = 0x20; i < 0x38; i += 0x08) > > Comparison should be <=, the request block ends at 0x40. > > But in any case, scm_command_request() should I think already handle > the > clearing of the request block? > Correct. > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + i, > > + OCXL_LITTLE_ENDIAN, 0); > > + if (rc) > > + goto out; > > + > > + > > + // Populate admin command buffer > > + if (args.buf_size) {
Re: [PATCH v3 19/27] powerpc/powernv/pmem: Add an IOCTL to report controller statistics
On Thu, 2020-03-05 at 11:46 +1100, Andrew Donnellan wrote: > On 21/2/20 2:27 pm, Alastair D'Silva wrote: > > From: Alastair D'Silva > > > > The controller can report a number of statistics that are useful > > in evaluating the performance and reliability of the card. > > > > This patch exposes this information via an IOCTL. > > > > Signed-off-by: Alastair D'Silva > > --- > > arch/powerpc/platforms/powernv/pmem/ocxl.c | 185 > > + > > include/uapi/nvdimm/ocxl-pmem.h| 17 ++ > > 2 files changed, 202 insertions(+) > > > > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl.c > > b/arch/powerpc/platforms/powernv/pmem/ocxl.c > > index 2cabafe1fc58..009d4fd29e7d 100644 > > --- a/arch/powerpc/platforms/powernv/pmem/ocxl.c > > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl.c > > @@ -758,6 +758,186 @@ static int > > ioctl_controller_dump_complete(struct ocxlpmem *ocxlpmem) > > GLOBAL_MMIO_HCI_CONTROLLER_DUMP_COL > > LECTED); > > } > > > > +/** > > + * controller_stats_header_parse() - Parse the first 64 bits of > > the controller stats admin command response > > + * @ocxlpmem: the device metadata > > + * @length: out, returns the number of bytes in the response > > (excluding the 64 bit header) > > + */ > > +static int controller_stats_header_parse(struct ocxlpmem > > *ocxlpmem, > > + u32 *length) > > +{ > > + int rc; > > + u64 val; > > + > > + u16 data_identifier; > > + u32 data_length; > > + > > + rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu, > > +ocxlpmem- > > >admin_command.data_offset, > > +OCXL_LITTLE_ENDIAN, &val); > > + if (rc) > > + return rc; > > + > > + data_identifier = val >> 48; > > + data_length = val & 0x; > > + > > + if (data_identifier != 0x4353) { // 'CS' > > + dev_err(&ocxlpmem->dev, > > + "Bad data identifier for controller stats, > > expected 'CS', got '%-.*s'\n", > > + 2, (char *)&data_identifier); > > + return -EINVAL; > > Same comment as earlier patches re EINVAL > I don't think I've seen a comment yet on these particular blocks. Can you suggest a better return value? > > + } > > + > > + *length = data_length; > > + return 0; > > +} > > + > > +static int ioctl_controller_stats(struct ocxlpmem *ocxlpmem, > > + struct > > ioctl_ocxl_pmem_controller_stats __user *uarg) > > +{ > > + struct ioctl_ocxl_pmem_controller_stats args; > > + u32 length; > > + int rc; > > + u64 val; > > + > > + memset(&args, '\0', sizeof(args)); > > + > > + mutex_lock(&ocxlpmem->admin_command.lock); > > + > > + rc = admin_command_request(ocxlpmem, > > ADMIN_COMMAND_CONTROLLER_STATS); > > + if (rc) > > + goto out; > > + > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + 0x08, > > + OCXL_LITTLE_ENDIAN, 0); > > + if (rc) > > + goto out; > > + > > + rc = admin_command_execute(ocxlpmem); > > + if (rc) > > + goto out; > > + > > + > > + rc = admin_command_complete_timeout(ocxlpmem, > > + ADMIN_COMMAND_CONTROLLER_ST > > ATS); > > + if (rc < 0) { > > + dev_warn(&ocxlpmem->dev, "Controller stats timed > > out\n"); > > + goto out; > > + } > > + > > + rc = admin_response(ocxlpmem); > > + if (rc < 0) > > + goto out; > > + if (rc != STATUS_SUCCESS) { > > + warn_status(ocxlpmem, > > + "Unexpected status from controller stats", > > rc); > > + goto out; > > + } > > + > > + rc = controller_stats_header_parse(ocxlpmem, &length); > > + if (rc) > > + goto out; > > + > > + if (length != 0x140) > > + warn_status(ocxlpmem, > > + "Unexpected length for controller stats > > data, expected 0x140, got 0x%x", > > + length); > > Might be worth a comment to explain where 0x140 comes from (it looks > correct from my reading of the spec) Ok > > > + > > + rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu, > > +ocxlpmem- > > >admin_command.data_offset + 0x08 + 0x08, > > +OCXL_LITTLE_ENDIAN, &val); > > + if (rc) > > + goto out; > > + > > + args.reset_count = val >> 32; > > + args.reset_uptime = val & 0x; > > + > > + rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu, > > +ocxlpmem- > > >admin_command.data_offset + 0x08 + 0x10, > > +OCXL_LITTLE_ENDIAN, &val); > > + if (rc) > > + goto out; > > + > > + args.power_on_uptime = val >> 32; > > We're not collecting life remaining? > It looks like my implementation is out of date. I'll bring it in line with
Re: [PATCH v3 23/27] powerpc/powernv/pmem: Add debug IOCTLs
On Wed, 2020-03-04 at 16:21 +0100, Frederic Barrat wrote: > > Le 21/02/2020 à 04:27, Alastair D'Silva a écrit : > > From: Alastair D'Silva > > > > These IOCTLs provide low level access to the card to aid in > > debugging > > controller/FPGA firmware. > > > > Signed-off-by: Alastair D'Silva > > --- > > arch/powerpc/platforms/powernv/pmem/Kconfig | 6 + > > arch/powerpc/platforms/powernv/pmem/ocxl.c | 249 > > > > include/uapi/nvdimm/ocxl-pmem.h | 32 +++ > > 3 files changed, 287 insertions(+) > > > > diff --git a/arch/powerpc/platforms/powernv/pmem/Kconfig > > b/arch/powerpc/platforms/powernv/pmem/Kconfig > > index c5d927520920..3f44429d70c9 100644 > > --- a/arch/powerpc/platforms/powernv/pmem/Kconfig > > +++ b/arch/powerpc/platforms/powernv/pmem/Kconfig > > @@ -12,4 +12,10 @@ config OCXL_PMEM > > > > Select N if unsure. > > > > +config OCXL_PMEM_DEBUG > > + bool "OpenCAPI Persistent Memory debugging" > > + depends on OCXL_PMEM > > + help > > + Enables low level IOCTLs for OpenCAPI Persistent Memory > > firmware development > > + > > endif > > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl.c > > b/arch/powerpc/platforms/powernv/pmem/ocxl.c > > index e01f6f9fc180..d4ce5e9e0521 100644 > > --- a/arch/powerpc/platforms/powernv/pmem/ocxl.c > > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl.c > > @@ -1050,6 +1050,235 @@ int req_controller_health_perf(struct > > ocxlpmem *ocxlpmem) > > GLOBAL_MMIO_HCI_REQ_HEALTH_PERF); > > } > > > > +#ifdef CONFIG_OCXL_PMEM_DEBUG > > +/** > > + * enable_fwdebug() - Enable FW debug on the controller > > + * @ocxlpmem: the device metadata > > + * Return: 0 on success, negative on failure > > + */ > > +static int enable_fwdebug(const struct ocxlpmem *ocxlpmem) > > +{ > > + return ocxl_global_mmio_set64(ocxlpmem->ocxl_afu, > > GLOBAL_MMIO_HCI, > > + OCXL_LITTLE_ENDIAN, > > + GLOBAL_MMIO_HCI_FW_DEBUG); > > +} > > + > > +/** > > + * disable_fwdebug() - Disable FW debug on the controller > > + * @ocxlpmem: the device metadata > > + * Return: 0 on success, negative on failure > > + */ > > +static int disable_fwdebug(const struct ocxlpmem *ocxlpmem) > > +{ > > + return ocxl_global_mmio_set64(ocxlpmem->ocxl_afu, > > GLOBAL_MMIO_HCIC, > > + OCXL_LITTLE_ENDIAN, > > + GLOBAL_MMIO_HCI_FW_DEBUG); > > +} > > + > > +static int ioctl_fwdebug(struct ocxlpmem *ocxlpmem, > > +struct ioctl_ocxl_pmem_fwdebug __user > > *uarg) > > +{ > > + struct ioctl_ocxl_pmem_fwdebug args; > > + u64 val; > > + int i; > > + int rc; > > + > > + if (copy_from_user(&args, uarg, sizeof(args))) > > + return -EFAULT; > > + > > + // Buffer size must be a multiple of 8 > > + if ((args.buf_size & 0x07)) > > + return -EINVAL; > > + > > + if (args.buf_size > ocxlpmem->admin_command.data_size) > > + return -EINVAL; > > + > > + mutex_lock(&ocxlpmem->admin_command.lock); > > + > > + rc = enable_fwdebug(ocxlpmem); > > + if (rc) > > + goto out; > > + > > + rc = admin_command_request(ocxlpmem, ADMIN_COMMAND_FW_DEBUG); > > + if (rc) > > + goto out; > > + > > + // Write DebugAction & FunctionCode > > + val = ((u64)args.debug_action << 56) | ((u64)args.function_code > > << 40); > > + > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + 0x08, > > + OCXL_LITTLE_ENDIAN, val); > > + if (rc) > > + goto out; > > + > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + 0x10, > > + OCXL_LITTLE_ENDIAN, > > args.debug_parameter_1); > > + if (rc) > > + goto out; > > + > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + 0x18, > > + OCXL_LITTLE_ENDIAN, > > args.debug_parameter_2); > > + if (rc) > > + goto out; > > + > > + for (i = 0x20; i < 0x38; i += 0x08) > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + i, > > + OCXL_LITTLE_ENDIAN, 0); > > + if (rc) > > + goto out; > > rc is the for loop body. The rc test is not. > Whoops :) > > > + > > + > > + // Populate admin command buffer > > + if (args.buf_size) { > > + for (i = 0; i < args.buf_size; i += sizeof(u64)) { > > + u64 val; > > + > > + if (copy_from_user(&val, &args.buf[i], > > sizeof(u64))) > > + return -EFAULT; > > need
Re: [PATCH v4 1/5] mm/memremap_pages: Introduce memremap_compat_align()
Dan Williams writes: > The "sub-section memory hotplug" facility allows memremap_pages() users > like libnvdimm to compensate for hardware platforms like x86 that have a > section size larger than their hardware memory mapping granularity. The > compensation that sub-section support affords is being tolerant of > physical memory resources shifting by units smaller (64MiB on x86) than > the memory-hotplug section size (128 MiB). Where the platform > physical-memory mapping granularity is limited by the number and > capability of address-decode-registers in the memory controller. > > While the sub-section support allows memremap_pages() to operate on > sub-section (2MiB) granularity, the Power architecture may still > require 16MiB alignment on "!radix_enabled()" platforms. > > In order for libnvdimm to be able to detect and manage this per-arch > limitation, introduce memremap_compat_align() as a common minimum > alignment across all driver-facing memory-mapping interfaces, and let > Power override it to 16MiB in the "!radix_enabled()" case. > > The assumption / requirement for 16MiB to be a viable > memremap_compat_align() value is that Power does not have platforms > where its equivalent of address-decode-registers never hardware remaps a > persistent memory resource on smaller than 16MiB boundaries. Note that I > tried my best to not add a new Kconfig symbol, but header include > entanglements defeated the #ifndef memremap_compat_align design pattern > and the need to export it defeats the __weak design pattern for arch > overrides. > > Based on an initial patch by Aneesh. > > Link: > http://lore.kernel.org/r/capcyv4gbgnp95apyabcsocea50tqj9b5h__83vgngjq3oug...@mail.gmail.com > Reported-by: Aneesh Kumar K.V > Reported-by: Jeff Moyer > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Reviewed-by: Aneesh Kumar K.V > Signed-off-by: Dan Williams > --- > arch/powerpc/Kconfig |1 + > arch/powerpc/mm/ioremap.c | 21 + > drivers/nvdimm/pfn_devs.c |2 +- > include/linux/memremap.h |8 > include/linux/mmzone.h|1 + > lib/Kconfig |3 +++ > mm/memremap.c | 23 +++ > 7 files changed, 58 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig > index 497b7d0b2d7e..e6ffe905e2b9 100644 > --- a/arch/powerpc/Kconfig > +++ b/arch/powerpc/Kconfig > @@ -122,6 +122,7 @@ config PPC > select ARCH_HAS_GCOV_PROFILE_ALL > select ARCH_HAS_KCOV > select ARCH_HAS_HUGEPD if HUGETLB_PAGE > + select ARCH_HAS_MEMREMAP_COMPAT_ALIGN > select ARCH_HAS_MMIOWB if PPC64 > select ARCH_HAS_PHYS_TO_DMA > select ARCH_HAS_PMEM_API > diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c > index fc669643ce6a..b1a0aebe8c48 100644 > --- a/arch/powerpc/mm/ioremap.c > +++ b/arch/powerpc/mm/ioremap.c > @@ -2,6 +2,7 @@ > > #include > #include > +#include > #include > #include > > @@ -97,3 +98,23 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t > offset, unsigned long size, > > return NULL; > } > + > +#ifdef CONFIG_ZONE_DEVICE > +/* > + * Override the generic version in mm/memremap.c. > + * > + * With hash translation, the direct-map range is mapped with just one > + * page size selected by htab_init_page_sizes(). Consult > + * mmu_psize_defs[] to determine the minimum page size alignment. > +*/ > +unsigned long memremap_compat_align(void) > +{ > + unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; > + > + if (radix_enabled()) > + return SUBSECTION_SIZE; > + return max(SUBSECTION_SIZE, 1UL << shift); > + > +} > +EXPORT_SYMBOL_GPL(memremap_compat_align); > +#endif LGTM. Acked-by: Michael Ellerman (powerpc) cheers
Re: [PATCH v3 19/27] powerpc/powernv/pmem: Add an IOCTL to report controller statistics
On Wed, 2020-03-04 at 10:25 +0100, Frederic Barrat wrote: > > Le 21/02/2020 à 04:27, Alastair D'Silva a écrit : > > From: Alastair D'Silva > > > > The controller can report a number of statistics that are useful > > in evaluating the performance and reliability of the card. > > > > This patch exposes this information via an IOCTL. > > > > Signed-off-by: Alastair D'Silva > > --- > > arch/powerpc/platforms/powernv/pmem/ocxl.c | 185 > > + > > include/uapi/nvdimm/ocxl-pmem.h| 17 ++ > > 2 files changed, 202 insertions(+) > > > > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl.c > > b/arch/powerpc/platforms/powernv/pmem/ocxl.c > > index 2cabafe1fc58..009d4fd29e7d 100644 > > --- a/arch/powerpc/platforms/powernv/pmem/ocxl.c > > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl.c > > @@ -758,6 +758,186 @@ static int > > ioctl_controller_dump_complete(struct ocxlpmem *ocxlpmem) > > GLOBAL_MMIO_HCI_CONTROLLER_DUMP_COL > > LECTED); > > } > > > > +/** > > + * controller_stats_header_parse() - Parse the first 64 bits of > > the controller stats admin command response > > + * @ocxlpmem: the device metadata > > + * @length: out, returns the number of bytes in the response > > (excluding the 64 bit header) > > + */ > > +static int controller_stats_header_parse(struct ocxlpmem > > *ocxlpmem, > > + u32 *length) > > +{ > > + int rc; > > + u64 val; > > + > > unexpected empty line > Ok > > > + u16 data_identifier; > > + u32 data_length; > > + > > + rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu, > > +ocxlpmem- > > >admin_command.data_offset, > > +OCXL_LITTLE_ENDIAN, &val); > > + if (rc) > > + return rc; > > + > > + data_identifier = val >> 48; > > + data_length = val & 0x; > > + > > + if (data_identifier != 0x4353) { // 'CS' > > + dev_err(&ocxlpmem->dev, > > + "Bad data identifier for controller stats, > > expected 'CS', got '%-.*s'\n", > > + 2, (char *)&data_identifier); > > > Wow, I'm clueless what that string format looks like :-) > 2 arguments? Did you check the kernel string formatter does what you > want? > You may consider unifying the format though, the error log patch uses > a > simpler (better?) format for a similar message. > Sorry, force of habit from my old job where we dealt with a lot of variable length, non-NULL terminated buffers. FYI - it takes the string length from the first argument. I'll change it to a fixed length string like the others :) > > > > + return -EINVAL; > > + } > > + > > + *length = data_length; > > + return 0; > > +} > > + > > +static int ioctl_controller_stats(struct ocxlpmem *ocxlpmem, > > + struct > > ioctl_ocxl_pmem_controller_stats __user *uarg) > > +{ > > + struct ioctl_ocxl_pmem_controller_stats args; > > + u32 length; > > + int rc; > > + u64 val; > > + > > + memset(&args, '\0', sizeof(args)); > > + > > + mutex_lock(&ocxlpmem->admin_command.lock); > > + > > + rc = admin_command_request(ocxlpmem, > > ADMIN_COMMAND_CONTROLLER_STATS); > > + if (rc) > > + goto out; > > + > > + rc = ocxl_global_mmio_write64(ocxlpmem->ocxl_afu, > > + ocxlpmem- > > >admin_command.request_offset + 0x08, > > + OCXL_LITTLE_ENDIAN, 0); > > + if (rc) > > + goto out; > > + > > + rc = admin_command_execute(ocxlpmem); > > + if (rc) > > + goto out; > > + > > + > > + rc = admin_command_complete_timeout(ocxlpmem, > > + ADMIN_COMMAND_CONTROLLER_ST > > ATS); > > + if (rc < 0) { > > + dev_warn(&ocxlpmem->dev, "Controller stats timed > > out\n"); > > + goto out; > > + } > > + > > + rc = admin_response(ocxlpmem); > > + if (rc < 0) > > + goto out; > > + if (rc != STATUS_SUCCESS) { > > + warn_status(ocxlpmem, > > + "Unexpected status from controller stats", > > rc); > > + goto out; > > + } > > All those ioctls commands follow the same pattern: > 1. admin_command_request() > 2. optionnaly, set some mmio registers specific to the command > 3. admin_command_execute() > 4. admin_command_complete_timeout() > 5. admin_response() > > By swapping 1 and 2, we could then factorize steps 1, 3, 4 and 5 in > a > function and simplify/shorten the code each time a command is called. > > Regarding step 2 (and that's true for all similar patches), a > comment > about what the mmio tuning does would help and avoid looking up the > spec. Looking up the spec during the review is expected, but it will > ease reading the code 6 months from now. > > I'll rework this and add a wrapper in the Admin Commands patch. > > > + > > + rc = controller_stats_header_parse(ocxlpmem, &length); > > + if (rc) > > +
[RFC PATCH v1] pseries/drmem: don't cache node id in drmem_lmb struct
At memory hot-remove time we can retrieve an LMB's nid from its corresponding memory_block. There is no need to store the nid in multiple locations. Signed-off-by: Scott Cheloha --- The linear search in powerpc's memory_add_physaddr_to_nid() has become a bottleneck at boot on systems with many LMBs. As described in this patch here: https://lore.kernel.org/linuxppc-dev/20200221172901.1596249-2-chel...@linux.ibm.com/ the linear search seriously cripples drmem_init(). The obvious solution (shown in that patch) is to just make the search in memory_add_physaddr_to_nid() faster. An XArray seems well-suited to the task of mapping an address range to an LMB object. The less obvious approach is to just call memory_add_physaddr_to_nid() in fewer places. I'm not sure which approach is correct, hence the RFC. The attached patch is an example of how we could just eliminate the memory_add_physaddr_to_nid() calls during drmem_init(). Unless this is somehow an abuse of the memory_block I think retrieving the nid in this way deduplicates memoization. There is another spot where we don't *need* to search by address to find the node id. In dlpar_memory_add() we're calling lmb_set_nid(), which is just memory_add_physaddr_to_nid(). But we could easily bypass the search and call of_drconf_to_nid_single() directly with the LMB we already have a pointer to. Then again, I see callers like the xen balloon driver and probe_store() that seemingly need to do an address-to-nid lookup. So, does memory_add_physaddr_to_nid() need to be fast? Should I bite the bullet and make it fast so we can leverage it without slowdown? Or is calling it in fewer places an acceptable approach? arch/powerpc/include/asm/drmem.h | 21 --- arch/powerpc/mm/drmem.c | 6 +- .../platforms/pseries/hotplug-memory.c| 19 ++--- 3 files changed, 13 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index 3d76e1c388c2..4e7b6b70e366 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -13,9 +13,6 @@ struct drmem_lmb { u32 drc_index; u32 aa_index; u32 flags; -#ifdef CONFIG_MEMORY_HOTPLUG - int nid; -#endif }; struct drmem_lmb_info { @@ -103,22 +100,4 @@ static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) lmb->aa_index = 0x; } -#ifdef CONFIG_MEMORY_HOTPLUG -static inline void lmb_set_nid(struct drmem_lmb *lmb) -{ - lmb->nid = memory_add_physaddr_to_nid(lmb->base_addr); -} -static inline void lmb_clear_nid(struct drmem_lmb *lmb) -{ - lmb->nid = -1; -} -#else -static inline void lmb_set_nid(struct drmem_lmb *lmb) -{ -} -static inline void lmb_clear_nid(struct drmem_lmb *lmb) -{ -} -#endif - #endif /* _ASM_POWERPC_LMB_H */ diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 59327cefbc6a..873fcfc7b875 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -362,10 +362,8 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop) if (!drmem_info->lmbs) return; - for_each_drmem_lmb(lmb) { + for_each_drmem_lmb(lmb) read_drconf_v1_cell(lmb, &prop); - lmb_set_nid(lmb); - } } static void __init init_drmem_v2_lmbs(const __be32 *prop) @@ -410,8 +408,6 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop) lmb->aa_index = dr_cell.aa_index; lmb->flags = dr_cell.flags; - - lmb_set_nid(lmb); } } } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index a4d40a3ceea3..f6d4236286cf 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -376,25 +376,29 @@ static int dlpar_add_lmb(struct drmem_lmb *); static int dlpar_remove_lmb(struct drmem_lmb *lmb) { + struct memory_block *mem_block; unsigned long block_sz; int rc; if (!lmb_is_removable(lmb)) return -EINVAL; + mem_block = lmb_to_memblock(lmb); + if (mem_block == NULL) + return -EINVAL; + rc = dlpar_offline_lmb(lmb); if (rc) return rc; block_sz = pseries_memory_block_size(); - __remove_memory(lmb->nid, lmb->base_addr, block_sz); + __remove_memory(mem_block->nid, lmb->base_addr, block_sz); /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); - lmb_clear_nid(lmb); lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; @@ -651,7 +655,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) static int dlpar_add_lmb(struct drmem_lmb *lmb) {
Re: [PATCH v3] ima: add a new CONFIG for loading arch-specific policies
On Sun, 8 Mar 2020 20:57:51 -0400 Nayna Jain wrote: > From: Nayna Jain > > Every time a new architecture defines the IMA architecture specific > functions - arch_ima_get_secureboot() and arch_ima_get_policy(), the IMA > include file needs to be updated. To avoid this "noise", this patch > defines a new IMA Kconfig IMA_SECURE_AND_OR_TRUSTED_BOOT option, allowing > the different architectures to select it. > > Suggested-by: Linus Torvalds > Signed-off-by: Nayna Jain > Acked-by: Ard Biesheuvel > Cc: Ard Biesheuvel > Cc: Philipp Rudo > Cc: Michael Ellerman > --- > v3: > * Removes CONFIG_IMA dependency. Thanks Ard. > * Updated the patch with improvements suggested by Michael. It now uses > "imply" instead of "select". Thanks Michael. > * Replaced the CONFIG_IMA in x86 and s390 with new config, else it was > resulting in redefinition when the IMA_SECURE_AND_OR_TRUSTED_BOOT > is not enabled. Thanks to Mimi for identifying the problem. > * Removed "#ifdef EFI" check in the arch/x86/Makefile for compiling > ima_arch.c file. > * Ard, Thanks for your Acked-by. I have changed the arch/x86/Makefile in > this version. Can you please review again and confirm ? > * Rudo, Thanks for your review. I have changed arch/s390/Makefile as well. > Can you also please review again ? Looks good to me for the s390 part Acked-by: Philipp Rudo Thanks Philipp > > v2: > * Fixed the issue identified by Mimi. Thanks Mimi, Ard, Heiko and Michael for > discussing the fix. > > arch/powerpc/Kconfig | 1 + > arch/s390/Kconfig | 1 + > arch/s390/kernel/Makefile | 2 +- > arch/x86/Kconfig | 1 + > arch/x86/kernel/Makefile | 4 +--- > include/linux/ima.h| 3 +-- > security/integrity/ima/Kconfig | 7 +++ > 7 files changed, 13 insertions(+), 6 deletions(-) > > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig > index 497b7d0b2d7e..5b9f1cba2a44 100644 > --- a/arch/powerpc/Kconfig > +++ b/arch/powerpc/Kconfig > @@ -979,6 +979,7 @@ config PPC_SECURE_BOOT > bool > depends on PPC_POWERNV > depends on IMA_ARCH_POLICY > + imply IMA_SECURE_AND_OR_TRUSTED_BOOT > help > Systems with firmware secure boot enabled need to define security > policies to extend secure boot to the OS. This config allows a user > diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig > index 8abe77536d9d..59c216af6264 100644 > --- a/arch/s390/Kconfig > +++ b/arch/s390/Kconfig > @@ -195,6 +195,7 @@ config S390 > select ARCH_HAS_FORCE_DMA_UNENCRYPTED > select SWIOTLB > select GENERIC_ALLOCATOR > + imply IMA_SECURE_AND_OR_TRUSTED_BOOT > > > config SCHED_OMIT_FRAME_POINTER > diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile > index 2b1203cf7be6..578a6fa82ea4 100644 > --- a/arch/s390/kernel/Makefile > +++ b/arch/s390/kernel/Makefile > @@ -70,7 +70,7 @@ obj-$(CONFIG_JUMP_LABEL)+= jump_label.o > obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o > obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o > > -obj-$(CONFIG_IMA)+= ima_arch.o > +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o > > obj-$(CONFIG_PERF_EVENTS)+= perf_event.o perf_cpum_cf_common.o > obj-$(CONFIG_PERF_EVENTS)+= perf_cpum_cf.o perf_cpum_sf.o > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index beea77046f9b..dcf5b1729f7c 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -230,6 +230,7 @@ config X86 > select VIRT_TO_BUS > select X86_FEATURE_NAMESif PROC_FS > select PROC_PID_ARCH_STATUS if PROC_FS > + imply IMA_SECURE_AND_OR_TRUSTED_BOOTif EFI > > config INSTRUCTION_DECODER > def_bool y > diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile > index 9b294c13809a..cfef37a27def 100644 > --- a/arch/x86/kernel/Makefile > +++ b/arch/x86/kernel/Makefile > @@ -154,6 +154,4 @@ ifeq ($(CONFIG_X86_64),y) > obj-y += vsmp_64.o > endif > > -ifdef CONFIG_EFI > -obj-$(CONFIG_IMA)+= ima_arch.o > -endif > +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o > diff --git a/include/linux/ima.h b/include/linux/ima.h > index 1659217e9b60..aefe758f4466 100644 > --- a/include/linux/ima.h > +++ b/include/linux/ima.h > @@ -30,8 +30,7 @@ extern void ima_kexec_cmdline(const void *buf, int size); > extern void ima_add_kexec_buffer(struct kimage *image); > #endif > > -#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390) \ > - || defined(CONFIG_PPC_SECURE_BOOT) > +#ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT > extern bool arch_ima_get_secureboot(void); > extern const char * const *arch_get_ima_policy(void); > #else > diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig > index 3f3ee4e2eb0d..edde88dbe576 100644 > --- a/security/integrity/ima/Kconfig > +++ b/security/integrity/ima/Kconfig > @@ -327,3 +327,10 @@ config IMA_QUEUE_EAR
Re: [PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type
On 11.03.20 17:55, Vitaly Kuznetsov wrote: > David Hildenbrand writes: > >> For now, distributions implement advanced udev rules to essentially >> - Don't online any hotplugged memory (s390x) >> - Online all memory to ZONE_NORMAL (e.g., most virt environments like >> hyperv) >> - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken >> care of (e.g., bare metal, special virt environments) >> >> In summary: All memory is usually onlined the same way, however, the >> kernel always has to ask userspace to come up with the same answer. >> E.g., HyperV always waits for a memory block to get onlined before >> continuing, otherwise it might end up adding memory faster than >> hotplugging it, which can result in strange OOM situations. >> >> Let's allow to specify a default online_type, not just "online" and >> "offline". This allows distributions to configure the default online_type >> when booting up and be done with it. >> >> We can now specify "offline", "online", "online_movable" and >> "online_kernel" via >> - "memhp_default_state=" on the kernel cmdline >> - /sys/devices/systemn/memory/auto_online_blocks >> just like we are able to specify for a single memory block via >> /sys/devices/systemn/memory/memoryX/state >> > > Thank you for picking this up! > > It's been awhile since I've added CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE > but I vaguely recall one problem: memory hotplug may happen *very* early > (just because some memory is presented to a VM as hotplug memory, it is > not in e820). It happens way before we launch userspace (including > udev). The question is -- which ZONE will this memory be assigned too? If it's added via add_memory() ("hot/cold plugged memory") like ACPI DIMMs not part of e820, Hyper-V balloon added memory, XEN balloon added memory, s390x standby memory etc. the memory will be onlined as configured via memhp_default_online_type. Assume that one is set to "offline". *If* userspace changes memhp_default_online_type (as in my script in the cover letter), userspace has to online all memory that has been added before userspace was active itself (again, as done in my script). Memory not added via add_memory() is considered "initial memory" and not as hot/cold plugged memory. Same handling as for now using udev rules. (once userspace is up, udev rules for all early added memory is triggered as well) > > 'memhp_default_state=' resolves the issue but nobody likes additional > kernel parameters for anything but > debug. CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE was supposed to help, but it > is binary and distro-wide (so *all* deployments will get the same > default and as you validly stated we want it differently). > > We could've added something like your example onlining script to the > kernel itself but this is likely going to be hard to sell: "policies > belong to userspace!" will likely be the answer. Exactly my thought. -- Thanks, David / dhildenb
Re: [PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type
David Hildenbrand writes: > For now, distributions implement advanced udev rules to essentially > - Don't online any hotplugged memory (s390x) > - Online all memory to ZONE_NORMAL (e.g., most virt environments like > hyperv) > - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken > care of (e.g., bare metal, special virt environments) > > In summary: All memory is usually onlined the same way, however, the > kernel always has to ask userspace to come up with the same answer. > E.g., HyperV always waits for a memory block to get onlined before > continuing, otherwise it might end up adding memory faster than > hotplugging it, which can result in strange OOM situations. > > Let's allow to specify a default online_type, not just "online" and > "offline". This allows distributions to configure the default online_type > when booting up and be done with it. > > We can now specify "offline", "online", "online_movable" and > "online_kernel" via > - "memhp_default_state=" on the kernel cmdline > - /sys/devices/systemn/memory/auto_online_blocks > just like we are able to specify for a single memory block via > /sys/devices/systemn/memory/memoryX/state > Thank you for picking this up! It's been awhile since I've added CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE but I vaguely recall one problem: memory hotplug may happen *very* early (just because some memory is presented to a VM as hotplug memory, it is not in e820). It happens way before we launch userspace (including udev). The question is -- which ZONE will this memory be assigned too? 'memhp_default_state=' resolves the issue but nobody likes additional kernel parameters for anything but debug. CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE was supposed to help, but it is binary and distro-wide (so *all* deployments will get the same default and as you validly stated we want it differently). We could've added something like your example onlining script to the kernel itself but this is likely going to be hard to sell: "policies belong to userspace!" will likely be the answer. So if we don't want to start the endless discussions (again), your proposal is 'good enough'. > Cc: Greg Kroah-Hartman > Cc: Andrew Morton > Cc: Michal Hocko > Cc: Oscar Salvador > Cc: "Rafael J. Wysocki" > Cc: Baoquan He > Cc: Wei Yang > Signed-off-by: David Hildenbrand > --- > drivers/base/memory.c | 11 +-- > include/linux/memory_hotplug.h | 2 ++ > mm/memory_hotplug.c| 8 > 3 files changed, 11 insertions(+), 10 deletions(-) > > diff --git a/drivers/base/memory.c b/drivers/base/memory.c > index 8d3e16dab69f..2b09b68b9f78 100644 > --- a/drivers/base/memory.c > +++ b/drivers/base/memory.c > @@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = { > [MMOP_ONLINE_MOVABLE] = "online_movable", > }; > > -static int memhp_online_type_from_str(const char *str) > +int memhp_online_type_from_str(const char *str) > { > int i; > > @@ -394,13 +394,12 @@ static ssize_t auto_online_blocks_store(struct device > *dev, > struct device_attribute *attr, > const char *buf, size_t count) > { > - if (sysfs_streq(buf, "online")) > - memhp_default_online_type = MMOP_ONLINE; > - else if (sysfs_streq(buf, "offline")) > - memhp_default_online_type = MMOP_OFFLINE; > - else > + const int online_type = memhp_online_type_from_str(buf); > + > + if (online_type < 0) > return -EINVAL; > > + memhp_default_online_type = online_type; > return count; > } > > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index c6e090b34c4b..ef55115320fb 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, > struct mhp_restrictions *restrictions); > extern u64 max_mem_size; > > +extern int memhp_online_type_from_str(const char *str); > + > /* Default online_type (MMOP_*) when new memory blocks are added. */ > extern int memhp_default_online_type; > /* If movable_node boot option specified */ > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index 01443c70aa27..4a96273eafa7 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -75,10 +75,10 @@ EXPORT_SYMBOL_GPL(memhp_default_online_type); > > static int __init setup_memhp_default_state(char *str) > { > - if (!strcmp(str, "online")) > - memhp_default_online_type = MMOP_ONLINE; > - else if (!strcmp(str, "offline")) > - memhp_default_online_type = MMOP_OFFLINE; > + const int online_type = memhp_online_type_from_str(str); > + > + if (online_type >= 0) > + memhp_default_online_type = online_type; > > return 1; > } -- Vitaly
Re: [RFC 00/11] perf: Enhancing perf to export processor hazard information
Hi Kim, On 3/6/20 3:36 AM, Kim Phillips wrote: On 3/3/20 3:55 AM, Kim Phillips wrote: On 3/2/20 2:21 PM, Stephane Eranian wrote: On Mon, Mar 2, 2020 at 2:13 AM Peter Zijlstra wrote: On Mon, Mar 02, 2020 at 10:53:44AM +0530, Ravi Bangoria wrote: Modern processors export such hazard data in Performance Monitoring Unit (PMU) registers. Ex, 'Sampled Instruction Event Register' on IBM PowerPC[1][2] and 'Instruction-Based Sampling' on AMD[3] provides similar information. Implementation detail: A new sample_type called PERF_SAMPLE_PIPELINE_HAZ is introduced. If it's set, kernel converts arch specific hazard information into generic format: struct perf_pipeline_haz_data { /* Instruction/Opcode type: Load, Store, Branch */ __u8 itype; /* Instruction Cache source */ __u8 icache; /* Instruction suffered hazard in pipeline stage */ __u8 hazard_stage; /* Hazard reason */ __u8 hazard_reason; /* Instruction suffered stall in pipeline stage */ __u8 stall_stage; /* Stall reason */ __u8 stall_reason; __u16 pad; }; Kim, does this format indeed work for AMD IBS? It's not really 1:1, we don't have these separations of stages and reasons, for example: we have missed in L2 cache, for example. So IBS output is flatter, with more cycle latency figures than IBM's AFAICT. AMD IBS captures pipeline latency data incase Fetch sampling like the Fetch latency, tag to retire latency, completion to retire latency and so on. Yes, Ops sampling do provide more data on load/store centric information. But it also captures more detailed data for Branch instructions. And we also looked at ARM SPE, which also captures more details pipeline data and latency information. Personally, I don't like the term hazard. This is too IBM Power specific. We need to find a better term, maybe stall or penalty. Right, IBS doesn't have a filter to only count stalled or otherwise bad events. IBS' PPR descriptions has one occurrence of the word stall, and no penalty. The way I read IBS is it's just reporting more sample data than just the precise IP: things like hits, misses, cycle latencies, addresses, types, etc., so words like 'extended', or the 'auxiliary' already used today even are more appropriate for IBS, although I'm the last person to bikeshed. We are thinking of using "pipeline" word instead of Hazard. Hm, the word 'pipeline' occurs 0 times in IBS documentation. NP. We thought pipeline is generic hw term so we proposed "pipeline" word. We are open to term which can be generic enough. I realize there are a couple of core pipeline-specific pieces of information coming out of it, but the vast majority are addresses, latencies of various components in the memory hierarchy, and various component hit/miss bits. Yes. we should capture core pipeline specific details. For example, IBS generates Branch unit information(IbsOpData1) and Icahce related data(IbsFetchCtl) which is something that shouldn't be extended as part of perf-mem, IMO. What's needed here is a vendor-specific extended sample information that all these technologies gather, of which things like e.g., 'L1 TLB cycle latency' we all should have in common. Yes. We will include fields to capture the latency cycles (like Issue latency, Instruction completion latency etc..) along with other pipeline details in the proposed structure. I'm not sure why a new PERF_SAMPLE_PIPELINE_HAZ is needed either. Can we use PERF_SAMPLE_AUX instead? We took a look at PERF_SAMPLE_AUX. IIUC, PERF_SAMPLE_AUX is intended when large volume of data needs to be captured as part of perf.data without frequent PMIs. But proposed type is to address the capture of pipeline information on each sample using PMI at periodic intervals. Hence proposing PERF_SAMPLE_PIPELINE_HAZ. Take a look at commit 98dcf14d7f9c "perf tools: Add kernel AUX area sampling definitions". The sample identifier can be used to determine which vendor's sampling IP's data is in it, and events can be recorded just by copying the content of the SIER, etc. registers, and then events get synthesized from the aux sample at report/inject/annotate etc. time. This allows for less sample recording overhead, and moves all the vendor specific decoding and common event conversions for userspace to figure out. When AUX buffer data is structured, tool side changes added to present the pipeline data can be re-used. Also worth considering is the support of ARM SPE (Statistical Profiling Extension) which is their version of IBS. Whatever gets added need to cover all three with no limitations. I thought Intel's various LBR, PEBS, and PT supported providing similar sample data in perf already, like with perf mem/c2c? perf-mem is more of data centric in my opinion. It is more towards memory profiling. So proposal here is to expose pipeline related det
Re: [PATCH v3] ima: add a new CONFIG for loading arch-specific policies
On Sun, 2020-03-08 at 20:57 -0400, Nayna Jain wrote: > From: Nayna Jain > > Every time a new architecture defines the IMA architecture specific > functions - arch_ima_get_secureboot() and arch_ima_get_policy(), the IMA > include file needs to be updated. To avoid this "noise", this patch > defines a new IMA Kconfig IMA_SECURE_AND_OR_TRUSTED_BOOT option, allowing > the different architectures to select it. > > Suggested-by: Linus Torvalds > Signed-off-by: Nayna Jain > Acked-by: Ard Biesheuvel > Cc: Philipp Rudo > Cc: Michael Ellerman Thanks, Michael for the suggestion of using "imply". Seems to be working nicely. Thanks, Nayna. I pushed this patch out to next- integrity-testing. Could we get some tags on this version of the patch? thanks, Mimi
Re: [PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type
On 11.03.20 15:26, Wei Yang wrote: > On Wed, Mar 11, 2020 at 01:30:26PM +0100, David Hildenbrand wrote: >> For now, distributions implement advanced udev rules to essentially >> - Don't online any hotplugged memory (s390x) >> - Online all memory to ZONE_NORMAL (e.g., most virt environments like >> hyperv) >> - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken >> care of (e.g., bare metal, special virt environments) >> >> In summary: All memory is usually onlined the same way, however, the >> kernel always has to ask userspace to come up with the same answer. >> E.g., HyperV always waits for a memory block to get onlined before >> continuing, otherwise it might end up adding memory faster than >> hotplugging it, which can result in strange OOM situations. >> >> Let's allow to specify a default online_type, not just "online" and >> "offline". This allows distributions to configure the default online_type >> when booting up and be done with it. >> >> We can now specify "offline", "online", "online_movable" and >> "online_kernel" via >> - "memhp_default_state=" on the kernel cmdline >> - /sys/devices/systemn/memory/auto_online_blocks >> just like we are able to specify for a single memory block via >> /sys/devices/systemn/memory/memoryX/state >> >> Cc: Greg Kroah-Hartman >> Cc: Andrew Morton >> Cc: Michal Hocko >> Cc: Oscar Salvador >> Cc: "Rafael J. Wysocki" >> Cc: Baoquan He >> Cc: Wei Yang >> Signed-off-by: David Hildenbrand > > Ok, I got the reason to leave the change on string compare here. Thanks for your *very fast* review! :) -- Thanks, David / dhildenb
Re: [PATCH] hwmon: (ibmpowernv) Use scnprintf() for avoiding potential buffer overflow
On Wed, Mar 11, 2020 at 08:39:44AM +0100, Takashi Iwai wrote: > Since snprintf() returns the would-be-output size instead of the > actual output size, the succeeding calls may go beyond the given > buffer limit. Fix it by replacing with scnprintf(). > > Signed-off-by: Takashi Iwai Applied to hwmon-next. Thanks, Guenter > --- > drivers/hwmon/ibmpowernv.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c > index 0e525cfbdfc5..a750647e66a4 100644 > --- a/drivers/hwmon/ibmpowernv.c > +++ b/drivers/hwmon/ibmpowernv.c > @@ -186,7 +186,7 @@ static void make_sensor_label(struct device_node *np, > u32 id; > size_t n; > > - n = snprintf(sdata->label, sizeof(sdata->label), "%s", label); > + n = scnprintf(sdata->label, sizeof(sdata->label), "%s", label); > > /* >* Core temp pretty print > @@ -199,11 +199,11 @@ static void make_sensor_label(struct device_node *np, >* The digital thermal sensors are associated >* with a core. >*/ > - n += snprintf(sdata->label + n, > + n += scnprintf(sdata->label + n, > sizeof(sdata->label) - n, " %d", > cpuid); > else > - n += snprintf(sdata->label + n, > + n += scnprintf(sdata->label + n, > sizeof(sdata->label) - n, " phy%d", id); > } > > @@ -211,7 +211,7 @@ static void make_sensor_label(struct device_node *np, >* Membuffer pretty print >*/ > if (!of_property_read_u32(np, "ibm,chip-id", &id)) > - n += snprintf(sdata->label + n, sizeof(sdata->label) - n, > + n += scnprintf(sdata->label + n, sizeof(sdata->label) - n, > " %d", id & 0x); > } > > -- > 2.16.4 >
Re: [PATCH v1 3/5] drivers/base/memory: store mapping between MMOP_* and string in an array
On Wed, Mar 11, 2020 at 02:20:02PM +, Wei Yang wrote: >On Wed, Mar 11, 2020 at 01:30:24PM +0100, David Hildenbrand wrote: >>Let's use a simple array which we can reuse soon. While at it, move the >>string->mmop conversion out of the device hotplug lock. >> >>Cc: Greg Kroah-Hartman >>Cc: Andrew Morton >>Cc: Michal Hocko >>Cc: Oscar Salvador >>Cc: "Rafael J. Wysocki" >>Cc: Baoquan He >>Cc: Wei Yang >>Signed-off-by: David Hildenbrand Ok, I got the reason. Reviewed-by: Wei Yang >>--- >> drivers/base/memory.c | 38 +++--- >> 1 file changed, 23 insertions(+), 15 deletions(-) >> >>diff --git a/drivers/base/memory.c b/drivers/base/memory.c >>index e7e77cafef80..8a7f29c0bf97 100644 >>--- a/drivers/base/memory.c >>+++ b/drivers/base/memory.c >>@@ -28,6 +28,24 @@ >> >> #define MEMORY_CLASS_NAME"memory" >> >>+static const char *const online_type_to_str[] = { >>+ [MMOP_OFFLINE] = "offline", >>+ [MMOP_ONLINE] = "online", >>+ [MMOP_ONLINE_KERNEL] = "online_kernel", >>+ [MMOP_ONLINE_MOVABLE] = "online_movable", >>+}; >>+ >>+static int memhp_online_type_from_str(const char *str) >>+{ >>+ int i; >>+ >>+ for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) { >>+ if (sysfs_streq(str, online_type_to_str[i])) >>+ return i; >>+ } >>+ return -EINVAL; >>+} >>+ >> #define to_memory_block(dev) container_of(dev, struct memory_block, dev) >> >> static int sections_per_block; >>@@ -236,26 +254,17 @@ static int memory_subsys_offline(struct device *dev) >> static ssize_t state_store(struct device *dev, struct device_attribute *attr, >> const char *buf, size_t count) >> { >>+ const int online_type = memhp_online_type_from_str(buf); > >In your following patch, you did the same conversion. Is it possible to merge >them into this one? > >> struct memory_block *mem = to_memory_block(dev); >>- int ret, online_type; >>+ int ret; >>+ >>+ if (online_type < 0) >>+ return -EINVAL; >> >> ret = lock_device_hotplug_sysfs(); >> if (ret) >> return ret; >> >>- if (sysfs_streq(buf, "online_kernel")) >>- online_type = MMOP_ONLINE_KERNEL; >>- else if (sysfs_streq(buf, "online_movable")) >>- online_type = MMOP_ONLINE_MOVABLE; >>- else if (sysfs_streq(buf, "online")) >>- online_type = MMOP_ONLINE; >>- else if (sysfs_streq(buf, "offline")) >>- online_type = MMOP_OFFLINE; >>- else { >>- ret = -EINVAL; >>- goto err; >>- } >>- >> switch (online_type) { >> case MMOP_ONLINE_KERNEL: >> case MMOP_ONLINE_MOVABLE: >>@@ -271,7 +280,6 @@ static ssize_t state_store(struct device *dev, struct >>device_attribute *attr, >> ret = -EINVAL; /* should never happen */ >> } >> >>-err: >> unlock_device_hotplug(); >> >> if (ret < 0) >>-- >>2.24.1 > >-- >Wei Yang >Help you, Help me -- Wei Yang Help you, Help me
Re: [PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type
On Wed, Mar 11, 2020 at 01:30:26PM +0100, David Hildenbrand wrote: >For now, distributions implement advanced udev rules to essentially >- Don't online any hotplugged memory (s390x) >- Online all memory to ZONE_NORMAL (e.g., most virt environments like > hyperv) >- Online all memory to ZONE_MOVABLE in case the zone imbalance is taken > care of (e.g., bare metal, special virt environments) > >In summary: All memory is usually onlined the same way, however, the >kernel always has to ask userspace to come up with the same answer. >E.g., HyperV always waits for a memory block to get onlined before >continuing, otherwise it might end up adding memory faster than >hotplugging it, which can result in strange OOM situations. > >Let's allow to specify a default online_type, not just "online" and >"offline". This allows distributions to configure the default online_type >when booting up and be done with it. > >We can now specify "offline", "online", "online_movable" and >"online_kernel" via >- "memhp_default_state=" on the kernel cmdline >- /sys/devices/systemn/memory/auto_online_blocks >just like we are able to specify for a single memory block via >/sys/devices/systemn/memory/memoryX/state > >Cc: Greg Kroah-Hartman >Cc: Andrew Morton >Cc: Michal Hocko >Cc: Oscar Salvador >Cc: "Rafael J. Wysocki" >Cc: Baoquan He >Cc: Wei Yang >Signed-off-by: David Hildenbrand Ok, I got the reason to leave the change on string compare here. Reviewed-by: Wei Yang >--- > drivers/base/memory.c | 11 +-- > include/linux/memory_hotplug.h | 2 ++ > mm/memory_hotplug.c| 8 > 3 files changed, 11 insertions(+), 10 deletions(-) > >diff --git a/drivers/base/memory.c b/drivers/base/memory.c >index 8d3e16dab69f..2b09b68b9f78 100644 >--- a/drivers/base/memory.c >+++ b/drivers/base/memory.c >@@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = { > [MMOP_ONLINE_MOVABLE] = "online_movable", > }; > >-static int memhp_online_type_from_str(const char *str) >+int memhp_online_type_from_str(const char *str) > { > int i; > >@@ -394,13 +394,12 @@ static ssize_t auto_online_blocks_store(struct device >*dev, > struct device_attribute *attr, > const char *buf, size_t count) > { >- if (sysfs_streq(buf, "online")) >- memhp_default_online_type = MMOP_ONLINE; >- else if (sysfs_streq(buf, "offline")) >- memhp_default_online_type = MMOP_OFFLINE; >- else >+ const int online_type = memhp_online_type_from_str(buf); >+ >+ if (online_type < 0) > return -EINVAL; > >+ memhp_default_online_type = online_type; > return count; > } > >diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h >index c6e090b34c4b..ef55115320fb 100644 >--- a/include/linux/memory_hotplug.h >+++ b/include/linux/memory_hotplug.h >@@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, > struct mhp_restrictions *restrictions); > extern u64 max_mem_size; > >+extern int memhp_online_type_from_str(const char *str); >+ > /* Default online_type (MMOP_*) when new memory blocks are added. */ > extern int memhp_default_online_type; > /* If movable_node boot option specified */ >diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c >index 01443c70aa27..4a96273eafa7 100644 >--- a/mm/memory_hotplug.c >+++ b/mm/memory_hotplug.c >@@ -75,10 +75,10 @@ EXPORT_SYMBOL_GPL(memhp_default_online_type); > > static int __init setup_memhp_default_state(char *str) > { >- if (!strcmp(str, "online")) >- memhp_default_online_type = MMOP_ONLINE; >- else if (!strcmp(str, "offline")) >- memhp_default_online_type = MMOP_OFFLINE; >+ const int online_type = memhp_online_type_from_str(str); >+ >+ if (online_type >= 0) >+ memhp_default_online_type = online_type; > > return 1; > } >-- >2.24.1 -- Wei Yang Help you, Help me
Re: [PATCH v1 4/5] mm/memory_hotplug: convert memhp_auto_online to store an online_type
On Wed, Mar 11, 2020 at 01:30:25PM +0100, David Hildenbrand wrote: >... and rename it to memhp_default_online_type. This is a preparation >for more detailed default online behavior. > >Cc: Greg Kroah-Hartman >Cc: Andrew Morton >Cc: Michal Hocko >Cc: Oscar Salvador >Cc: "Rafael J. Wysocki" >Cc: Baoquan He >Cc: Wei Yang >Cc: Benjamin Herrenschmidt >Cc: Paul Mackerras >Cc: Michael Ellerman >Cc: "K. Y. Srinivasan" >Cc: Haiyang Zhang >Cc: Stephen Hemminger >Cc: Wei Liu >Cc: Thomas Gleixner >Cc: linuxppc-dev@lists.ozlabs.org >Cc: linux-hyp...@vger.kernel.org >Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang >--- > arch/powerpc/platforms/powernv/memtrace.c | 2 +- > drivers/base/memory.c | 10 -- > drivers/hv/hv_balloon.c | 2 +- > include/linux/memory_hotplug.h| 3 ++- > mm/memory_hotplug.c | 13 +++-- > 5 files changed, 15 insertions(+), 15 deletions(-) > >diff --git a/arch/powerpc/platforms/powernv/memtrace.c >b/arch/powerpc/platforms/powernv/memtrace.c >index d6d64f8718e6..e15a600cfa4d 100644 >--- a/arch/powerpc/platforms/powernv/memtrace.c >+++ b/arch/powerpc/platforms/powernv/memtrace.c >@@ -235,7 +235,7 @@ static int memtrace_online(void) >* If kernel isn't compiled with the auto online option >* we need to online the memory ourselves. >*/ >- if (!memhp_auto_online) { >+ if (memhp_default_online_type == MMOP_OFFLINE) { > lock_device_hotplug(); > walk_memory_blocks(ent->start, ent->size, NULL, > online_mem_block); >diff --git a/drivers/base/memory.c b/drivers/base/memory.c >index 8a7f29c0bf97..8d3e16dab69f 100644 >--- a/drivers/base/memory.c >+++ b/drivers/base/memory.c >@@ -386,10 +386,8 @@ static DEVICE_ATTR_RO(block_size_bytes); > static ssize_t auto_online_blocks_show(struct device *dev, > struct device_attribute *attr, char *buf) > { >- if (memhp_auto_online) >- return sprintf(buf, "online\n"); >- else >- return sprintf(buf, "offline\n"); >+ return sprintf(buf, "%s\n", >+ online_type_to_str[memhp_default_online_type]); > } > > static ssize_t auto_online_blocks_store(struct device *dev, >@@ -397,9 +395,9 @@ static ssize_t auto_online_blocks_store(struct device *dev, > const char *buf, size_t count) > { > if (sysfs_streq(buf, "online")) >- memhp_auto_online = true; >+ memhp_default_online_type = MMOP_ONLINE; > else if (sysfs_streq(buf, "offline")) >- memhp_auto_online = false; >+ memhp_default_online_type = MMOP_OFFLINE; > else > return -EINVAL; > >diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c >index a02ce43d778d..3b90fd12e0c5 100644 >--- a/drivers/hv/hv_balloon.c >+++ b/drivers/hv/hv_balloon.c >@@ -727,7 +727,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned >long size, > spin_unlock_irqrestore(&dm_device.ha_lock, flags); > > init_completion(&dm_device.ol_waitevent); >- dm_device.ha_waiting = !memhp_auto_online; >+ dm_device.ha_waiting = memhp_default_online_type == >MMOP_OFFLINE; > > nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); > ret = add_memory(nid, PFN_PHYS((start_pfn)), >diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h >index c2e06ed5e0e9..c6e090b34c4b 100644 >--- a/include/linux/memory_hotplug.h >+++ b/include/linux/memory_hotplug.h >@@ -117,7 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, > struct mhp_restrictions *restrictions); > extern u64 max_mem_size; > >-extern bool memhp_auto_online; >+/* Default online_type (MMOP_*) when new memory blocks are added. */ >+extern int memhp_default_online_type; > /* If movable_node boot option specified */ > extern bool movable_node_enabled; > static inline bool movable_node_is_enabled(void) >diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c >index 1a00b5a37ef6..01443c70aa27 100644 >--- a/mm/memory_hotplug.c >+++ b/mm/memory_hotplug.c >@@ -67,18 +67,18 @@ void put_online_mems(void) > bool movable_node_enabled = false; > > #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE >-bool memhp_auto_online; >+int memhp_default_online_type = MMOP_OFFLINE; > #else >-bool memhp_auto_online = true; >+int memhp_default_online_type = MMOP_ONLINE; > #endif >-EXPORT_SYMBOL_GPL(memhp_auto_online); >+EXPORT_SYMBOL_GPL(memhp_default_online_type); > > static int __init setup_memhp_default_state(char *str) > { > if (!strcmp(str, "online")) >- memhp_auto_online = true; >+ memhp_default_online_type = MMOP_ONLINE; > else if (!strcmp(str, "offline")) >- memh
Re: [PATCH v1 3/5] drivers/base/memory: store mapping between MMOP_* and string in an array
On Wed, Mar 11, 2020 at 01:30:24PM +0100, David Hildenbrand wrote: >Let's use a simple array which we can reuse soon. While at it, move the >string->mmop conversion out of the device hotplug lock. > >Cc: Greg Kroah-Hartman >Cc: Andrew Morton >Cc: Michal Hocko >Cc: Oscar Salvador >Cc: "Rafael J. Wysocki" >Cc: Baoquan He >Cc: Wei Yang >Signed-off-by: David Hildenbrand >--- > drivers/base/memory.c | 38 +++--- > 1 file changed, 23 insertions(+), 15 deletions(-) > >diff --git a/drivers/base/memory.c b/drivers/base/memory.c >index e7e77cafef80..8a7f29c0bf97 100644 >--- a/drivers/base/memory.c >+++ b/drivers/base/memory.c >@@ -28,6 +28,24 @@ > > #define MEMORY_CLASS_NAME "memory" > >+static const char *const online_type_to_str[] = { >+ [MMOP_OFFLINE] = "offline", >+ [MMOP_ONLINE] = "online", >+ [MMOP_ONLINE_KERNEL] = "online_kernel", >+ [MMOP_ONLINE_MOVABLE] = "online_movable", >+}; >+ >+static int memhp_online_type_from_str(const char *str) >+{ >+ int i; >+ >+ for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) { >+ if (sysfs_streq(str, online_type_to_str[i])) >+ return i; >+ } >+ return -EINVAL; >+} >+ > #define to_memory_block(dev) container_of(dev, struct memory_block, dev) > > static int sections_per_block; >@@ -236,26 +254,17 @@ static int memory_subsys_offline(struct device *dev) > static ssize_t state_store(struct device *dev, struct device_attribute *attr, > const char *buf, size_t count) > { >+ const int online_type = memhp_online_type_from_str(buf); In your following patch, you did the same conversion. Is it possible to merge them into this one? > struct memory_block *mem = to_memory_block(dev); >- int ret, online_type; >+ int ret; >+ >+ if (online_type < 0) >+ return -EINVAL; > > ret = lock_device_hotplug_sysfs(); > if (ret) > return ret; > >- if (sysfs_streq(buf, "online_kernel")) >- online_type = MMOP_ONLINE_KERNEL; >- else if (sysfs_streq(buf, "online_movable")) >- online_type = MMOP_ONLINE_MOVABLE; >- else if (sysfs_streq(buf, "online")) >- online_type = MMOP_ONLINE; >- else if (sysfs_streq(buf, "offline")) >- online_type = MMOP_OFFLINE; >- else { >- ret = -EINVAL; >- goto err; >- } >- > switch (online_type) { > case MMOP_ONLINE_KERNEL: > case MMOP_ONLINE_MOVABLE: >@@ -271,7 +280,6 @@ static ssize_t state_store(struct device *dev, struct >device_attribute *attr, > ret = -EINVAL; /* should never happen */ > } > >-err: > unlock_device_hotplug(); > > if (ret < 0) >-- >2.24.1 -- Wei Yang Help you, Help me
Re: [PATCH v1 2/5] drivers/base/memory: map MMOP_OFFLINE to 0
On Wed, Mar 11, 2020 at 01:30:23PM +0100, David Hildenbrand wrote: >I have no idea why we have to start at -1. Just treat 0 as the special >case. Clarify a comment (which was wrong, when we come via >device_online() the first time, the online_type would have been 0 / >MEM_ONLINE). The default is now always MMOP_OFFLINE. > >This is a preparation to use the online_type as an array index. > >Cc: Greg Kroah-Hartman >Cc: Andrew Morton >Cc: Michal Hocko >Cc: Oscar Salvador >Cc: "Rafael J. Wysocki" >Cc: Baoquan He >Cc: Wei Yang >Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang >--- > drivers/base/memory.c | 11 --- > include/linux/memory_hotplug.h | 2 +- > 2 files changed, 5 insertions(+), 8 deletions(-) > >diff --git a/drivers/base/memory.c b/drivers/base/memory.c >index 8c5ce42c0fc3..e7e77cafef80 100644 >--- a/drivers/base/memory.c >+++ b/drivers/base/memory.c >@@ -211,17 +211,14 @@ static int memory_subsys_online(struct device *dev) > return 0; > > /* >- * If we are called from state_store(), online_type will be >- * set >= 0 Otherwise we were called from the device online >- * attribute and need to set the online_type. >+ * When called via device_online() without configuring the online_type, >+ * we want to default to MMOP_ONLINE. >*/ >- if (mem->online_type < 0) >+ if (mem->online_type == MMOP_OFFLINE) > mem->online_type = MMOP_ONLINE; > > ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); >- >- /* clear online_type */ >- mem->online_type = -1; >+ mem->online_type = MMOP_OFFLINE; > > return ret; > } >diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h >index 261dbf010d5d..c2e06ed5e0e9 100644 >--- a/include/linux/memory_hotplug.h >+++ b/include/linux/memory_hotplug.h >@@ -48,7 +48,7 @@ enum { > /* Types for control the zone type of onlined and offlined memory */ > enum { > /* Offline the memory. */ >- MMOP_OFFLINE = -1, >+ MMOP_OFFLINE = 0, > /* Online the memory. Zone depends, see default_zone_for_pfn(). */ > MMOP_ONLINE, > /* Online the memory to ZONE_NORMAL. */ >-- >2.24.1 -- Wei Yang Help you, Help me
Re: [PATCH v1 1/5] drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE
On Wed, Mar 11, 2020 at 01:30:22PM +0100, David Hildenbrand wrote: >The name is misleading. Let's just name it like the online_type name we >expose to user space ("online"). > >Add some documentation to the types. > >Cc: Greg Kroah-Hartman >Cc: Andrew Morton >Cc: Michal Hocko >Cc: Oscar Salvador >Cc: "Rafael J. Wysocki" >Cc: Baoquan He >Cc: Wei Yang >Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang >--- > drivers/base/memory.c | 9 + > include/linux/memory_hotplug.h | 6 +- > 2 files changed, 10 insertions(+), 5 deletions(-) > >diff --git a/drivers/base/memory.c b/drivers/base/memory.c >index 6448c9ece2cb..8c5ce42c0fc3 100644 >--- a/drivers/base/memory.c >+++ b/drivers/base/memory.c >@@ -216,7 +216,7 @@ static int memory_subsys_online(struct device *dev) >* attribute and need to set the online_type. >*/ > if (mem->online_type < 0) >- mem->online_type = MMOP_ONLINE_KEEP; >+ mem->online_type = MMOP_ONLINE; > > ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); > >@@ -251,7 +251,7 @@ static ssize_t state_store(struct device *dev, struct >device_attribute *attr, > else if (sysfs_streq(buf, "online_movable")) > online_type = MMOP_ONLINE_MOVABLE; > else if (sysfs_streq(buf, "online")) >- online_type = MMOP_ONLINE_KEEP; >+ online_type = MMOP_ONLINE; > else if (sysfs_streq(buf, "offline")) > online_type = MMOP_OFFLINE; > else { >@@ -262,7 +262,7 @@ static ssize_t state_store(struct device *dev, struct >device_attribute *attr, > switch (online_type) { > case MMOP_ONLINE_KERNEL: > case MMOP_ONLINE_MOVABLE: >- case MMOP_ONLINE_KEEP: >+ case MMOP_ONLINE: > /* mem->online_type is protected by device_hotplug_lock */ > mem->online_type = online_type; > ret = device_online(&mem->dev); >@@ -342,7 +342,8 @@ static ssize_t valid_zones_show(struct device *dev, > } > > nid = mem->nid; >- default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, >nr_pages); >+ default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn, >+nr_pages); > strcat(buf, default_zone->name); > > print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, >diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h >index f4d59155f3d4..261dbf010d5d 100644 >--- a/include/linux/memory_hotplug.h >+++ b/include/linux/memory_hotplug.h >@@ -47,9 +47,13 @@ enum { > > /* Types for control the zone type of onlined and offlined memory */ > enum { >+ /* Offline the memory. */ > MMOP_OFFLINE = -1, >- MMOP_ONLINE_KEEP, >+ /* Online the memory. Zone depends, see default_zone_for_pfn(). */ >+ MMOP_ONLINE, >+ /* Online the memory to ZONE_NORMAL. */ > MMOP_ONLINE_KERNEL, >+ /* Online the memory to ZONE_MOVABLE. */ > MMOP_ONLINE_MOVABLE, > }; > >-- >2.24.1 -- Wei Yang Help you, Help me
[PATCH] ocxl: control via sysfs whether the FPGA is reloaded on a link reset
Some opencapi FPGA images allow to control if the FPGA should be reloaded on the next adapter reset. If it is supported, the image specifies it through a Vendor Specific DVSEC in the config space of function 0. This patch adds an interface to sysfs to control that behavior, if possible. Signed-off-by: Philippe Bergheaud --- Documentation/ABI/testing/sysfs-class-ocxl | 10 drivers/misc/ocxl/config.c | 59 +- drivers/misc/ocxl/ocxl_internal.h | 6 +++ drivers/misc/ocxl/sysfs.c | 35 + include/misc/ocxl-config.h | 1 + 5 files changed, 110 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-class-ocxl b/Documentation/ABI/testing/sysfs-class-ocxl index b5b1fa197592..b9ea671d5805 100644 --- a/Documentation/ABI/testing/sysfs-class-ocxl +++ b/Documentation/ABI/testing/sysfs-class-ocxl @@ -33,3 +33,13 @@ Date:January 2018 Contact: linuxppc-dev@lists.ozlabs.org Description: read/write Give access the global mmio area for the AFU + +What: /sys/class/ocxl//reload_on_reset +Date: February 2020 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read/write + Control whether the FPGA is reloaded on a link reset + 0 Do not reload FPGA image from flash + 1 Reload FPGA image from flash + unavailable + The device does not support this capability diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index c8e19bfb5ef9..3488463c1640 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -71,6 +71,20 @@ static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx) return 0; } +/** + * get_function_0() - Find a related PCI device (function 0) + * @device: PCI device to match + * + * Returns a pointer to the related device, or null if not found + */ +static struct pci_dev *get_function_0(struct pci_dev *dev) +{ + unsigned int devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + + return pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus), + dev->bus->number, devfn); +} + static void read_pasid(struct pci_dev *dev, struct ocxl_fn_config *fn) { u16 val; @@ -159,7 +173,7 @@ static int read_dvsec_afu_info(struct pci_dev *dev, struct ocxl_fn_config *fn) static int read_dvsec_vendor(struct pci_dev *dev) { int pos; - u32 cfg, tlx, dlx; + u32 cfg, tlx, dlx, reset_reload; /* * vendor specific DVSEC is optional @@ -178,11 +192,54 @@ static int read_dvsec_vendor(struct pci_dev *dev) pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_CFG_VERS, &cfg); pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_TLX_VERS, &tlx); pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_DLX_VERS, &dlx); + pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, &reset_reload); dev_dbg(&dev->dev, "Vendor specific DVSEC:\n"); dev_dbg(&dev->dev, " CFG version = 0x%x\n", cfg); dev_dbg(&dev->dev, " TLX version = 0x%x\n", tlx); dev_dbg(&dev->dev, " DLX version = 0x%x\n", dlx); + dev_dbg(&dev->dev, " ResetReload = 0x%x\n", reset_reload); + return 0; +} + +int ocxl_config_get_reset_reload(struct pci_dev *dev, int *val) +{ + int reset_reload = -1; + int pos = 0; + struct pci_dev *dev0 = get_function_0(dev); + + if (dev0) + pos = find_dvsec(dev0, OCXL_DVSEC_VENDOR_ID); + + if (pos) + pci_read_config_dword(dev0, + pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, + &reset_reload); + if (reset_reload == -1) + return reset_reload; + + *val = reset_reload & BIT(0); + return 0; +} + +int ocxl_config_set_reset_reload(struct pci_dev *dev, int val) +{ + int reset_reload = -1; + int pos = 0; + struct pci_dev *dev0 = get_function_0(dev); + + if (dev0) + pos = find_dvsec(dev0, OCXL_DVSEC_VENDOR_ID); + + if (pos) + pci_read_config_dword(dev0, + pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, + &reset_reload); + if (reset_reload == -1) + return reset_reload; + + val &= BIT(0); + pci_write_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, val); return 0; } diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index 345bf843a38e..af9a84aeee6f 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -112,6 +112,12 @@ void ocxl_actag_afu_free(struct ocxl_fn *fn, u32 start, u32 size); */ int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count); +/* + * Con
[PATCH v1 5/5] mm/memory_hotplug: allow to specify a default online_type
For now, distributions implement advanced udev rules to essentially - Don't online any hotplugged memory (s390x) - Online all memory to ZONE_NORMAL (e.g., most virt environments like hyperv) - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken care of (e.g., bare metal, special virt environments) In summary: All memory is usually onlined the same way, however, the kernel always has to ask userspace to come up with the same answer. E.g., HyperV always waits for a memory block to get onlined before continuing, otherwise it might end up adding memory faster than hotplugging it, which can result in strange OOM situations. Let's allow to specify a default online_type, not just "online" and "offline". This allows distributions to configure the default online_type when booting up and be done with it. We can now specify "offline", "online", "online_movable" and "online_kernel" via - "memhp_default_state=" on the kernel cmdline - /sys/devices/systemn/memory/auto_online_blocks just like we are able to specify for a single memory block via /sys/devices/systemn/memory/memoryX/state Cc: Greg Kroah-Hartman Cc: Andrew Morton Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Baoquan He Cc: Wei Yang Signed-off-by: David Hildenbrand --- drivers/base/memory.c | 11 +-- include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c| 8 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8d3e16dab69f..2b09b68b9f78 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = { [MMOP_ONLINE_MOVABLE] = "online_movable", }; -static int memhp_online_type_from_str(const char *str) +int memhp_online_type_from_str(const char *str) { int i; @@ -394,13 +394,12 @@ static ssize_t auto_online_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - if (sysfs_streq(buf, "online")) - memhp_default_online_type = MMOP_ONLINE; - else if (sysfs_streq(buf, "offline")) - memhp_default_online_type = MMOP_OFFLINE; - else + const int online_type = memhp_online_type_from_str(buf); + + if (online_type < 0) return -EINVAL; + memhp_default_online_type = online_type; return count; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c6e090b34c4b..ef55115320fb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; +extern int memhp_online_type_from_str(const char *str); + /* Default online_type (MMOP_*) when new memory blocks are added. */ extern int memhp_default_online_type; /* If movable_node boot option specified */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 01443c70aa27..4a96273eafa7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -75,10 +75,10 @@ EXPORT_SYMBOL_GPL(memhp_default_online_type); static int __init setup_memhp_default_state(char *str) { - if (!strcmp(str, "online")) - memhp_default_online_type = MMOP_ONLINE; - else if (!strcmp(str, "offline")) - memhp_default_online_type = MMOP_OFFLINE; + const int online_type = memhp_online_type_from_str(str); + + if (online_type >= 0) + memhp_default_online_type = online_type; return 1; } -- 2.24.1
[PATCH v1 4/5] mm/memory_hotplug: convert memhp_auto_online to store an online_type
... and rename it to memhp_default_online_type. This is a preparation for more detailed default online behavior. Cc: Greg Kroah-Hartman Cc: Andrew Morton Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Baoquan He Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-hyp...@vger.kernel.org Signed-off-by: David Hildenbrand --- arch/powerpc/platforms/powernv/memtrace.c | 2 +- drivers/base/memory.c | 10 -- drivers/hv/hv_balloon.c | 2 +- include/linux/memory_hotplug.h| 3 ++- mm/memory_hotplug.c | 13 +++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index d6d64f8718e6..e15a600cfa4d 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -235,7 +235,7 @@ static int memtrace_online(void) * If kernel isn't compiled with the auto online option * we need to online the memory ourselves. */ - if (!memhp_auto_online) { + if (memhp_default_online_type == MMOP_OFFLINE) { lock_device_hotplug(); walk_memory_blocks(ent->start, ent->size, NULL, online_mem_block); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8a7f29c0bf97..8d3e16dab69f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -386,10 +386,8 @@ static DEVICE_ATTR_RO(block_size_bytes); static ssize_t auto_online_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { - if (memhp_auto_online) - return sprintf(buf, "online\n"); - else - return sprintf(buf, "offline\n"); + return sprintf(buf, "%s\n", + online_type_to_str[memhp_default_online_type]); } static ssize_t auto_online_blocks_store(struct device *dev, @@ -397,9 +395,9 @@ static ssize_t auto_online_blocks_store(struct device *dev, const char *buf, size_t count) { if (sysfs_streq(buf, "online")) - memhp_auto_online = true; + memhp_default_online_type = MMOP_ONLINE; else if (sysfs_streq(buf, "offline")) - memhp_auto_online = false; + memhp_default_online_type = MMOP_OFFLINE; else return -EINVAL; diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index a02ce43d778d..3b90fd12e0c5 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -727,7 +727,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, spin_unlock_irqrestore(&dm_device.ha_lock, flags); init_completion(&dm_device.ol_waitevent); - dm_device.ha_waiting = !memhp_auto_online; + dm_device.ha_waiting = memhp_default_online_type == MMOP_OFFLINE; nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c2e06ed5e0e9..c6e090b34c4b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -117,7 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; -extern bool memhp_auto_online; +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int memhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1a00b5a37ef6..01443c70aa27 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,18 +67,18 @@ void put_online_mems(void) bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -bool memhp_auto_online; +int memhp_default_online_type = MMOP_OFFLINE; #else -bool memhp_auto_online = true; +int memhp_default_online_type = MMOP_ONLINE; #endif -EXPORT_SYMBOL_GPL(memhp_auto_online); +EXPORT_SYMBOL_GPL(memhp_default_online_type); static int __init setup_memhp_default_state(char *str) { if (!strcmp(str, "online")) - memhp_auto_online = true; + memhp_default_online_type = MMOP_ONLINE; else if (!strcmp(str, "offline")) - memhp_auto_online = false; + memhp_default_online_type = MMOP_OFFLINE; return 1; } @@ -991,6 +991,7 @@ static int check_hotplug_memory_range(u64 start, u64 size
[PATCH v1 0/5] mm/memory_hotplug: allow to specify a default online_type
Distributions nowadays use udev rules ([1] [2]) to specify if and how to online hotplugged memory. The rules seem to get more complex with many special cases. Due to the various special cases, CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE cannot be used. All memory hotplug is handled via udev rules. Everytime we hotplug memory, the udev rule will come to the same conclusion. Especially Hyper-V (but also soon virtio-mem) add a lot of memory in separate memory blocks and wait for memory to get onlined by user space before continuing to add more memory blocks (to not add memory faster than it is getting onlined). This of course slows down the whole memory hotplug process. To make the job of distributions easier and to avoid udev rules that get more and more complicated, let's extend the mechanism provided by - /sys/devices/system/memory/auto_online_blocks - "memhp_default_state=" on the kernel cmdline to be able to specify also "online_movable" as well as "online_kernel" === Example /usr/libexec/config-memhotplug === #!/bin/bash VIRT=`systemd-detect-virt --vm` ARCH=`uname -p` sense_virtio_mem() { if [ -d "/sys/bus/virtio/drivers/virtio_mem/" ]; then DEVICES=`find /sys/bus/virtio/drivers/virtio_mem/ -maxdepth 1 -type l | wc -l` if [ $DEVICES != "0" ]; then return 0 fi fi return 1 } if [ ! -e "/sys/devices/system/memory/auto_online_blocks" ]; then echo "Memory hotplug configuration support missing in the kernel" exit 1 fi if grep "memhp_default_state=" /proc/cmdline > /dev/null; then echo "Memory hotplug configuration overridden in kernel cmdline (memhp_default_state=)" exit 1 fi if [ $VIRT == "microsoft" ]; then echo "Detected Hyper-V on $ARCH" # Hyper-V wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif sense_virtio_mem; then echo "Detected virtio-mem on $ARCH" # virtio-mem wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif [ $ARCH == "s390x" ] || [ $ARCH == "s390" ]; then echo "Detected $ARCH" # standby memory should not be onlined automatically ONLINE_TYPE="offline" elif [ $ARCH == "ppc64" ] || [ $ARCH == "ppc64le" ]; then echo "Detected" $ARCH # PPC64 is handled via a user space daemon AFAIK ONLINE_TYPE="offline" elif [ $VIRT == "none" ]; then echo "Detected bare-metal on $ARCH" # Bare metal users expect hotplugged memory to be unpluggable. We assume # that ZONE imbalances on such enterpise servers cannot happen and is # properly documented ONLINE_TYPE="online_movable" else # TODO: Hypervisors that want to unplug DIMMs and can guarantee that ZONE # imbalances won't happen echo "Detected $VIRT on $ARCH" # Usually, ballooning is used in virtual environments, so memory should go to # ZONE_NORMAL. However, sometimes "movable_node" is relevant. ONLINE_TYPE="online" fi echo "Selected online_type:" $ONLINE_TYPE # Configure what to do with memory that will be hotplugged in the future echo $ONLINE_TYPE 2>/dev/null > /sys/devices/system/memory/auto_online_blocks if [ $? != "0" ]; then echo "Memory hotplug cannot be configured (e.g., old kernel or missing permissions)" # A backup udev rule should handle old kernels if necessary exit 1 fi # Process all already pluggedd blocks (e.g., DIMMs, but also Hyper-V or virtio-mem) if [ $ONLINE_TYPE != "offline" ]; then for MEMORY in /sys/devices/system/memory/memory*; do STATE=`cat $MEMORY/state` if [ $STATE == "offline" ]; then echo $ONLINE_TYPE > $MEMORY/state fi done fi === Example /usr/lib/systemd/system/config-memhotplug.service === [Unit] Description=Configure memory hotplug behavior DefaultDependencies=no Conflicts=shutdown.target Before=sysinit.target shutdown.target After=systemd-modules-load.service ConditionPathExists=|/sys/devices/system/memory/auto_online_blocks [Service] ExecStart=/usr/libexec/config-memhotplug Type=oneshot TimeoutSec=0 RemainAfterExit=yes [Install] WantedBy=sysinit.target === Example modification to the 40-redhat.rules [2] === diff --git a/40-redhat.rules b/40-redhat.rules-new index 2c690e5..168fd03 100644 --- a/40-redhat.rules +++ b/40-redhat.rules-new @@ -6,6 +6,9 @@ SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online} # Memory hotadd request SUBSYSTEM!="memory", GOTO="memory_hotplug_end" ACTION!="add", GOTO="memory_hotplug_end" +# memory hotplug behavior configured +PROGRAM=="grep online /sys/devices/system/memory/auto_online_blocks", GOTO="memory_hotplug_end" + PROGRAM="/bin/uname -p", RESULT=="s390*", GOTO="memory_hotplug_end" ENV{.state}="online" === [1] https://github.com/lnykryn/systemd-rhel/pull/281 [2] https://github.com/lnykryn/systemd-rhel/blob/staging/rules/40-redhat.rules Cc: Vitaly Kuznetsov Cc: Yumei Huang Cc: Igor Mammedov Cc: Baoquan He Cc: Eduardo Habkost Cc: Milan Zamazal David Hildenbrand (5): drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE drivers/base/memory: map MMOP_OFFLINE to 0 drivers/base/memory: store ma
[PATCH v1 2/5] drivers/base/memory: map MMOP_OFFLINE to 0
I have no idea why we have to start at -1. Just treat 0 as the special case. Clarify a comment (which was wrong, when we come via device_online() the first time, the online_type would have been 0 / MEM_ONLINE). The default is now always MMOP_OFFLINE. This is a preparation to use the online_type as an array index. Cc: Greg Kroah-Hartman Cc: Andrew Morton Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Baoquan He Cc: Wei Yang Signed-off-by: David Hildenbrand --- drivers/base/memory.c | 11 --- include/linux/memory_hotplug.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8c5ce42c0fc3..e7e77cafef80 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -211,17 +211,14 @@ static int memory_subsys_online(struct device *dev) return 0; /* -* If we are called from state_store(), online_type will be -* set >= 0 Otherwise we were called from the device online -* attribute and need to set the online_type. +* When called via device_online() without configuring the online_type, +* we want to default to MMOP_ONLINE. */ - if (mem->online_type < 0) + if (mem->online_type == MMOP_OFFLINE) mem->online_type = MMOP_ONLINE; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); - - /* clear online_type */ - mem->online_type = -1; + mem->online_type = MMOP_OFFLINE; return ret; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 261dbf010d5d..c2e06ed5e0e9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -48,7 +48,7 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { /* Offline the memory. */ - MMOP_OFFLINE = -1, + MMOP_OFFLINE = 0, /* Online the memory. Zone depends, see default_zone_for_pfn(). */ MMOP_ONLINE, /* Online the memory to ZONE_NORMAL. */ -- 2.24.1
[PATCH v1 3/5] drivers/base/memory: store mapping between MMOP_* and string in an array
Let's use a simple array which we can reuse soon. While at it, move the string->mmop conversion out of the device hotplug lock. Cc: Greg Kroah-Hartman Cc: Andrew Morton Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Baoquan He Cc: Wei Yang Signed-off-by: David Hildenbrand --- drivers/base/memory.c | 38 +++--- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e7e77cafef80..8a7f29c0bf97 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -28,6 +28,24 @@ #define MEMORY_CLASS_NAME "memory" +static const char *const online_type_to_str[] = { + [MMOP_OFFLINE] = "offline", + [MMOP_ONLINE] = "online", + [MMOP_ONLINE_KERNEL] = "online_kernel", + [MMOP_ONLINE_MOVABLE] = "online_movable", +}; + +static int memhp_online_type_from_str(const char *str) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) { + if (sysfs_streq(str, online_type_to_str[i])) + return i; + } + return -EINVAL; +} + #define to_memory_block(dev) container_of(dev, struct memory_block, dev) static int sections_per_block; @@ -236,26 +254,17 @@ static int memory_subsys_offline(struct device *dev) static ssize_t state_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + const int online_type = memhp_online_type_from_str(buf); struct memory_block *mem = to_memory_block(dev); - int ret, online_type; + int ret; + + if (online_type < 0) + return -EINVAL; ret = lock_device_hotplug_sysfs(); if (ret) return ret; - if (sysfs_streq(buf, "online_kernel")) - online_type = MMOP_ONLINE_KERNEL; - else if (sysfs_streq(buf, "online_movable")) - online_type = MMOP_ONLINE_MOVABLE; - else if (sysfs_streq(buf, "online")) - online_type = MMOP_ONLINE; - else if (sysfs_streq(buf, "offline")) - online_type = MMOP_OFFLINE; - else { - ret = -EINVAL; - goto err; - } - switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: @@ -271,7 +280,6 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, ret = -EINVAL; /* should never happen */ } -err: unlock_device_hotplug(); if (ret < 0) -- 2.24.1
[PATCH v1 1/5] drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE
The name is misleading. Let's just name it like the online_type name we expose to user space ("online"). Add some documentation to the types. Cc: Greg Kroah-Hartman Cc: Andrew Morton Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Baoquan He Cc: Wei Yang Signed-off-by: David Hildenbrand --- drivers/base/memory.c | 9 + include/linux/memory_hotplug.h | 6 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6448c9ece2cb..8c5ce42c0fc3 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -216,7 +216,7 @@ static int memory_subsys_online(struct device *dev) * attribute and need to set the online_type. */ if (mem->online_type < 0) - mem->online_type = MMOP_ONLINE_KEEP; + mem->online_type = MMOP_ONLINE; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); @@ -251,7 +251,7 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, else if (sysfs_streq(buf, "online_movable")) online_type = MMOP_ONLINE_MOVABLE; else if (sysfs_streq(buf, "online")) - online_type = MMOP_ONLINE_KEEP; + online_type = MMOP_ONLINE; else if (sysfs_streq(buf, "offline")) online_type = MMOP_OFFLINE; else { @@ -262,7 +262,7 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: - case MMOP_ONLINE_KEEP: + case MMOP_ONLINE: /* mem->online_type is protected by device_hotplug_lock */ mem->online_type = online_type; ret = device_online(&mem->dev); @@ -342,7 +342,8 @@ static ssize_t valid_zones_show(struct device *dev, } nid = mem->nid; - default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages); + default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn, + nr_pages); strcat(buf, default_zone->name); print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f4d59155f3d4..261dbf010d5d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -47,9 +47,13 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { + /* Offline the memory. */ MMOP_OFFLINE = -1, - MMOP_ONLINE_KEEP, + /* Online the memory. Zone depends, see default_zone_for_pfn(). */ + MMOP_ONLINE, + /* Online the memory to ZONE_NORMAL. */ MMOP_ONLINE_KERNEL, + /* Online the memory to ZONE_MOVABLE. */ MMOP_ONLINE_MOVABLE, }; -- 2.24.1
[PATCH 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline
Currently Linux kernel with CONFIG_NUMA on a system with multiple possible nodes, marks node 0 as online at boot. However in practice, there are systems which have node 0 as memoryless and cpuless. This can cause numa_balancing to be enabled on systems with only one node with memory and CPUs. The existence of this dummy node which is cpuless and memoryless node can confuse users/scripts looking at output of lscpu / numactl. Lets stop assuming that Node 0 is always online. v5.6-rc4 available: 2 nodes (0,2) node 0 cpus: node 0 size: 0 MB node 0 free: 0 MB node 2 cpus: 0 1 2 3 4 5 6 7 node 2 size: 32625 MB node 2 free: 31490 MB node distances: node 0 2 0: 10 20 2: 20 10 proc and sys files -- /sys/devices/system/node/online:0,2 /proc/sys/kernel/numa_balancing:1 /sys/devices/system/node/has_cpu: 2 /sys/devices/system/node/has_memory:2 /sys/devices/system/node/has_normal_memory: 2 /sys/devices/system/node/possible: 0-31 v5.6-rc4 + patch -- available: 1 nodes (2) node 2 cpus: 0 1 2 3 4 5 6 7 node 2 size: 32625 MB node 2 free: 31487 MB node distances: node 2 2: 10 proc and sys files -- /sys/devices/system/node/online:2 /proc/sys/kernel/numa_balancing:0 /sys/devices/system/node/has_cpu: 2 /sys/devices/system/node/has_memory:2 /sys/devices/system/node/has_normal_memory: 2 /sys/devices/system/node/possible: 0-31 Cc: linuxppc-dev@lists.ozlabs.org Cc: linux...@kvack.org Cc: linux-ker...@vger.kernel.org Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Christopher Lameter Cc: Michael Ellerman Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Srikar Dronamraju --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c4eb75..68e635f4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -116,8 +116,10 @@ struct pcpu_drain { */ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, +#ifdef CONFIG_NUMA + [N_ONLINE] = NODE_MASK_NONE, +#else [N_ONLINE] = { { [0] = 1UL } }, -#ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, -- 1.8.3.1
Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus
On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote: > A Powerpc system with multiple possible nodes and with CONFIG_NUMA > enabled always used to have a node 0, even if node 0 does not any cpus > or memory attached to it. As per PAPR, node affinity of a cpu is only > available once its present / online. For all cpus that are possible but > not present, cpu_to_node() would point to node 0. > > To ensure a cpuless, memoryless dummy node is not online, powerpc need > to make sure all possible but not present cpu_to_node are set to a > proper node. Just curious, is this somehow related to http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz? > Cc: linuxppc-dev@lists.ozlabs.org > Cc: linux...@kvack.org > Cc: linux-ker...@vger.kernel.org > Cc: Michal Hocko > Cc: Mel Gorman > Cc: Vlastimil Babka > Cc: "Kirill A. Shutemov" > Cc: Christopher Lameter > Cc: Michael Ellerman > Cc: Andrew Morton > Cc: Linus Torvalds > Signed-off-by: Srikar Dronamraju > --- > arch/powerpc/mm/numa.c | 16 ++-- > 1 file changed, 14 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c > index 8a399db..54dcd49 100644 > --- a/arch/powerpc/mm/numa.c > +++ b/arch/powerpc/mm/numa.c > @@ -931,8 +931,20 @@ void __init mem_topology_setup(void) > > reset_numa_cpu_lookup_table(); > > - for_each_present_cpu(cpu) > - numa_setup_cpu(cpu); > + for_each_possible_cpu(cpu) { > + /* > + * Powerpc with CONFIG_NUMA always used to have a node 0, > + * even if it was memoryless or cpuless. For all cpus that > + * are possible but not present, cpu_to_node() would point > + * to node 0. To remove a cpuless, memoryless dummy node, > + * powerpc need to make sure all possible but not present > + * cpu_to_node are set to a proper node. > + */ > + if (cpu_present(cpu)) > + numa_setup_cpu(cpu); > + else > + set_cpu_numa_node(cpu, first_online_node); > + } > } > > void __init initmem_init(void) > -- > 1.8.3.1 -- Michal Hocko SUSE Labs
[PATCH 2/3] powerpc/numa: Prefer node id queried from vphn
Node id queried from the static device tree may not be correct. For example: it may always show 0 on a shared processor. Hence prefer the node id queried from vphn and fallback on the device tree based node id if vphn query fails. Cc: linuxppc-dev@lists.ozlabs.org Cc: linux...@kvack.org Cc: linux-ker...@vger.kernel.org Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Christopher Lameter Cc: Michael Ellerman Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Srikar Dronamraju --- arch/powerpc/mm/numa.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 54dcd49..8735fed 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -719,20 +719,20 @@ static int __init parse_numa_properties(void) */ for_each_present_cpu(i) { struct device_node *cpu; - int nid; - - cpu = of_get_cpu_node(i, NULL); - BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); - of_node_put(cpu); + int nid = vphn_get_nid(i); /* * Don't fall back to default_nid yet -- we will plug * cpus into nodes once the memory scan has discovered * the topology. */ - if (nid < 0) - continue; + if (nid == NUMA_NO_NODE) { + cpu = of_get_cpu_node(i, NULL); + if (cpu) { + nid = of_node_to_nid_single(cpu); + of_node_put(cpu); + } + } node_set_online(nid); } -- 1.8.3.1
[PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus
A Powerpc system with multiple possible nodes and with CONFIG_NUMA enabled always used to have a node 0, even if node 0 does not any cpus or memory attached to it. As per PAPR, node affinity of a cpu is only available once its present / online. For all cpus that are possible but not present, cpu_to_node() would point to node 0. To ensure a cpuless, memoryless dummy node is not online, powerpc need to make sure all possible but not present cpu_to_node are set to a proper node. Cc: linuxppc-dev@lists.ozlabs.org Cc: linux...@kvack.org Cc: linux-ker...@vger.kernel.org Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Christopher Lameter Cc: Michael Ellerman Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Srikar Dronamraju --- arch/powerpc/mm/numa.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 8a399db..54dcd49 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -931,8 +931,20 @@ void __init mem_topology_setup(void) reset_numa_cpu_lookup_table(); - for_each_present_cpu(cpu) - numa_setup_cpu(cpu); + for_each_possible_cpu(cpu) { + /* +* Powerpc with CONFIG_NUMA always used to have a node 0, +* even if it was memoryless or cpuless. For all cpus that +* are possible but not present, cpu_to_node() would point +* to node 0. To remove a cpuless, memoryless dummy node, +* powerpc need to make sure all possible but not present +* cpu_to_node are set to a proper node. +*/ + if (cpu_present(cpu)) + numa_setup_cpu(cpu); + else + set_cpu_numa_node(cpu, first_online_node); + } } void __init initmem_init(void) -- 1.8.3.1
[PATCH 0/3] Offline memoryless cpuless node 0
Linux kernel configured with CONFIG_NUMA on a system with multiple possible nodes, marks node 0 as online at boot. However in practice, there are systems which have node 0 as memoryless and cpuless. This can cause 1. numa_balancing to be enabled on systems with only one online node. 2. Existence of dummy (cpuless and memoryless) node which can confuse users/scripts looking at output of lscpu / numactl. This patchset wants to correct this anomaly. This should only affect systems that have CONFIG_MEMORYLESS_NODES. Currently there are only 2 architectures ia64 and powerpc that have this config. v5.6-rc4 available: 2 nodes (0,2) node 0 cpus: node 0 size: 0 MB node 0 free: 0 MB node 2 cpus: 0 1 2 3 4 5 6 7 node 2 size: 32625 MB node 2 free: 31490 MB node distances: node 0 2 0: 10 20 2: 20 10 proc and sys files -- /sys/devices/system/node/online:0,2 /proc/sys/kernel/numa_balancing:1 /sys/devices/system/node/has_cpu: 2 /sys/devices/system/node/has_memory:2 /sys/devices/system/node/has_normal_memory: 2 /sys/devices/system/node/possible: 0-31 v5.6-rc4 + patches -- available: 1 nodes (2) node 2 cpus: 0 1 2 3 4 5 6 7 node 2 size: 32625 MB node 2 free: 31487 MB node distances: node 2 2: 10 proc and sys files -- /sys/devices/system/node/online:2 /proc/sys/kernel/numa_balancing:0 /sys/devices/system/node/has_cpu: 2 /sys/devices/system/node/has_memory:2 /sys/devices/system/node/has_normal_memory: 2 /sys/devices/system/node/possible: 0-31 Cc: linuxppc-dev@lists.ozlabs.org Cc: linux...@kvack.org Cc: linux-ker...@vger.kernel.org Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Christopher Lameter Cc: Michael Ellerman Cc: Andrew Morton Cc: Linus Torvalds Srikar Dronamraju (3): powerpc/numa: Set numa_node for all possible cpus powerpc/numa: Prefer node id queried from vphn mm/page_alloc: Keep memoryless cpuless node 0 offline arch/powerpc/mm/numa.c | 32 ++-- mm/page_alloc.c| 4 +++- 2 files changed, 25 insertions(+), 11 deletions(-) -- 1.8.3.1
[PATCH v2] powerpc test_emulate_step: fix DS operand in ld encoding to appropriate value
ld instruction should have 14 bit immediate field (DS) concatenated with 0b00 on the right, encode it accordingly. Introduce macro `IMM_DS()` to encode DS form instructions with 14 bit immediate field. Fixes: 4ceae137bdab ("powerpc: emulate_step() tests for load/store instructions") Reviewed-by: Sandipan Das Signed-off-by: Balamuruhan S --- arch/powerpc/lib/test_emulate_step.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) --- changes in v2: - * squash the commits as per Christophe's review comment diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 42347067739c..007292a1ad01 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -13,19 +13,20 @@ #include #define IMM_L(i) ((uintptr_t)(i) & 0x) +#define IMM_DS(i) ((uintptr_t)(i) & 0xfffc) /* * Defined with TEST_ prefix so it does not conflict with other * definitions. */ #define TEST_LD(r, base, i)(PPC_INST_LD | ___PPC_RT(r) | \ - ___PPC_RA(base) | IMM_L(i)) + ___PPC_RA(base) | IMM_DS(i)) #define TEST_LWZ(r, base, i) (PPC_INST_LWZ | ___PPC_RT(r) | \ ___PPC_RA(base) | IMM_L(i)) #define TEST_LWZX(t, a, b) (PPC_INST_LWZX | ___PPC_RT(t) | \ ___PPC_RA(a) | ___PPC_RB(b)) #define TEST_STD(r, base, i) (PPC_INST_STD | ___PPC_RS(r) | \ - ___PPC_RA(base) | ((i) & 0xfffc)) + ___PPC_RA(base) | IMM_DS(i)) #define TEST_LDARX(t, a, b, eh)(PPC_INST_LDARX | ___PPC_RT(t) | \ ___PPC_RA(a) | ___PPC_RB(b) | \ __PPC_EH(eh)) base-commit: 5aa19adac1f3152a5fd3b865a1ab46bb845d3696 -- 2.24.1
[PATCH v3 6/6] pseries/sysfs: Minimise IPI noise while reading [idle_][s]purr
From: "Gautham R. Shenoy" Currently purr, spurr, idle_purr, idle_spurr are exposed for every CPU via the sysfs interface /sys/devices/system/cpu/cpuX/[idle_][s]purr. Each sysfs read currently generates an IPI to obtain the desired value from the target CPU X. Since these aforementioned sysfs files are typically read one after another, we end up generating 4 IPIs per CPU in a short duration. In order to minimize the IPI noise, this patch caches the values of all the four entities whenever one of them is read. If subsequently any of these are read within the next 10ms, the cached value is returned. With this, we will generate at most one IPI every 10ms for every CPU. Test-results: While reading the four sysfs files back-to-back for a given CPU every second for 100 seconds. Without the patch: 16 [XICS 2 Edge IPI] = 422 times DBL [Doorbell interrupts] = 13 times Total : 435 IPIs. With the patch: 16 [XICS 2 Edge IPI] = 111 times DBL [Doorbell interrupts] = 17 times Total : 128 IPIs. Signed-off-by: Gautham R. Shenoy --- arch/powerpc/kernel/sysfs.c | 109 1 file changed, 90 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index c9ddb83..db8fc90 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -586,8 +586,6 @@ void ppc_enable_pmcs(void) * SPRs which are not related to PMU. */ #ifdef CONFIG_PPC64 -SYSFS_SPRSETUP(purr, SPRN_PURR); -SYSFS_SPRSETUP(spurr, SPRN_SPURR); SYSFS_SPRSETUP(pir, SPRN_PIR); SYSFS_SPRSETUP(tscr, SPRN_TSCR); @@ -596,8 +594,6 @@ void ppc_enable_pmcs(void) enable write when needed with a separate function. Lets be conservative and default to pseries. */ -static DEVICE_ATTR(spurr, 0400, show_spurr, NULL); -static DEVICE_ATTR(purr, 0400, show_purr, store_purr); static DEVICE_ATTR(pir, 0400, show_pir, NULL); static DEVICE_ATTR(tscr, 0600, show_tscr, store_tscr); #endif /* CONFIG_PPC64 */ @@ -761,39 +757,114 @@ static void create_svm_file(void) } #endif /* CONFIG_PPC_SVM */ -static void read_idle_purr(void *val) +/* + * The duration (in ms) from the last IPI to the target CPU until + * which a cached value of purr, spurr, idle_purr, idle_spurr can be + * reported to the user on a corresponding sysfs file read. Beyond + * this duration, fresh values need to be obtained by sending IPIs to + * the target CPU when the sysfs files are read. + */ +static unsigned long util_stats_staleness_tolerance_ms = 10; +struct util_acct_stats { + u64 latest_purr; + u64 latest_spurr; + u64 latest_idle_purr; + u64 latest_idle_spurr; + unsigned long last_update_jiffies; +}; + +DEFINE_PER_CPU(struct util_acct_stats, util_acct_stats); + +static void update_util_acct_stats(void *ptr) { - u64 *ret = val; + struct util_acct_stats *stats = ptr; - *ret = read_this_idle_purr(); + stats->latest_purr = mfspr(SPRN_PURR); + stats->latest_spurr = mfspr(SPRN_SPURR); + stats->latest_idle_purr = read_this_idle_purr(); + stats->latest_idle_spurr = read_this_idle_spurr(); + stats->last_update_jiffies = jiffies; } -static ssize_t idle_purr_show(struct device *dev, - struct device_attribute *attr, char *buf) +struct util_acct_stats *get_util_stats_ptr(int cpu) +{ + struct util_acct_stats *stats = per_cpu_ptr(&util_acct_stats, cpu); + unsigned long delta_jiffies; + + delta_jiffies = jiffies - stats->last_update_jiffies; + + /* +* If we have a recent enough data, reuse that instead of +* sending an IPI. +*/ + if (jiffies_to_msecs(delta_jiffies) < util_stats_staleness_tolerance_ms) + return stats; + + smp_call_function_single(cpu, update_util_acct_stats, stats, 1); + return stats; +} + +static ssize_t show_purr(struct device *dev, +struct device_attribute *attr, char *buf) { struct cpu *cpu = container_of(dev, struct cpu, dev); - u64 val; + struct util_acct_stats *stats; - smp_call_function_single(cpu->dev.id, read_idle_purr, &val, 1); - return sprintf(buf, "%llx\n", val); + stats = get_util_stats_ptr(cpu->dev.id); + return sprintf(buf, "%llx\n", stats->latest_purr); } -static DEVICE_ATTR(idle_purr, 0400, idle_purr_show, NULL); -static void read_idle_spurr(void *val) +static void write_purr(void *val) { - u64 *ret = val; + mtspr(SPRN_PURR, *(unsigned long *)val); +} - *ret = read_this_idle_spurr(); +static ssize_t __used store_purr(struct device *dev, +struct device_attribute *attr, +const char *buf, size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + unsigned long val; + int ret = kstrtoul(buf, 16,
[PATCH v3 0/6] Track and expose idle PURR and SPURR ticks
From: "Gautham R. Shenoy" Hi, This is the third version of the patches to track and expose idle PURR and SPURR ticks. These patches are required by tools such as lparstat to compute system utilization for capacity planning purposes. The previous versions can be found here: v2: https://lkml.org/lkml/2020/2/21/21 v1: https://lore.kernel.org/patchwork/cover/1159341/ They key changes from v2 are: - The prolog and epilog functions have been named pseries_idle_prolog() and pseries_idle_epilog() respectively to indicate their pseries specific nature. - Fixed the Documentation for /sys/devices/system/cpu/cpuX/idle_spurr as pointed out by Nathan Lynch. - Introduces a patch (Patch 6/6) to send an IPI in order to read and cache the values of purr, spurr, idle_purr and idle_spurr of the target CPU when any one of them is read via sysfs. These cached values will be presented if any of these sysfs are read within the next 10ms. If these sysfs files are read after 10ms from the earlier IPI, a fresh IPI is issued to read and cache the values again. This minimizes the number of IPIs required to be sent when these values are read back-to-back via the sysfs interface. Test-results: While reading the four sysfs files back-to-back for a given CPU every second for 100 seconds. Without patch 6/6 (Without caching): 16 [XICS 2 Edge IPI] = 422 times DBL [Doorbell interrupts] = 13 times Total : 435 IPIs. With patch 6/6 (With caching): 16 [XICS 2 Edge IPI] = 111 times DBL [Doorbell interrupts] = 17 times Total : 128 IPIs. Motivation: === On PSeries LPARs, the data centers planners desire a more accurate view of system utilization per resource such as CPU to plan the system capacity requirements better. Such accuracy can be obtained by reading PURR/SPURR registers for CPU resource utilization. Tools such as lparstat which are used to compute the utilization need to know [S]PURR ticks when the cpu was busy or idle. The [S]PURR counters are already exposed through sysfs. We already account for PURR ticks when we go to idle so that we can update the VPA area. This patchset extends support to account for SPURR ticks when idle, and expose both via per-cpu sysfs files. These patches are required for enhancement to the lparstat utility that compute the CPU utilization based on PURR and SPURR which can be found here : https://groups.google.com/forum/#!topic/powerpc-utils-devel/fYRo69xO9r4 With the patches, when lparstat is run on a LPAR running CPU-Hogs, = sudo ./src/lparstat -E 1 3 System Configuration type=Dedicated mode=Capped smt=8 lcpu=2 mem=4834176 kB cpus=0 ent=2.00 ---Actual--- -Normalized- %busy %idle Frequency %busy %idle -- -- - -- -- 1 99.99 0.00 3.35GHz[111%] 110.99 0.00 2 100.00 0.00 3.35GHz[111%] 111.00 0.00 3 100.00 0.00 3.35GHz[111%] 111.00 0.00 = When lparstat is run on an LPAR that is idle, = $ sudo ./src/lparstat -E 1 3 System Configuration type=Dedicated mode=Capped smt=8 lcpu=2 mem=4834176 kB cpus=0 ent=2.00 ---Actual--- -Normalized- %busy %idle Frequency %busy %idle -- -- - -- -- 1 0.71 99.30 2.18GHz[ 72%] 0.53 71.48 2 0.56 99.44 2.11GHz[ 70%] 0.43 69.57 3 0.54 99.46 2.11GHz[ 70%] 0.43 69.57 = Gautham R. Shenoy (6): powerpc: Move idle_loop_prolog()/epilog() functions to header file powerpc/idle: Add accessor function to always read latest idle PURR powerpc/pseries: Account for SPURR ticks on idle CPUs powerpc/sysfs: Show idle_purr and idle_spurr for every CPU Documentation: Document sysfs interfaces purr, spurr, idle_purr, idle_spurr pseries/sysfs: Minimise IPI noise while reading [idle_][s]purr Documentation/ABI/testing/sysfs-devices-system-cpu | 39 ++ arch/powerpc/include/asm/idle.h| 89 ++ arch/powerpc/kernel/sysfs.c| 133 +++-- arch/powerpc/platforms/pseries/setup.c | 8 +- drivers/cpuidle/cpuidle-pseries.c | 39 ++ 5 files changed, 267 insertions(+), 41 deletions(-) create mode 100644 arch/powerpc/include/asm/idle.h -- 1.9.4
[PATCH v3 1/6] powerpc: Move idle_loop_prolog()/epilog() functions to header file
From: "Gautham R. Shenoy" Currently prior to entering an idle state on a Linux Guest, the pseries cpuidle driver implement an idle_loop_prolog() and idle_loop_epilog() functions which ensure that idle_purr is correctly computed, and the hypervisor is informed that the CPU cycles have been donated. These prolog and epilog functions are also required in the default idle call, i.e pseries_lpar_idle(). Hence move these accessor functions to a common header file and call them from pseries_lpar_idle(). Since the existing header files such as asm/processor.h have enough clutter, create a new header file asm/idle.h. Finally rename idle_loop_prolog() and idle_loop_epilog() to pseries_idle_prolog() and pseries_idle_epilog() as they are only relavent for on pseries guests. Signed-off-by: Gautham R. Shenoy --- arch/powerpc/include/asm/idle.h| 28 ++ arch/powerpc/platforms/pseries/setup.c | 7 +-- drivers/cpuidle/cpuidle-pseries.c | 36 +++--- 3 files changed, 40 insertions(+), 31 deletions(-) create mode 100644 arch/powerpc/include/asm/idle.h diff --git a/arch/powerpc/include/asm/idle.h b/arch/powerpc/include/asm/idle.h new file mode 100644 index 000..e838ea5 --- /dev/null +++ b/arch/powerpc/include/asm/idle.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ASM_POWERPC_IDLE_H +#define _ASM_POWERPC_IDLE_H +#include + +static inline void pseries_idle_prolog(unsigned long *in_purr) +{ + ppc64_runlatch_off(); + *in_purr = mfspr(SPRN_PURR); + /* +* Indicate to the HV that we are idle. Now would be +* a good time to find other work to dispatch. +*/ + get_lppaca()->idle = 1; +} + +static inline void pseries_idle_epilog(unsigned long in_purr) +{ + u64 wait_cycles; + + wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles); + wait_cycles += mfspr(SPRN_PURR) - in_purr; + get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); + get_lppaca()->idle = 0; + + ppc64_runlatch_on(); +} +#endif diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 0c8421d..2f53e6b 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -319,6 +320,8 @@ static int alloc_dispatch_log_kmem_cache(void) static void pseries_lpar_idle(void) { + unsigned long in_purr; + /* * Default handler to go into low thread priority and possibly * low power mode by ceding processor to hypervisor @@ -328,7 +331,7 @@ static void pseries_lpar_idle(void) return; /* Indicate to hypervisor that we are idle. */ - get_lppaca()->idle = 1; + pseries_idle_prolog(&in_purr); /* * Yield the processor to the hypervisor. We return if @@ -339,7 +342,7 @@ static void pseries_lpar_idle(void) */ cede_processor(); - get_lppaca()->idle = 0; + pseries_idle_epilog(in_purr); } /* diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 74c2479..46d5e05 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -19,6 +19,7 @@ #include #include #include +#include #include struct cpuidle_driver pseries_idle_driver = { @@ -31,29 +32,6 @@ struct cpuidle_driver pseries_idle_driver = { static u64 snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; -static inline void idle_loop_prolog(unsigned long *in_purr) -{ - ppc64_runlatch_off(); - *in_purr = mfspr(SPRN_PURR); - /* -* Indicate to the HV that we are idle. Now would be -* a good time to find other work to dispatch. -*/ - get_lppaca()->idle = 1; -} - -static inline void idle_loop_epilog(unsigned long in_purr) -{ - u64 wait_cycles; - - wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles); - wait_cycles += mfspr(SPRN_PURR) - in_purr; - get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); - get_lppaca()->idle = 0; - - ppc64_runlatch_on(); -} - static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -63,7 +41,7 @@ static int snooze_loop(struct cpuidle_device *dev, set_thread_flag(TIF_POLLING_NRFLAG); - idle_loop_prolog(&in_purr); + pseries_idle_prolog(&in_purr); local_irq_enable(); snooze_exit_time = get_tb() + snooze_timeout; @@ -87,7 +65,7 @@ static int snooze_loop(struct cpuidle_device *dev, local_irq_disable(); - idle_loop_epilog(in_purr); + pseries_idle_epilog(in_purr); return index; } @@ -115,7 +93,7 @@ static int dedicated_cede_loop(struct cpuidle_device *dev, { unsigned long in_purr;
[PATCH v3 5/6] Documentation: Document sysfs interfaces purr, spurr, idle_purr, idle_spurr
From: "Gautham R. Shenoy" Add documentation for the following sysfs interfaces: /sys/devices/system/cpu/cpuX/purr /sys/devices/system/cpu/cpuX/spurr /sys/devices/system/cpu/cpuX/idle_purr /sys/devices/system/cpu/cpuX/idle_spurr Signed-off-by: Gautham R. Shenoy --- Documentation/ABI/testing/sysfs-devices-system-cpu | 39 ++ 1 file changed, 39 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 2e0e3b4..bc07677 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -580,3 +580,42 @@ Description: Secure Virtual Machine If 1, it means the system is using the Protected Execution Facility in POWER9 and newer processors. i.e., it is a Secure Virtual Machine. + +What: /sys/devices/system/cpu/cpuX/purr +Date: Apr 2005 +Contact: Linux for PowerPC mailing list +Description: PURR ticks for this CPU since the system boot. + + The Processor Utilization Resources Register (PURR) is + a 64-bit counter which provides an estimate of the + resources used by the CPU thread. The contents of this + register increases monotonically. This sysfs interface + exposes the number of PURR ticks for cpuX. + +What: /sys/devices/system/cpu/cpuX/spurr +Date: Dec 2006 +Contact: Linux for PowerPC mailing list +Description: SPURR ticks for this CPU since the system boot. + + The Scaled Processor Utilization Resources Register + (SPURR) is a 64-bit counter that provides a frequency + invariant estimate of the resources used by the CPU + thread. The contents of this register increases + monotonically. This sysfs interface exposes the number + of SPURR ticks for cpuX. + +What: /sys/devices/system/cpu/cpuX/idle_purr +Date: Mar 2020 +Contact: Linux for PowerPC mailing list +Description: PURR ticks for cpuX when it was idle. + + This sysfs interface exposes the number of PURR ticks + for cpuX when it was idle. + +What: /sys/devices/system/cpu/cpuX/idle_spurr +Date: Mar 2020 +Contact: Linux for PowerPC mailing list +Description: SPURR ticks for cpuX when it was idle. + + This sysfs interface exposes the number of SPURR ticks + for cpuX when it was idle. -- 1.9.4
[PATCH v3 3/6] powerpc/pseries: Account for SPURR ticks on idle CPUs
From: "Gautham R. Shenoy" On Pseries LPARs, to calculate utilization, we need to know the [S]PURR ticks when the CPUs were busy or idle. Via pseries_idle_prolog(), pseries_idle_epilog(), we track the idle PURR ticks in the VPA variable "wait_state_cycles". This patch extends the support to account for the idle SPURR ticks. It also provides an accessor function to accurately reads idle SPURR ticks. Signed-off-by: Gautham R. Shenoy --- arch/powerpc/include/asm/idle.h| 33 + arch/powerpc/platforms/pseries/setup.c | 2 ++ 2 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/idle.h b/arch/powerpc/include/asm/idle.h index 7552823..a375589 100644 --- a/arch/powerpc/include/asm/idle.h +++ b/arch/powerpc/include/asm/idle.h @@ -3,13 +3,20 @@ #define _ASM_POWERPC_IDLE_H #include +DECLARE_PER_CPU(u64, idle_spurr_cycles); DECLARE_PER_CPU(u64, idle_entry_purr_snap); +DECLARE_PER_CPU(u64, idle_entry_spurr_snap); static inline void snapshot_purr_idle_entry(void) { *this_cpu_ptr(&idle_entry_purr_snap) = mfspr(SPRN_PURR); } +static inline void snapshot_spurr_idle_entry(void) +{ + *this_cpu_ptr(&idle_entry_spurr_snap) = mfspr(SPRN_SPURR); +} + static inline void update_idle_purr_accounting(void) { u64 wait_cycles; @@ -20,10 +27,19 @@ static inline void update_idle_purr_accounting(void) get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); } +static inline void update_idle_spurr_accounting(void) +{ + u64 *idle_spurr_cycles_ptr = this_cpu_ptr(&idle_spurr_cycles); + u64 in_spurr = *this_cpu_ptr(&idle_entry_spurr_snap); + + *idle_spurr_cycles_ptr += mfspr(SPRN_SPURR) - in_spurr; +} + static inline void pseries_idle_prolog(void) { ppc64_runlatch_off(); snapshot_purr_idle_entry(); + snapshot_spurr_idle_entry(); /* * Indicate to the HV that we are idle. Now would be * a good time to find other work to dispatch. @@ -34,6 +50,7 @@ static inline void pseries_idle_prolog(void) static inline void pseries_idle_epilog(void) { update_idle_purr_accounting(); + update_idle_spurr_accounting(); get_lppaca()->idle = 0; ppc64_runlatch_on(); } @@ -53,4 +70,20 @@ static inline u64 read_this_idle_purr(void) return be64_to_cpu(get_lppaca()->wait_state_cycles); } + +static inline u64 read_this_idle_spurr(void) +{ + /* +* If we are reading from an idle context, update the +* idle-spurr cycles corresponding to the last idle period. +* Since the idle context is not yet over, take a fresh +* snapshot of the idle-spurr. +*/ + if (get_lppaca()->idle == 1) { + update_idle_spurr_accounting(); + snapshot_spurr_idle_entry(); + } + + return *this_cpu_ptr(&idle_spurr_cycles); +} #endif diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 4905c96..1b55e80 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -318,7 +318,9 @@ static int alloc_dispatch_log_kmem_cache(void) } machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache); +DEFINE_PER_CPU(u64, idle_spurr_cycles); DEFINE_PER_CPU(u64, idle_entry_purr_snap); +DEFINE_PER_CPU(u64, idle_entry_spurr_snap); static void pseries_lpar_idle(void) { /* -- 1.9.4
[PATCH v3 2/6] powerpc/idle: Add accessor function to always read latest idle PURR
From: "Gautham R. Shenoy" Currently when CPU goes idle, we take a snapshot of PURR via pseries_idle_prolog() which is used at the CPU idle exit to compute the idle PURR cycles via the function pseries_idle_epilog(). Thus, the value of idle PURR cycle thus read before pseries_idle_prolog() and after pseries_idle_epilog() is always correct. However, if we were to read the idle PURR cycles from an interrupt context between pseries_idle_prolog() and pseries_idle_epilog() (this will be done in a future patch), then, the value of the idle PURR thus read will not include the cycles spent in the most recent idle period. This patch addresses the issue by providing accessor function to read the idle PURR such such that it includes the cycles spent in the most recent idle period, if we read it between pseries_idle_prolog() and pseries_idle_epilog(). In order to achieve it, the patch saves the snapshot of PURR in pseries_idle_prolog() in a per-cpu variable, instead of on the stack, so that it can be accessed from an interrupt context. Signed-off-by: Gautham R. Shenoy --- arch/powerpc/include/asm/idle.h| 46 +++--- arch/powerpc/platforms/pseries/setup.c | 7 +++--- drivers/cpuidle/cpuidle-pseries.c | 15 +-- 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/idle.h b/arch/powerpc/include/asm/idle.h index e838ea5..7552823 100644 --- a/arch/powerpc/include/asm/idle.h +++ b/arch/powerpc/include/asm/idle.h @@ -3,10 +3,27 @@ #define _ASM_POWERPC_IDLE_H #include -static inline void pseries_idle_prolog(unsigned long *in_purr) +DECLARE_PER_CPU(u64, idle_entry_purr_snap); + +static inline void snapshot_purr_idle_entry(void) +{ + *this_cpu_ptr(&idle_entry_purr_snap) = mfspr(SPRN_PURR); +} + +static inline void update_idle_purr_accounting(void) +{ + u64 wait_cycles; + u64 in_purr = *this_cpu_ptr(&idle_entry_purr_snap); + + wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles); + wait_cycles += mfspr(SPRN_PURR) - in_purr; + get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); +} + +static inline void pseries_idle_prolog(void) { ppc64_runlatch_off(); - *in_purr = mfspr(SPRN_PURR); + snapshot_purr_idle_entry(); /* * Indicate to the HV that we are idle. Now would be * a good time to find other work to dispatch. @@ -14,15 +31,26 @@ static inline void pseries_idle_prolog(unsigned long *in_purr) get_lppaca()->idle = 1; } -static inline void pseries_idle_epilog(unsigned long in_purr) +static inline void pseries_idle_epilog(void) { - u64 wait_cycles; - - wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles); - wait_cycles += mfspr(SPRN_PURR) - in_purr; - get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles); + update_idle_purr_accounting(); get_lppaca()->idle = 0; - ppc64_runlatch_on(); } + +static inline u64 read_this_idle_purr(void) +{ + /* +* If we are reading from an idle context, update the +* idle-purr cycles corresponding to the last idle period. +* Since the idle context is not yet over, take a fresh +* snapshot of the idle-purr. +*/ + if (unlikely(get_lppaca()->idle == 1)) { + update_idle_purr_accounting(); + snapshot_purr_idle_entry(); + } + + return be64_to_cpu(get_lppaca()->wait_state_cycles); +} #endif diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 2f53e6b..4905c96 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -318,10 +318,9 @@ static int alloc_dispatch_log_kmem_cache(void) } machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache); +DEFINE_PER_CPU(u64, idle_entry_purr_snap); static void pseries_lpar_idle(void) { - unsigned long in_purr; - /* * Default handler to go into low thread priority and possibly * low power mode by ceding processor to hypervisor @@ -331,7 +330,7 @@ static void pseries_lpar_idle(void) return; /* Indicate to hypervisor that we are idle. */ - pseries_idle_prolog(&in_purr); + pseries_idle_prolog(); /* * Yield the processor to the hypervisor. We return if @@ -342,7 +341,7 @@ static void pseries_lpar_idle(void) */ cede_processor(); - pseries_idle_epilog(in_purr); + pseries_idle_epilog(); } /* diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 46d5e05..6513ef2 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -36,12 +36,11 @@ static int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - unsigned long in_purr; u64 snooze_exit_time; set_
[PATCH v3 4/6] powerpc/sysfs: Show idle_purr and idle_spurr for every CPU
From: "Gautham R. Shenoy" On Pseries LPARs, to calculate utilization, we need to know the [S]PURR ticks when the CPUs were busy or idle. The total PURR and SPURR ticks are already exposed via the per-cpu sysfs files "purr" and "spurr". This patch adds support for exposing the idle PURR and SPURR ticks via new per-cpu sysfs files named "idle_purr" and "idle_spurr". Signed-off-by: Gautham R. Shenoy --- arch/powerpc/kernel/sysfs.c | 54 ++--- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 479c706..c9ddb83 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "cacheinfo.h" @@ -760,6 +761,42 @@ static void create_svm_file(void) } #endif /* CONFIG_PPC_SVM */ +static void read_idle_purr(void *val) +{ + u64 *ret = val; + + *ret = read_this_idle_purr(); +} + +static ssize_t idle_purr_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + u64 val; + + smp_call_function_single(cpu->dev.id, read_idle_purr, &val, 1); + return sprintf(buf, "%llx\n", val); +} +static DEVICE_ATTR(idle_purr, 0400, idle_purr_show, NULL); + +static void read_idle_spurr(void *val) +{ + u64 *ret = val; + + *ret = read_this_idle_spurr(); +} + +static ssize_t idle_spurr_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + u64 val; + + smp_call_function_single(cpu->dev.id, read_idle_spurr, &val, 1); + return sprintf(buf, "%llx\n", val); +} +static DEVICE_ATTR(idle_spurr, 0400, idle_spurr_show, NULL); + static int register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); @@ -823,10 +860,15 @@ static int register_cpu_online(unsigned int cpu) if (!firmware_has_feature(FW_FEATURE_LPAR)) add_write_permission_dev_attr(&dev_attr_purr); device_create_file(s, &dev_attr_purr); + if (firmware_has_feature(FW_FEATURE_LPAR)) + device_create_file(s, &dev_attr_idle_purr); } - if (cpu_has_feature(CPU_FTR_SPURR)) + if (cpu_has_feature(CPU_FTR_SPURR)) { device_create_file(s, &dev_attr_spurr); + if (firmware_has_feature(FW_FEATURE_LPAR)) + device_create_file(s, &dev_attr_idle_spurr); + } if (cpu_has_feature(CPU_FTR_DSCR)) device_create_file(s, &dev_attr_dscr); @@ -910,11 +952,17 @@ static int unregister_cpu_online(unsigned int cpu) device_remove_file(s, &dev_attr_mmcra); #endif /* CONFIG_PMU_SYSFS */ - if (cpu_has_feature(CPU_FTR_PURR)) + if (cpu_has_feature(CPU_FTR_PURR)) { device_remove_file(s, &dev_attr_purr); + if (firmware_has_feature(FW_FEATURE_LPAR)) + device_remove_file(s, &dev_attr_idle_purr); + } - if (cpu_has_feature(CPU_FTR_SPURR)) + if (cpu_has_feature(CPU_FTR_SPURR)) { device_remove_file(s, &dev_attr_spurr); + if (firmware_has_feature(FW_FEATURE_LPAR)) + device_remove_file(s, &dev_attr_idle_spurr); + } if (cpu_has_feature(CPU_FTR_DSCR)) device_remove_file(s, &dev_attr_dscr); -- 1.9.4
[PATCH V16] mm/debug: Add tests validating architecture page table helpers
This adds tests which will validate architecture page table helpers and other accessors in their compliance with expected generic MM semantics. This will help various architectures in validating changes to existing page table helpers or addition of new ones. This test covers basic page table entry transformations including but not limited to old, young, dirty, clean, write, write protect etc at various level along with populating intermediate entries with next page table page and validating them. Test page table pages are allocated from system memory with required size and alignments. The mapped pfns at page table levels are derived from a real pfn representing a valid kernel text symbol. This test gets called inside kernel_init() right after async_synchronize_full(). This test gets built and run when CONFIG_DEBUG_VM_PGTABLE is selected. Any architecture, which is willing to subscribe this test will need to select ARCH_HAS_DEBUG_VM_PGTABLE. For now this is limited to arc, arm64, x86, s390 and powerpc platforms where the test is known to build and run successfully Going forward, other architectures too can subscribe the test after fixing any build or runtime problems with their page table helpers. Meanwhile for better platform coverage, the test can also be enabled with CONFIG_EXPERT even without ARCH_HAS_DEBUG_VM_PGTABLE. Folks interested in making sure that a given platform's page table helpers conform to expected generic MM semantics should enable the above config which will just trigger this test during boot. Any non conformity here will be reported as an warning which would need to be fixed. This test will help catch any changes to the agreed upon semantics expected from generic MM and enable platforms to accommodate it thereafter. Cc: Andrew Morton Cc: Mike Rapoport Cc: Vineet Gupta Cc: Catalin Marinas Cc: Will Deacon Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Kirill A. Shutemov Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: linux-snps-...@lists.infradead.org Cc: linux-arm-ker...@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s...@vger.kernel.org Cc: linux-ri...@lists.infradead.org Cc: x...@kernel.org Cc: linux-a...@vger.kernel.org Cc: linux-ker...@vger.kernel.org Suggested-by: Catalin Marinas Reviewed-by: Ingo Molnar Tested-by: Gerald Schaefer # s390 Tested-by: Christophe Leroy# ppc32 Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Christophe Leroy Signed-off-by: Anshuman Khandual --- This adds a test validation for architecture exported page table helpers. Patch adds basic transformation tests at various levels of the page table. This test was originally suggested by Catalin during arm64 THP migration RFC discussion earlier. Going forward it can include more specific tests with respect to various generic MM functions like THP, HugeTLB etc and platform specific tests. https://lore.kernel.org/linux-mm/20190628102003.ga56...@arrakis.emea.arm.com/ Needs to be applied on linux V5.6-rc5 Changes in V16: - Replaced WRITE_ONCE() with set_pte_at() with a new barrier() in pte_clear_tests() per Qian - Enabled all powerpc platforms and updated the feature list Changes in V15: (https://patchwork.kernel.org/patch/11422803/) - Replaced __pa() with __pa_symbol() (https://patchwork.kernel.org/patch/11407715/) - Replaced pte_alloc_map() with pte_alloc_map_lock() per Qian - Replaced pte_unmap() with pte_unmap_unlock() per Qian - Added address to pte_clear_tests() and passed it down till pte_clear() per Qian Changes in V14: (https://patchwork.kernel.org/project/linux-mm/list/?series=242305) - Disabled DEBUG_VM_PGTABLE for IA64 and ARM (32 Bit) per Andrew and Christophe - Updated DEBUG_VM_PGTABLE documentation wrt EXPERT and disabled platforms - Updated RANDOM_[OR|NZ]VALUE open encodings with GENMASK() per Catalin - Updated s390 constraint bits from 12 to 4 (S390_MASK_BITS) per Gerald - Updated in-code documentation for RANDOM_ORVALUE per Gerald - Updated pxx_basic_tests() to use invert functions first per Catalin - Dropped ARCH_HAS_4LEVEL_HACK check from pud_basic_tests() - Replaced __ARCH_HAS_[4|5]LEVEL_HACK with __PAGETABLE_[PUD|P4D]_FOLDED per Catalin - Trimmed the CC list on the commit message per Catalin Changes in V13: (https://patchwork.kernel.org/project/linux-mm/list/?series=237125) - Subscribed s390 platform and updated debug-vm-pgtable/arch-support.txt per Gerald - Dropped keyword 'extern' from debug_vm_pgtable() declaration per Christophe - Moved debug_vm_pgtable() declarations to per Christophe - Moved debug_vm_pgtable() call site into kernel_init() per Christophe - Changed CONFIG_DEBUG_VM_PGTABLE rules per Christophe - Updated commit to include new supported platforms and changed config selection Changes in V12: (https://patchwork.kernel.org/project/linux
Re: [PATCH -next 017/491] CELL BROADBAND ENGINE ARCHITECTURE: Use fallthrough;
On Wed, Mar 11, 2020 at 6:07 AM Joe Perches wrote: > > Convert the various uses of fallthrough comments to fallthrough; > > Done via script > Link: > https://lore.kernel.org/lkml/b56602fcf79f849e733e7b521bb0e17895d390fa.1582230379.git.joe.com/ > > Signed-off-by: Joe Perches Acked-by: Arnd Bergmann
[PATCH] hwmon: (ibmpowernv) Use scnprintf() for avoiding potential buffer overflow
Since snprintf() returns the would-be-output size instead of the actual output size, the succeeding calls may go beyond the given buffer limit. Fix it by replacing with scnprintf(). Signed-off-by: Takashi Iwai --- drivers/hwmon/ibmpowernv.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c index 0e525cfbdfc5..a750647e66a4 100644 --- a/drivers/hwmon/ibmpowernv.c +++ b/drivers/hwmon/ibmpowernv.c @@ -186,7 +186,7 @@ static void make_sensor_label(struct device_node *np, u32 id; size_t n; - n = snprintf(sdata->label, sizeof(sdata->label), "%s", label); + n = scnprintf(sdata->label, sizeof(sdata->label), "%s", label); /* * Core temp pretty print @@ -199,11 +199,11 @@ static void make_sensor_label(struct device_node *np, * The digital thermal sensors are associated * with a core. */ - n += snprintf(sdata->label + n, + n += scnprintf(sdata->label + n, sizeof(sdata->label) - n, " %d", cpuid); else - n += snprintf(sdata->label + n, + n += scnprintf(sdata->label + n, sizeof(sdata->label) - n, " phy%d", id); } @@ -211,7 +211,7 @@ static void make_sensor_label(struct device_node *np, * Membuffer pretty print */ if (!of_property_read_u32(np, "ibm,chip-id", &id)) - n += snprintf(sdata->label + n, sizeof(sdata->label) - n, + n += scnprintf(sdata->label + n, sizeof(sdata->label) - n, " %d", id & 0x); } -- 2.16.4
Re: [PATCH 1/2] powerpc test_emulate_step: fix DS operand in ld encoding to appropriate value
On Wed, 2020-03-11 at 08:01 +0100, Christophe Leroy wrote: > > Le 11/03/2020 à 07:14, Balamuruhan S a écrit : > > ld instruction should have 14 bit immediate field (DS) concatenated > > with > > 0b00 on the right, encode it accordingly. > > > > Fixes: 4ceae137bdab ("powerpc: emulate_step() tests for load/store > > instructions") > > Reviewed-by: Sandipan Das > > Signed-off-by: Balamuruhan S > > --- > > arch/powerpc/lib/test_emulate_step.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/arch/powerpc/lib/test_emulate_step.c > > b/arch/powerpc/lib/test_emulate_step.c > > index 42347067739c..51c254fd15b5 100644 > > --- a/arch/powerpc/lib/test_emulate_step.c > > +++ b/arch/powerpc/lib/test_emulate_step.c > > @@ -19,7 +19,7 @@ > >* definitions. > >*/ > > #define TEST_LD(r, base, i) (PPC_INST_LD | ___PPC_RT(r) | > > \ > > - ___PPC_RA(base) | IMM_L(i)) > > + ___PPC_RA(base) | ((i) & > > 0xfffc)) > > I think you should squash patch 1 and 2 together. Or at least you > should > put the new IMM_DS macro in patch 1 and use it instead of open > coding. sure, I will make the changes as suggested. -- Bala > > > #define TEST_LWZ(r, base, i) (PPC_INST_LWZ | ___PPC_RT(r) | > > \ > > ___PPC_RA(base) | IMM_L(i)) > > #define TEST_LWZX(t, a, b)(PPC_INST_LWZX | ___PPC_RT(t) | > > \ > > > > base-commit: 5aa19adac1f3152a5fd3b865a1ab46bb845d3696 > > > > Christophe
Re: [PATCH 1/2] powerpc test_emulate_step: fix DS operand in ld encoding to appropriate value
Le 11/03/2020 à 07:14, Balamuruhan S a écrit : ld instruction should have 14 bit immediate field (DS) concatenated with 0b00 on the right, encode it accordingly. Fixes: 4ceae137bdab ("powerpc: emulate_step() tests for load/store instructions") Reviewed-by: Sandipan Das Signed-off-by: Balamuruhan S --- arch/powerpc/lib/test_emulate_step.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 42347067739c..51c254fd15b5 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -19,7 +19,7 @@ * definitions. */ #define TEST_LD(r, base, i) (PPC_INST_LD | ___PPC_RT(r) | \ - ___PPC_RA(base) | IMM_L(i)) + ___PPC_RA(base) | ((i) & 0xfffc)) I think you should squash patch 1 and 2 together. Or at least you should put the new IMM_DS macro in patch 1 and use it instead of open coding. #define TEST_LWZ(r, base, i) (PPC_INST_LWZ | ___PPC_RT(r) | \ ___PPC_RA(base) | IMM_L(i)) #define TEST_LWZX(t, a, b)(PPC_INST_LWZX | ___PPC_RT(t) | \ base-commit: 5aa19adac1f3152a5fd3b865a1ab46bb845d3696 Christophe