[PATCH bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

2023-10-12 Thread Artem Savkov
linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
for lazy preemption")) that adds an extra member to struct trace_entry.
This causes the offset of the args field in struct trace_event_raw_sys_enter
to be different from the one in struct syscall_trace_enter:

struct trace_event_raw_sys_enter {
	struct trace_entry ent;          /*  0  12 */

	/* XXX last struct has 3 bytes of padding */
	/* XXX 4 bytes hole, try to pack */

	long int           id;           /* 16   8 */
	long unsigned int  args[6];      /* 24  48 */
	/* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
	char               __data[];     /* 72   0 */

	/* size: 72, cachelines: 2, members: 4 */
	/* sum members: 68, holes: 1, sum holes: 4 */
	/* paddings: 1, sum paddings: 3 */
	/* last cacheline: 8 bytes */
};

struct syscall_trace_enter {
	struct trace_entry ent;          /*  0  12 */

	/* XXX last struct has 3 bytes of padding */

	int                nr;           /* 12   4 */
	long unsigned int  args[];       /* 16   0 */

	/* size: 16, cachelines: 1, members: 3 */
	/* paddings: 1, sum paddings: 3 */
	/* last cacheline: 16 bytes */
};

This, in turn, causes perf_event_set_bpf_prog() to fail while running the bpf
test_profiler testcase, because max_ctx_offset is calculated based on the
former struct while off is based on the latter:

  10488 if (is_tracepoint || is_syscall_tp) {
  10489 int off = trace_event_get_offsets(event->tp_event);
  10490
  10491 if (prog->aux->max_ctx_offset > off)
  10492 return -EACCES;
  10493 }

What the bpf program actually gets is a pointer to struct
syscall_tp_t, defined in kernel/trace/trace_syscalls.c. This patch fixes
the problem by aligning struct syscall_tp_t with struct
syscall_trace_(enter|exit) and changing the tests to use these structs
to dereference the context.
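
For illustration, a minimal sketch (a hypothetical program, not from the
patch; struct trace_event_raw_sys_enter comes from vmlinux.h) that trips
the check above under the rt-devel layout quoted earlier:

    SEC("tracepoint/syscalls/sys_enter_kill")
    int probe(struct trace_event_raw_sys_enter *ctx)
    {
            /* args[5] ends at offset 24 + 6*8 = 72, so the verifier records
             * max_ctx_offset = 72, while trace_event_get_offsets() for the
             * actual syscall event returns 16 + 6*8 = 64; the check above
             * then rejects the program with -EACCES. */
            return (int)ctx->args[5];
    }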

Signed-off-by: Artem Savkov 
Acked-by: Steven Rostedt (Google) 

---
 kernel/trace/trace_syscalls.c| 4 ++--
 tools/testing/selftests/bpf/progs/profiler.inc.h | 2 +-
 tools/testing/selftests/bpf/progs/test_vmlinux.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index de753403cdafb..9c581d6da843a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -556,7 +556,7 @@ static int perf_call_bpf_enter(struct trace_event_call 
*call, struct pt_regs *re
 {
struct syscall_tp_t {
struct trace_entry ent;
-   unsigned long syscall_nr;
+   int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
int i;
@@ -661,7 +661,7 @@ static int perf_call_bpf_exit(struct trace_event_call 
*call, struct pt_regs *reg
 {
struct syscall_tp_t {
struct trace_entry ent;
-   unsigned long syscall_nr;
+   int syscall_nr;
unsigned long ret;
} __aligned(8) param;
 
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h 
b/tools/testing/selftests/bpf/progs/profiler.inc.h
index f799d87e87002..897061930cb76 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -609,7 +609,7 @@ ssize_t BPF_KPROBE(kprobe__proc_sys_write,
 }
 
 SEC("tracepoint/syscalls/sys_enter_kill")
-int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx)
+int tracepoint__syscalls__sys_enter_kill(struct syscall_trace_enter* ctx)
 {
struct bpf_func_stats_ctx stats_ctx;
 
diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c 
b/tools/testing/selftests/bpf/progs/test_vmlinux.c
index 4b8e37f7fd06c..78b23934d9f8f 100644
--- a/tools/testing/selftests/bpf/progs/test_vmlinux.c
+++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c
@@ -16,12 +16,12 @@ bool kprobe_called = false;
 bool fentry_called = false;
 
 SEC("tp/syscalls/sys_enter_nanosleep")
-int handle__tp(struct trace_event_raw_sys_enter *args)
+int handle__tp(struct syscall_trace_enter *args)
 {
struct __kernel_timespec *ts;
long tv_nsec;
 
-   if (args->id != __NR_nanosleep)
+   if (args->nr != __NR_nanosleep)
return 0;
 
ts = (void *)args->args[0];
-- 
2.41.0




[PATCH v6 2/4] dts: zynqmp: add properties for TCM in remoteproc

2023-10-12 Thread Tanmay Shah
Add properties as per new bindings in zynqmp remoteproc node
to represent TCM address and size.

This patch also adds an alternative remoteproc node to represent
the remoteproc cluster in split mode. By default lockstep mode is
enabled, and users should disable it before using the split mode
dts. Both device-tree nodes can't be used simultaneously; one
of them must be disabled. For the zcu102-1.0 and zcu102-1.1 boards
the remoteproc split mode dts node is enabled and the lockstep mode
dts is disabled.

Signed-off-by: Tanmay Shah 
---

Changes in v6:
  - Introduce new node entry for r5f cluster split mode dts and
keep it disabled by default.
  - Keep remoteproc lockstep mode enabled by default to maintain
    backward compatibility.
  - Enable split mode only for the zcu102 board to demo split mode use

 .../boot/dts/xilinx/zynqmp-zcu102-rev1.0.dts  |  8 +++
 arch/arm64/boot/dts/xilinx/zynqmp.dtsi| 60 +--
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/boot/dts/xilinx/zynqmp-zcu102-rev1.0.dts 
b/arch/arm64/boot/dts/xilinx/zynqmp-zcu102-rev1.0.dts
index c8f71a1aec89..495ca94b45db 100644
--- a/arch/arm64/boot/dts/xilinx/zynqmp-zcu102-rev1.0.dts
+++ b/arch/arm64/boot/dts/xilinx/zynqmp-zcu102-rev1.0.dts
@@ -14,6 +14,14 @@ / {
compatible = "xlnx,zynqmp-zcu102-rev1.0", "xlnx,zynqmp-zcu102", 
"xlnx,zynqmp";
 };
 
+&rproc_split {
+   status = "okay";
+};
+
+&rproc_lockstep {
+   status = "disabled";
+};
+
 &eeprom {
    #address-cells = <1>;
    #size-cells = <1>;
diff --git a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi 
b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
index b61fc99cd911..602e6aba7ac5 100644
--- a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
+++ b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
@@ -247,19 +247,69 @@ fpga_full: fpga-full {
ranges;
};
 
-   remoteproc {
+   rproc_lockstep: remoteproc@ffe00000 {
        compatible = "xlnx,zynqmp-r5fss";
        xlnx,cluster-mode = <1>;
 
-       r5f-0 {
+       #address-cells = <2>;
+       #size-cells = <2>;
+
+       ranges = <0x0 0x0 0x0 0xffe00000 0x0 0x20000>,
+                <0x0 0x20000 0x0 0xffe20000 0x0 0x20000>,
+                <0x1 0x0 0x0 0xffe90000 0x0 0x10000>,
+                <0x1 0x20000 0x0 0xffeb0000 0x0 0x10000>;
+
+       r5f@0 {
+           compatible = "xlnx,zynqmp-r5f";
+           reg = <0x0 0x0 0x0 0x20000>, <0x0 0x20000 0x0 0x20000>;
+           reg-names = "atcm", "btcm";
+           power-domains = <&zynqmp_firmware PD_RPU_0>,
+                           <&zynqmp_firmware PD_R5_0_ATCM>,
+                           <&zynqmp_firmware PD_R5_0_BTCM>;
+           memory-region = <&rproc_0_fw_image>;
+       };
+
+       r5f@1 {
+           compatible = "xlnx,zynqmp-r5f";
+           reg = <0x1 0x0 0x0 0x10000>, <0x1 0x20000 0x0 0x10000>;
+           reg-names = "atcm", "btcm";
+           power-domains = <&zynqmp_firmware PD_RPU_1>,
+                           <&zynqmp_firmware PD_R5_1_ATCM>,
+                           <&zynqmp_firmware PD_R5_1_BTCM>;
+           memory-region = <&rproc_1_fw_image>;
+       };
+   };
+
+   rproc_split: remoteproc-split@ffe00000 {
+       status = "disabled";
+       compatible = "xlnx,zynqmp-r5fss";
+       xlnx,cluster-mode = <0>;
+
+       #address-cells = <2>;
+       #size-cells = <2>;
+
+       ranges = <0x0 0x0 0x0 0xffe00000 0x0 0x10000>,
+                <0x0 0x20000 0x0 0xffe20000 0x0 0x10000>,
+                <0x1 0x0 0x0 0xffe90000 0x0 0x10000>,
+                <0x1 0x20000 0x0 0xffeb0000 0x0 0x10000>;
+
+       r5f@0 {
            compatible = "xlnx,zynqmp-r5f";
-           power-domains = <&zynqmp_firmware PD_RPU_0>;
+           reg = <0x0 0x0 0x0 0x10000>, <0x0 0x20000 0x0 0x10000>;
+           reg-names = "atcm", "btcm";
+           power-domains = <&zynqmp_firmware PD_RPU_0>,
+                           <&zynqmp_firmware PD_R5_0_ATCM>,
+                           <&zynqmp_firmware PD_R5_0_BTCM>;
            memory-region = <&rproc_0_fw_image>;
        };
 
-       r5f-1 {
+       r5f@1 {
            compatible = "xlnx,zynqmp-r5f";
-           power-domains = <&zynqmp_firmware PD_RPU_1>;
+           reg = <0x1 0x0 0x0 0x10000>, <0x1 0x20000 0x0 0x10000>;
+           reg-names = "atcm", "btcm";
+           power-domains = <&zynqmp_firmware PD_RPU_1>,
+                           <&zynqmp_firmware PD_R5_1_ATCM>,
+                           <&zynqmp_firmware PD_R5_1_BTCM>;
            memory-region = <&rproc_1_fw_image>;
        };
    };
-- 
2.25.1



[PATCH v6 4/4] remoteproc: zynqmp: parse TCM from device tree

2023-10-12 Thread Tanmay Shah
ZynqMP TCM information was hardcoded in the driver. Now that TCM
information is available in the device-tree, parse it in the driver
as per the new bindings.

Signed-off-by: Tanmay Shah 
---

Changes in v6:
  - Add missing . at the end of the commit message
  - remove redundant initialization of variables
  - remove fail_tcm label and relevant code to free memory
    acquired using devm_* API, as it will be freed when the device is freed
  - add extra check to see if "reg" property is supported or not

 drivers/remoteproc/xlnx_r5_remoteproc.c | 106 ++--
 1 file changed, 98 insertions(+), 8 deletions(-)

diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c 
b/drivers/remoteproc/xlnx_r5_remoteproc.c
index 04e95d880184..8c3575970c1d 100644
--- a/drivers/remoteproc/xlnx_r5_remoteproc.c
+++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
@@ -75,8 +75,8 @@ struct mbox_info {
 };
 
 /*
- * Hardcoded TCM bank values. This will be removed once TCM bindings are
- * accepted for system-dt specifications and upstreamed in linux kernel
+ * Hardcoded TCM bank values. This will stay in driver to maintain backward
+ * compatibility with device-tree that does not have TCM information.
  */
 static const struct mem_bank_data zynqmp_tcm_banks_split[] = {
	{0xffe00000UL, 0x0, 0x10000UL, PD_R5_0_ATCM, "atcm0"}, /* TCM 64KB each */
@@ -613,7 +613,8 @@ static int add_tcm_carveout_split_mode(struct rproc *rproc)
 bank_name);
if (!rproc_mem) {
ret = -ENOMEM;
-   zynqmp_pm_release_node(pm_domain_id);
+   if (pm_domain_id)
+   zynqmp_pm_release_node(pm_domain_id);
goto release_tcm_split;
}
 
@@ -626,7 +627,8 @@ static int add_tcm_carveout_split_mode(struct rproc *rproc)
/* If failed, Turn off all TCM banks turned on before */
for (i--; i >= 0; i--) {
pm_domain_id = r5_core->tcm_banks[i]->pm_domain_id;
-   zynqmp_pm_release_node(pm_domain_id);
+   if (pm_domain_id)
+   zynqmp_pm_release_node(pm_domain_id);
}
return ret;
 }
@@ -940,6 +942,8 @@ static int zynqmp_r5_add_pm_domains(struct rproc *rproc)
}
}
 
+   return 0;
+
 fail_add_pm_domains_lockstep:
while (j >= 1) {
if (r5_core->pm_dev_link2 && 
!IS_ERR_OR_NULL(r5_core->pm_dev_link2[j]))
@@ -1102,6 +1106,83 @@ static struct zynqmp_r5_core 
*zynqmp_r5_add_rproc_core(struct device *cdev)
return ERR_PTR(ret);
 }
 
+static int zynqmp_r5_get_tcm_node_from_dt(struct zynqmp_r5_cluster *cluster)
+{
+   struct zynqmp_r5_core *r5_core;
+   int i, j, tcm_bank_count, ret;
+   struct platform_device *cpdev;
+   struct mem_bank_data *tcm;
+   struct device_node *np;
+   struct resource *res;
+   u64 abs_addr, size;
+   struct device *dev;
+
+   for (i = 0; i < cluster->core_count; i++) {
+   r5_core = cluster->r5_cores[i];
+   dev = r5_core->dev;
+   np = dev_of_node(dev);
+
+   /* we have address cell 2 and size cell as 2 */
+   ret = of_property_count_elems_of_size(np, "reg",
+ 4 * sizeof(u32));
+   if (ret <= 0) {
+   dev_err(dev, "can't get reg property err %d\n", ret);
+   return -EINVAL;
+   }
+
+   tcm_bank_count = ret;
+
+   r5_core->tcm_banks = devm_kcalloc(dev, tcm_bank_count,
+ sizeof(struct mem_bank_data 
*),
+ GFP_KERNEL);
+   if (!r5_core->tcm_banks)
+   ret = -ENOMEM;
+
+   r5_core->tcm_bank_count = tcm_bank_count;
+   for (j = 0; j < tcm_bank_count; j++) {
+   tcm = devm_kzalloc(dev, sizeof(struct mem_bank_data *),
+  GFP_KERNEL);
+   if (!tcm)
+   return -ENOMEM;
+
+   r5_core->tcm_banks[j] = tcm;
+
+   /* get tcm address without translation */
+   ret = of_property_read_reg(np, j, &abs_addr, &size);
+   if (ret) {
+   dev_err(dev, "failed to get reg property\n");
+   return ret;
+   }
+
+   /*
+* remote processor can address only 32 bits
+* so convert 64-bits into 32-bits. This will discard
+* any unwanted upper 32-bits.
+*/
+   tcm->da = (u32)abs_addr;
+   tcm->size = (u32)size;
+
+   cpdev = to_platform_device(dev);
+  

[PATCH v6 1/4] dt-bindings: remoteproc: add Tightly Coupled Memory (TCM) bindings

2023-10-12 Thread Tanmay Shah
From: Radhey Shyam Pandey 

Introduce bindings for the TCM memory address space on the AMD-Xilinx Zynq
UltraScale+ platform. It will help in defining TCM in the device-tree
and make its access platform-agnostic and data-driven.

Tightly-coupled memories (TCMs) are low-latency memories that provide
predictable instruction execution and predictable data load/store
timing. Each Cortex-R5F processor contains two 64-bit wide 64 KB memory
banks on the ATCM and BTCM ports, for a total of 128 KB of memory.

The TCM resources (reg, reg-names and power-domains) are documented for
each TCM in the R5 node. reg and reg-names are made required properties
because we don't want to hardcode TCM addresses for future platforms;
for the zu+ legacy implementation, the driver will ensure that old dts
files without reg/reg-names keep working and a stable ABI is maintained.

It also extends the examples for TCM split and lockstep modes.

Signed-off-by: Radhey Shyam Pandey 
Signed-off-by: Tanmay Shah 
Acked-by: Rob Herring 
---
 .../remoteproc/xlnx,zynqmp-r5fss.yaml | 131 +++---
 1 file changed, 113 insertions(+), 18 deletions(-)

diff --git 
a/Documentation/devicetree/bindings/remoteproc/xlnx,zynqmp-r5fss.yaml 
b/Documentation/devicetree/bindings/remoteproc/xlnx,zynqmp-r5fss.yaml
index 78aac69f1060..9ecd63ea1b38 100644
--- a/Documentation/devicetree/bindings/remoteproc/xlnx,zynqmp-r5fss.yaml
+++ b/Documentation/devicetree/bindings/remoteproc/xlnx,zynqmp-r5fss.yaml
@@ -20,6 +20,17 @@ properties:
   compatible:
 const: xlnx,zynqmp-r5fss
 
+  "#address-cells":
+const: 2
+
+  "#size-cells":
+const: 2
+
+  ranges:
+description: |
+  Standard ranges definition providing address translations for
+  local R5F TCM address spaces to bus addresses.
+
   xlnx,cluster-mode:
 $ref: /schemas/types.yaml#/definitions/uint32
 enum: [0, 1, 2]
@@ -37,7 +48,7 @@ properties:
   2: single cpu mode
 
 patternProperties:
-  "^r5f-[a-f0-9]+$":
+  "^r5f@[0-9a-f]+$":
 type: object
 description: |
   The RPU is located in the Low Power Domain of the Processor Subsystem.
@@ -54,8 +65,19 @@ patternProperties:
   compatible:
 const: xlnx,zynqmp-r5f
 
+  reg:
+items:
+  - description: ATCM internal memory region
+  - description: BTCM internal memory region
+
+  reg-names:
+items:
+  - const: atcm
+  - const: btcm
+
   power-domains:
-maxItems: 1
+minItems: 1
+maxItems: 3
 
   mboxes:
 minItems: 1
@@ -102,34 +124,107 @@ patternProperties:
 required:
   - compatible
   - power-domains
+  - reg
+  - reg-names
 
 unevaluatedProperties: false
 
 required:
   - compatible
+  - "#address-cells"
+  - "#size-cells"
+  - ranges
 
 additionalProperties: false
 
 examples:
   - |
-    remoteproc {
-        compatible = "xlnx,zynqmp-r5fss";
-        xlnx,cluster-mode = <1>;
-
-        r5f-0 {
-            compatible = "xlnx,zynqmp-r5f";
-            power-domains = <&zynqmp_firmware 0x7>;
-            memory-region = <&rproc_0_fw_image>, <&rpu0vdev0buffer>,
-                            <&rpu0vdev0vring0>, <&rpu0vdev0vring1>;
-            mboxes = <&ipi_mailbox_rpu0 0>, <&ipi_mailbox_rpu0 1>;
-            mbox-names = "tx", "rx";
+    #include <dt-bindings/power/xlnx-zynqmp-power.h>
+
+    //Split mode configuration
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        remoteproc@ffe00000 {
+            compatible = "xlnx,zynqmp-r5fss";
+            xlnx,cluster-mode = <0>;
+
+            #address-cells = <2>;
+            #size-cells = <2>;
+            ranges = <0x0 0x0 0x0 0xffe00000 0x0 0x10000>,
+                     <0x0 0x20000 0x0 0xffe20000 0x0 0x10000>,
+                     <0x1 0x0 0x0 0xffe90000 0x0 0x10000>,
+                     <0x1 0x20000 0x0 0xffeb0000 0x0 0x10000>;
+
+            r5f@0 {
+                compatible = "xlnx,zynqmp-r5f";
+                reg = <0x0 0x0 0x0 0x10000>, <0x0 0x20000 0x0 0x10000>;
+                reg-names = "atcm", "btcm";
+                power-domains = <&zynqmp_firmware PD_RPU_0>,
+                                <&zynqmp_firmware PD_R5_0_ATCM>,
+                                <&zynqmp_firmware PD_R5_0_BTCM>;
+                memory-region = <&rproc_0_fw_image>, <&rpu0vdev0buffer>,
+                                <&rpu0vdev0vring0>, <&rpu0vdev0vring1>;
+                mboxes = <&ipi_mailbox_rpu0 0>, <&ipi_mailbox_rpu0 1>;
+                mbox-names = "tx", "rx";
+            };
+
+            r5f@1 {
+                compatible = "xlnx,zynqmp-r5f";
+                reg = <0x1 0x0 0x0 0x10000>, <0x1 0x20000 0x0 0x10000>;
+                reg-names = "atcm", "btcm";
+                power-domains = <&zynqmp_firmware PD_RPU_1>,
+                                <&zynqmp_firmware PD_R5_1_ATCM>,
+                                <&zynqmp_firmware PD_R5_1_BTCM>;
+                memory-region = <&rproc_1_fw_image>, <&rpu1vdev0buffer>,
+                                <&rpu1vdev0vring0>, <&rpu1vdev0vring1>;
+                mboxes = <&ipi_mailbox_rpu1 0>, <&ipi_mailbox_rpu1 1>;
+                mbox-names = "tx", "rx";
+            };
+        };
+    };
 
-        r5f-1 {
-            compatible = "xlnx,zynqmp-r5f";
-

[PATCH v6 3/4] remoteproc: zynqmp: add pm domains support

2023-10-12 Thread Tanmay Shah
Use the TCM power domains extracted from the device-tree
to power TCM on/off using the general pm domain framework.
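
For context, attaching multiple power-domains with the genpd framework
typically follows the pattern below. This is a minimal sketch; the helper
name and parameters are illustrative assumptions, not the patch's code:

    #include <linux/pm_domain.h>
    #include <linux/device.h>

    static int r5_attach_pm_domains(struct device *dev, struct device **pd_devs,
                                    struct device_link **links, int num)
    {
            int i;

            for (i = 0; i < num; i++) {
                    /* attach the i-th "power-domains" entry of the node */
                    pd_devs[i] = dev_pm_domain_attach_by_id(dev, i);
                    if (IS_ERR(pd_devs[i]))
                            return PTR_ERR(pd_devs[i]);
                    /* link it so the core's runtime PM powers the TCM bank */
                    links[i] = device_link_add(dev, pd_devs[i],
                                               DL_FLAG_STATELESS |
                                               DL_FLAG_PM_RUNTIME |
                                               DL_FLAG_RPM_ACTIVE);
                    if (!links[i])
                            return -EINVAL;
            }
            return 0;
    }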

Signed-off-by: Tanmay Shah 
---

Changes in v6:
  - Remove spurious change
  - Handle errors in add_pm_domains function
  - Remove redundant code to handle errors from remove_pm_domains

 drivers/remoteproc/xlnx_r5_remoteproc.c | 262 ++--
 1 file changed, 243 insertions(+), 19 deletions(-)

diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c 
b/drivers/remoteproc/xlnx_r5_remoteproc.c
index 4395edea9a64..04e95d880184 100644
--- a/drivers/remoteproc/xlnx_r5_remoteproc.c
+++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "remoteproc_internal.h"
 
@@ -102,6 +103,12 @@ static const struct mem_bank_data 
zynqmp_tcm_banks_lockstep[] = {
  * @rproc: rproc handle
  * @pm_domain_id: RPU CPU power domain id
  * @ipi: pointer to mailbox information
+ * @num_pm_dev: number of tcm pm domain devices for this core
+ * @pm_dev1: pm domain virtual devices for power domain framework
+ * @pm_dev_link1: pm domain device links after registration
+ * @pm_dev2: used only in lockstep mode. second core's pm domain virtual 
devices
+ * @pm_dev_link2: used only in lockstep mode. second core's pm device links 
after
+ * registration
  */
 struct zynqmp_r5_core {
struct device *dev;
@@ -111,6 +118,11 @@ struct zynqmp_r5_core {
struct rproc *rproc;
u32 pm_domain_id;
struct mbox_info *ipi;
+   int num_pm_dev;
+   struct device **pm_dev1;
+   struct device_link **pm_dev_link1;
+   struct device **pm_dev2;
+   struct device_link **pm_dev_link2;
 };
 
 /**
@@ -575,12 +587,21 @@ static int add_tcm_carveout_split_mode(struct rproc 
*rproc)
bank_size = r5_core->tcm_banks[i]->size;
pm_domain_id = r5_core->tcm_banks[i]->pm_domain_id;
 
-   ret = zynqmp_pm_request_node(pm_domain_id,
-ZYNQMP_PM_CAPABILITY_ACCESS, 0,
-ZYNQMP_PM_REQUEST_ACK_BLOCKING);
-   if (ret < 0) {
-   dev_err(dev, "failed to turn on TCM 0x%x", 
pm_domain_id);
-   goto release_tcm_split;
+   /*
+* If TCM information is available in device-tree then
+* in that case, pm domain framework will power on/off TCM.
+* In that case pm_domain_id is set to 0. If hardcode
+* bindings from driver is used, then only this driver will
+* use pm_domain_id.
+*/
+   if (pm_domain_id) {
+   ret = zynqmp_pm_request_node(pm_domain_id,
+
ZYNQMP_PM_CAPABILITY_ACCESS, 0,
+
ZYNQMP_PM_REQUEST_ACK_BLOCKING);
+   if (ret < 0) {
+   dev_err(dev, "failed to turn on TCM 0x%x", 
pm_domain_id);
+   goto release_tcm_split;
+   }
}
 
dev_dbg(dev, "TCM carveout split mode %s addr=%llx, da=0x%x, 
size=0x%lx",
@@ -646,13 +667,16 @@ static int add_tcm_carveout_lockstep_mode(struct rproc 
*rproc)
for (i = 0; i < num_banks; i++) {
pm_domain_id = r5_core->tcm_banks[i]->pm_domain_id;
 
-   /* Turn on each TCM bank individually */
-   ret = zynqmp_pm_request_node(pm_domain_id,
-ZYNQMP_PM_CAPABILITY_ACCESS, 0,
-ZYNQMP_PM_REQUEST_ACK_BLOCKING);
-   if (ret < 0) {
-   dev_err(dev, "failed to turn on TCM 0x%x", 
pm_domain_id);
-   goto release_tcm_lockstep;
+   if (pm_domain_id) {
+   /* Turn on each TCM bank individually */
+   ret = zynqmp_pm_request_node(pm_domain_id,
+
ZYNQMP_PM_CAPABILITY_ACCESS, 0,
+
ZYNQMP_PM_REQUEST_ACK_BLOCKING);
+   if (ret < 0) {
+   dev_err(dev, "failed to turn on TCM 0x%x",
+   pm_domain_id);
+   goto release_tcm_lockstep;
+   }
}
 
bank_size = r5_core->tcm_banks[i]->size;
@@ -687,7 +711,8 @@ static int add_tcm_carveout_lockstep_mode(struct rproc 
*rproc)
/* If failed, Turn off all TCM banks turned on before */
for (i--; i >= 0; i--) {
pm_domain_id = r5_core->tcm_banks[i]->pm_domain_id;
-   zynqmp_pm_release_node(pm_domain_id);
+   if (pm_domain_id)
+   zynqmp_pm_release_node(pm_domain_id);
}
return ret;
 }
@@ -758,6 

[PATCH v6 0/4] add zynqmp TCM bindings

2023-10-12 Thread Tanmay Shah
Tightly-Coupled Memories (TCMs) are low-latency memories that provide
predictable instruction execution and predictable data load/store
timing. Each Cortex-R5F processor contains two exclusive 64 KB memory
banks on the ATCM and BTCM ports, for a total of 128 KB of memory.
In lockstep mode, both 128 KB banks are accessible to the cluster.

As per the ZynqMP UltraScale+ Technical Reference Manual UG1085, the
following is the address space of TCM memory. The bindings in this patch
series introduce properties to accommodate this address space, with
address translation between the Linux and Cortex-R5 views.

| *Mode*             | *R5 View*    | *Linux view* | Notes                  |
| ------------------ | ------------ | ------------ | ---------------------- |
| *Split Mode*       | *start addr* | *start addr* |                        |
| R5_0 ATCM (64 KB)  | 0x0000_0000  | 0xFFE0_0000  |                        |
| R5_0 BTCM (64 KB)  | 0x0002_0000  | 0xFFE2_0000  |                        |
| R5_1 ATCM (64 KB)  | 0x0000_0000  | 0xFFE9_0000  | alias of 0xFFE1_0000   |
| R5_1 BTCM (64 KB)  | 0x0002_0000  | 0xFFEB_0000  | alias of 0xFFE3_0000   |
| ___                | ___          | ___          |                        |
| *Lockstep Mode*    |              |              |                        |
| R5_0 ATCM (128 KB) | 0x0000_0000  | 0xFFE0_0000  |                        |
| R5_0 BTCM (128 KB) | 0x0002_0000  | 0xFFE2_0000  |                        |
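
As a quick cross-check of the split-mode rows, a small helper (an editor's
sketch whose constants come from the table above) that maps an R5-view TCM
address to the Linux view:

    /* split mode: ATCM at R5 0x0, BTCM at R5 0x20000, 64 KB each */
    static u32 r5_tcm_to_linux(u32 core, u32 r5_addr)
    {
            static const u32 base[2][2] = {
                    { 0xffe00000, 0xffe20000 },     /* R5_0 ATCM, BTCM */
                    { 0xffe90000, 0xffeb0000 },     /* R5_1 ATCM, BTCM */
            };
            u32 bank = (r5_addr >= 0x20000);        /* 0 = ATCM, 1 = BTCM */

            return base[core][bank] + (r5_addr & 0xffff);
    }

For example, r5_tcm_to_linux(1, 0x20000) yields 0xFFEB_0000, matching the
R5_1 BTCM row.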

References:
UG1085 TCM address space:
https://docs.xilinx.com/r/en-US/ug1085-zynq-ultrascale-trm/Tightly-Coupled-Memory-Address-Map

Changes in v6:
  - Introduce new node entry for r5f cluster split mode dts and
keep it disabled by default.
  - Keep remoteproc lockstep mode enabled by default to maintain
    backward compatibility.
  - Enable split mode only for the zcu102 board to demo split mode use
  - Remove spurious change
  - Handle errors in add_pm_domains function
  - Remove redundant code to handle errors from remove_pm_domains
  - Add missing . at the end of the commit message
  - remove redundant initialization of variables
  - remove fail_tcm label and relevant code to free memory
    acquired using devm_* API, as it will be freed when the device is freed
  - add extra check to see if "reg" property is supported or not

Changes in v5:
  - maintain Rob's Ack on bindings patch as no changes in bindings
  - split previous patch into multiple patches
  - Use pm domain framework to turn on/off TCM
  - Add support of parsing TCM information from device-tree
  - maintain backward compatibility with previous bindings without
TCM information available in device-tree

This patch series continues previous effort to upstream ZynqMP
TCM bindings:
Previous v4 version link:
https://lore.kernel.org/all/20230829181900.2561194-1-tanmay.s...@amd.com/

Previous v3 version link:
https://lore.kernel.org/all/1689964908-22371-1-git-send-email-radhey.shyam.pan...@amd.com/

Radhey Shyam Pandey (1):
  dt-bindings: remoteproc: add Tightly Coupled Memory (TCM) bindings

Tanmay Shah (3):
  dts: zynqmp: add properties for TCM in remoteproc
  remoteproc: zynqmp: add pm domains support
  remoteproc: zynqmp: parse TCM from device tree

 .../remoteproc/xlnx,zynqmp-r5fss.yaml | 131 ++-
 .../boot/dts/xilinx/zynqmp-zcu102-rev1.0.dts  |   8 +
 arch/arm64/boot/dts/xilinx/zynqmp.dtsi|  60 ++-
 drivers/remoteproc/xlnx_r5_remoteproc.c   | 368 --
 4 files changed, 517 insertions(+), 50 deletions(-)


base-commit: a7d272979d3a89b117ca2c547dc8a465c4f28635
-- 
2.25.1



Re: [RFC PATCH bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

2023-10-12 Thread Rod Webster
For the novice with the RT kernel, could somebody tell me what kernel
this bug was first introduced in and what kernel we need to install to
get the fix?

This could be the issue we have been experiencing in the Linuxcnc
community with excessive RT network latency (mostly with Realtek
NICs).
I had flagged lazy preemption as the possible change causing
this issue. I thought lazy preemption was added circa kernel 5.9, as
it affected Debian Bullseye on kernel 5.10, which coincided with when
we first observed the problem.

Thanks in anticipation.

Rod Webster

Rod Webster
1300 896 832
+61 435 765 611
VMN®
www.vmn.com.au

Sole Queensland Distributor


On Thu, 12 Oct 2023 at 21:46, Artem Savkov  wrote:
>
> linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
> for lazy preemption")) that adds an extra member to struct trace_entry.
> This causes the offset of the args field in struct trace_event_raw_sys_enter
> to be different from the one in struct syscall_trace_enter:
>
> struct trace_event_raw_sys_enter {
> 	struct trace_entry ent;          /*  0  12 */
>
> 	/* XXX last struct has 3 bytes of padding */
> 	/* XXX 4 bytes hole, try to pack */
>
> 	long int           id;           /* 16   8 */
> 	long unsigned int  args[6];      /* 24  48 */
> 	/* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
> 	char               __data[];     /* 72   0 */
>
> 	/* size: 72, cachelines: 2, members: 4 */
> 	/* sum members: 68, holes: 1, sum holes: 4 */
> 	/* paddings: 1, sum paddings: 3 */
> 	/* last cacheline: 8 bytes */
> };
>
> struct syscall_trace_enter {
> 	struct trace_entry ent;          /*  0  12 */
>
> 	/* XXX last struct has 3 bytes of padding */
>
> 	int                nr;           /* 12   4 */
> 	long unsigned int  args[];       /* 16   0 */
>
> 	/* size: 16, cachelines: 1, members: 3 */
> 	/* paddings: 1, sum paddings: 3 */
> 	/* last cacheline: 16 bytes */
> };
>
> This, in turn, causes perf_event_set_bpf_prog() to fail while running the bpf
> test_profiler testcase, because max_ctx_offset is calculated based on the
> former struct while off is based on the latter:
>
>   10488 if (is_tracepoint || is_syscall_tp) {
>   10489 int off = trace_event_get_offsets(event->tp_event);
>   10490
>   10491 if (prog->aux->max_ctx_offset > off)
>   10492 return -EACCES;
>   10493 }
>
> What the bpf program actually gets is a pointer to struct
> syscall_tp_t, defined in kernel/trace/trace_syscalls.c. This patch fixes
> the problem by aligning struct syscall_tp_t with struct
> syscall_trace_(enter|exit) and changing the tests to use these structs
> to dereference the context.
>
> Signed-off-by: Artem Savkov 
> ---
>  kernel/trace/trace_syscalls.c| 4 ++--
>  tools/testing/selftests/bpf/progs/profiler.inc.h | 2 +-
>  tools/testing/selftests/bpf/progs/test_vmlinux.c | 4 ++--
>  3 files changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index de753403cdafb..9c581d6da843a 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -556,7 +556,7 @@ static int perf_call_bpf_enter(struct trace_event_call 
> *call, struct pt_regs *re
>  {
> struct syscall_tp_t {
> struct trace_entry ent;
> -   unsigned long syscall_nr;
> +   int syscall_nr;
> unsigned long args[SYSCALL_DEFINE_MAXARGS];
> } __aligned(8) param;
> int i;
> @@ -661,7 +661,7 @@ static int perf_call_bpf_exit(struct trace_event_call 
> *call, struct pt_regs *reg
>  {
> struct syscall_tp_t {
> struct trace_entry ent;
> -   unsigned long syscall_nr;
> +   int syscall_nr;
> unsigned long ret;
> } __aligned(8) param;
>
> diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h 
> b/tools/testing/selftests/bpf/progs/profiler.inc.h
> index f799d87e87002..897061930cb76 100644
> --- a/tools/testing/selftests/bpf/progs/profiler.inc.h
> +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
> @@ -609,7 +609,7 @@ ssize_t BPF_KPROBE(kprobe__proc_sys_write,
>  }
>
>  SEC("tracepoint/syscalls/sys_enter_kill")
> -int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* 
> ctx)
> +int tracepoint__syscalls__sys_enter_kill(struct syscall_trace_enter* ctx)
>  {
> struct bpf_func_stats_ctx stats_ctx;
>
> diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c 
> b/tools/testing/selftests/bpf/progs/test_vmlinux.c
> index 4b8e37f7fd06c..78b23934d9f8f 100644
> --- a/tools/testing/selftests/bpf/progs/test_vmlinux.c
> +++ 

Re: [PATCH v9 1/5] lib: objpool added: ring-array based lockless MPMC

2023-10-12 Thread wuqiang.matt

On 2023/10/13 09:59, Masami Hiramatsu (Google) wrote:

On Fri, 13 Oct 2023 01:36:05 +0800
"wuqiang.matt"  wrote:


On 2023/10/12 22:02, Masami Hiramatsu (Google) wrote:

Hi Wuqiang,

On Mon, 9 Oct 2023 17:23:34 +0800
wuqiang  wrote:


Hello Masami,

Just got time for the new patch and noticed that ages[] was removed. ages[] is
introduced like a 2-phase commit to keep consistency and must be kept.

Thinking of the following 2 cases that two cpu nodes are operating the same
objpool_slot simultaneously:

Case 1:

 NODE 1:  NODE 2:
 push to an empty slotpop will get wrong value

 try_cmpxchg_acquire(&slot->tail, &tail, next)
                                          try_cmpxchg_release(&slot->head, &head, head + 1)
                                          return slot->entries[head & slot->mask]
 WRITE_ONCE(slot->entries[tail & slot->mask], obj)


Today, I rethink the algorithm.

For this case, we can just add a `commit` to the slot for committing the tail
commit instead of the ages[].

CPU1   CPU2
push to an empty slot  pop from the same slot

commit = tail = slot->tail;
next = tail + 1;
try_cmpxchg_acquire(&slot->tail, &tail, next);
                                       while (head != commit) -> NG1
WRITE_ONCE(slot->entries[tail & slot->mask], obj)
                                       while (head != commit) -> NG2
WRITE_ONCE(slot->commit, next);
                                       while (head != commit) -> OK
                                       try_cmpxchg_acquire(&slot->head, &head, next);
                                       return slot->entries[head & slot->mask]

So the NG1 and NG2 timing, the pop will fail.

This does not expect the nested "push" case because the reserve-commit block
will be interrupt disabled. This doesn't support NMI but that is OK.


If 2 pushes are performed in a row, slot->commit will be 'tail + 2', and CPU2 won't
meet the condition "head == commit".


No, it doesn't happen, because pop() may fetch the object from other's slot, but
push() only happens on its own slot. So push() on the same slot is never
in parallel.


I got it: 'commit' is now the actual tail. Then "head != commit" means there
are free objects ready for pop.

Great idea indeed. I'll give it a try.


Since slot->commit is always synced to slot->tail after a successful push,
should pop check "tail != commit"?


No, pop() only need to check the "head != commit". If "head == commit", this 
slot
is empty or the slot owner is trying to push() (not committed yet).


In this case when a push is ongoing with
the slot, pop() has to wait for its completion even if there are objects in
the same slot.


No, pop() will go to other slots to find available object.




If we need to support such NMI or remove irq-disable, we can do following
(please remember the push operation only happens on the slot owned by that
   CPU. So this is per-cpu process)

---
do {
commit = tail = slot->tail;
next = tail + 1;
} while (!try_cmpxchg_acquire(&slot->tail, &tail, next));

WRITE_ONCE(slot->entries[tail & slot->mask], obj);

// At this point, "commit == slot->tail - 1" in this nest level.
// Only in the outermost (non-nested) case, "commit == slot->commit"
if (commit != slot->commit)
return; // do nothing. it must be updated by the outermost one.

// catch up commit to tail.
do {
commit = READ_ONCE(slot->tail);
WRITE_ONCE(slot->commit, commit);
 // note that someone can interrupt this loop too.
} while (commit != READ_ONCE(slot->tail));
---
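
To make the pop() side of this scheme concrete, a compilable sketch; the
field and helper names follow this discussion, not the posted patch:

    struct objpool_slot {
            u32  head, tail, commit, mask;
            void *entries[];
    };

    static void *objpool_try_pop(struct objpool_slot *slot)
    {
            u32 head, commit;

            do {
                    head = READ_ONCE(slot->head);
                    /* only entries below 'commit' are fully published */
                    commit = smp_load_acquire(&slot->commit);
                    if (head == commit)
                            return NULL;  /* empty, or push not committed yet */
            } while (!try_cmpxchg_acquire(&slot->head, &head, head + 1));

            return slot->entries[head & slot->mask];
    }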


Yes, I got you: push can only happen on local node and the first try has the
right to extend 'commit'. It does resolve nested push attempts, and preemption
must be disabled.


Yes.



The overflow/ABA issue can still happen if the cpu is being interrupted for
long time.


Yes, but ages[] also can not avoid that. (Note that head, tail and commit are
32-bit counters, and the per-cpu slot ring is big enough to store the pointers
of all objects.)

Thank you,




For the rethook this may be too much.

Thank you,




Case 2:

 NODE 1:  NODE 2
 push to slot w/ 1 objpop will get wrong value

                                          try_cmpxchg_release(&slot->head, &head, head + 1)
try_cmpxchg_acquire(&slot->tail, &tail, next)
WRITE_ONCE(slot->entries[tail & slot->mask], obj)
                                          return slot->entries[head & slot->mask]


The pre-condition should be: CPU 1 tries to push to a full slot, in this case
tail = head + capacity but tail & mask == head & mask



Regards,
wuqiang

On 2023/9/25 17:42, Masami Hiramatsu (Google) wrote:

Hi Wuqiang,

On Tue,  5 Sep 2023 09:52:51 +0800
"wuqiang.matt"  wrote:


The object pool is a scalable implementation of a high-performance queue
for object allocation and reclamation, such as kretprobe instances.

By leveraging a percpu ring-array to mitigate the hot spot of memory
contention, it could deliver 

Re: [PATCH v9 1/5] lib: objpool added: ring-array based lockless MPMC

2023-10-12 Thread Google
On Fri, 13 Oct 2023 01:36:05 +0800
"wuqiang.matt"  wrote:

> On 2023/10/12 22:02, Masami Hiramatsu (Google) wrote:
> > Hi Wuqiang,
> > 
> > On Mon, 9 Oct 2023 17:23:34 +0800
> > wuqiang  wrote:
> > 
> >> Hello Masami,
> >>
> >> Just got time for the new patch and noticed that ages[] was removed. ages[] is
> >> introduced like a 2-phase commit to keep consistency and must be kept.
> >>
> >> Thinking of the following 2 cases that two cpu nodes are operating the same
> >> objpool_slot simultaneously:
> >>
> >> Case 1:
> >>
> >> NODE 1:  NODE 2:
> >> push to an empty slotpop will get wrong value
> >>
> >> try_cmpxchg_acquire(&slot->tail, &tail, next)
> >>  try_cmpxchg_release(&slot->head, &head, head + 1)
> >>  return slot->entries[head & slot->mask]
> >> WRITE_ONCE(slot->entries[tail & slot->mask], obj)
> > 
> > Today, I rethink the algorithm.
> > 
> > For this case, we can just add a `commit` to the slot for committing the 
> > tail
> > commit instead of the ages[].
> > 
> > CPU1   CPU2
> > push to an empty slot  pop from the same slot
> > 
> > commit = tail = slot->tail;
> > next = tail + 1;
> > try_cmpxchg_acquire(&slot->tail, &tail, next);
> >                                      while (head != commit) -> NG1
> > WRITE_ONCE(slot->entries[tail & slot->mask], obj)
> >                                      while (head != commit) -> NG2
> > WRITE_ONCE(slot->commit, next);
> >                                      while (head != commit) -> OK
> >                                      try_cmpxchg_acquire(&slot->head, &head, next);
> >                                      return slot->entries[head & slot->mask]
> > 
> > So the NG1 and NG2 timing, the pop will fail.
> > 
> > This does not expect the nested "push" case because the reserve-commit block
> > will be interrupt disabled. This doesn't support NMI but that is OK.
> 
> If 2 pushes are performed in a row, slot->commit will be 'tail + 2', and CPU2 won't
> meet the condition "head == commit".

No, it doesn't happen, because pop() may fetch the object from other's slot, but
push() only happens on its own slot. So push() on the same slot is never
in parallel.

> 
> Since slot->commit is always synced to slot->tail after a successful push,
> should pop check "tail != commit"?

No, pop() only need to check the "head != commit". If "head == commit", this 
slot
is empty or the slot owner is trying to push() (not committed yet).

> In this case when a push is ongoing with
> the slot, pop() has to wait for its completion even if there are objects in
> the same slot.

No, pop() will go to other slots to find available object.

> 
> > If we need to support such NMI or remove irq-disable, we can do following
> > (please remember the push operation only happens on the slot owned by that
> >   CPU. So this is per-cpu process)
> > 
> > ---
> > do {
> >    commit = tail = slot->tail;
> >    next = tail + 1;
> > } while (!try_cmpxchg_acquire(&slot->tail, &tail, next));
> > 
> > WRITE_ONCE(slot->entries[tail & slot->mask], obj);
> > 
> > // At this point, "commit == slot->tail - 1" in this nest level.
> > // Only in the outermost (non-nested) case, "commit == slot->commit"
> > if (commit != slot->commit)
> >    return; // do nothing. it must be updated by the outermost one.
> > 
> > // catch up commit to tail.
> > do {
> >commit = READ_ONCE(slot->tail);
> >WRITE_ONCE(slot->commit, commit);
> > // note that someone can interrupt this loop too.
> > } while (commit != READ_ONCE(slot->tail));
> > ---
> 
> Yes, I got you: push can only happen on local node and the first try has the
> right to extend 'commit'. It does resolve nested push attempts, and preemption
> must be disabled.

Yes.

> 
> The overflow/ABA issue can still happen if the cpu is being interrupted for
> long time.

Yes, but ages[] also can not avoid that. (Note that head, tail and commit are
32-bit counters, and the per-cpu slot ring is big enough to store the pointers
of all objects.)

Thank you,

> 
> > For the rethook this may be too much.
> > 
> > Thank you,
> > 
> >>
> >>
> >> Case 2:
> >>
> >> NODE 1:  NODE 2
> >> push to slot w/ 1 objpop will get wrong value
> >>
> >>  try_cmpxchg_release(&slot->head, &head, head + 1)
> >> try_cmpxchg_acquire(&slot->tail, &tail, next)
> >> WRITE_ONCE(slot->entries[tail & slot->mask], obj)
> >>  return slot->entries[head & slot->mask]
> 
> The pre-condition should be: CPU 1 tries to push to a full slot, in this case
> tail = head + capacity but tail & mask == head & mask
> 
> >>
> >> Regards,
> >> wuqiang
> >>
> >> On 2023/9/25 17:42, Masami Hiramatsu (Google) wrote:
> >>> Hi Wuqiang,
> >>>
> >>> On Tue,  5 Sep 2023 09:52:51 +0800
> >>> "wuqiang.matt"  wrote:
> >>>
> 

RE: [PATCH v1 1/2] ACPI: NFIT: Fix memory leak, and local use of devm_*()

2023-10-12 Thread Dan Williams
Michal Wilczynski wrote:
> The devm_*() family of functions is meant for managing memory attached to a
> device. So in general it should only be used for allocations that should
> last for the whole lifecycle of the device. This is not the case for
> acpi_nfit_init_interleave_set(). There are two allocations that are only
> used locally in this function. What's more - if the function exits on
> error path memory is never freed. It's still attached to dev and would
> be freed on device detach, so this leak could be called a 'local leak'.

This analysis is incorrect; devm cleans up on driver ->probe() failure in
addition to ->remove(), and these error returns result in ->probe()
failures. No leak, i.e. this is not a fix.

The conversion to modern probe is ok if you want to resubmit that one
without this intervening change.



Re: [RFC PATCH bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

2023-10-12 Thread Andrii Nakryiko
On Thu, Oct 12, 2023 at 6:43 AM Steven Rostedt  wrote:
>
> On Thu, 12 Oct 2023 13:45:50 +0200
> Artem Savkov  wrote:
>
> > linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
> > for lazy preemption")) that adds an extra member to struct trace_entry.
> > This causes the offset of the args field in struct trace_event_raw_sys_enter
> > to be different from the one in struct syscall_trace_enter:
> >
> > struct trace_event_raw_sys_enter {
> > 	struct trace_entry ent;          /*  0  12 */
> >
> > 	/* XXX last struct has 3 bytes of padding */
> > 	/* XXX 4 bytes hole, try to pack */
> >
> > 	long int           id;           /* 16   8 */
> > 	long unsigned int  args[6];      /* 24  48 */
> > 	/* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
> > 	char               __data[];     /* 72   0 */
> >
> > 	/* size: 72, cachelines: 2, members: 4 */
> > 	/* sum members: 68, holes: 1, sum holes: 4 */
> > 	/* paddings: 1, sum paddings: 3 */
> > 	/* last cacheline: 8 bytes */
> > };
> >
> > struct syscall_trace_enter {
> > 	struct trace_entry ent;          /*  0  12 */
> >
> > 	/* XXX last struct has 3 bytes of padding */
> >
> > 	int                nr;           /* 12   4 */
> > 	long unsigned int  args[];       /* 16   0 */
> >
> > 	/* size: 16, cachelines: 1, members: 3 */
> > 	/* paddings: 1, sum paddings: 3 */
> > 	/* last cacheline: 16 bytes */
> > };
> >
> > This, in turn, causes perf_event_set_bpf_prog() to fail while running the bpf
> > test_profiler testcase, because max_ctx_offset is calculated based on the
> > former struct while off is based on the latter:
> >
> >   10488 if (is_tracepoint || is_syscall_tp) {
> >   10489 int off = trace_event_get_offsets(event->tp_event);
> >   10490
> >   10491 if (prog->aux->max_ctx_offset > off)
> >   10492 return -EACCES;
> >   10493 }
> >
> > What the bpf program actually gets is a pointer to struct
> > syscall_tp_t, defined in kernel/trace/trace_syscalls.c. This patch fixes
> > the problem by aligning struct syscall_tp_t with struct
> > syscall_trace_(enter|exit) and changing the tests to use these structs
> > to dereference the context.
> >
> > Signed-off-by: Artem Savkov 
>

I think these changes make sense regardless; can you please resend the
patch without the RFC tag so that our CI can run tests for it?

> Thanks for doing a proper fix.
>
> Acked-by: Steven Rostedt (Google) 

But looking at [0] and briefly reading some of the discussions you,
Steven, had, I'm just wondering if it would be best to avoid
increasing struct trace_entry altogether? It seems like preempt_count
is actually a 4-bit field in trace context, so it doesn't seem like we
really need to allocate an entire byte for both preempt_count and
preempt_lazy_count. Why can't we just combine them and not waste 8
extra bytes for each trace event in a ring buffer?

  [0] 
https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git/commit/?id=b1773eac3f29cbdcdfd16e0339f1a164066e9f71
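
A hypothetical layout sketch of such packing, assuming both counts really
fit in 4 bits as noted above (this is not the rt-devel code):

    struct trace_entry {
            unsigned short  type;
            unsigned char   flags;
            unsigned char   preempt_count;  /* low nibble: preempt_count,
                                               high nibble: preempt_lazy_count */
            int             pid;
    };                                      /* still 8 bytes */

    #define TRACE_PREEMPT_COUNT(e)      ((e)->preempt_count & 0x0f)
    #define TRACE_PREEMPT_LAZY_COUNT(e) (((e)->preempt_count >> 4) & 0x0f)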

>
> -- Steve



[PATCH] ACPI: NFIT: Optimize nfit_mem_cmp() for efficiency

2023-10-12 Thread Kuan-Wei Chiu
The original code used conditional branching in the nfit_mem_cmp
function to compare two values and return -1, 1, or 0 based on the
result. However, the list_sort comparison function only needs results
<0, >0, or =0. This patch optimizes the code to make the comparison
branchless, improving efficiency and reducing code size. This change
reduces the number of comparison operations from 1-2 to a single
subtraction operation, thereby saving instructions.
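
For reference, a list_sort() comparator only has to encode sign. A
subtraction encodes it correctly only while the difference fits in int;
device_handle is a u32, so a wrap-safe branchless variant (a sketch, not
part of the patch) would be:

    static int cmp_u32(u32 a, u32 b)
    {
            return (a > b) - (a < b);  /* always -1, 0 or 1; never wraps */
    }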

Signed-off-by: Kuan-Wei Chiu 
---
 drivers/acpi/nfit/core.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index f96bf32cd368..eea827d9af08 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1138,11 +1138,7 @@ static int nfit_mem_cmp(void *priv, const struct 
list_head *_a,
 
handleA = __to_nfit_memdev(a)->device_handle;
handleB = __to_nfit_memdev(b)->device_handle;
-   if (handleA < handleB)
-   return -1;
-   else if (handleA > handleB)
-   return 1;
-   return 0;
+   return handleA - handleB;
 }
 
 static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
-- 
2.25.1




Re: [PATCH -fixes] riscv: Fix ftrace syscall handling which are now prefixed with __riscv_

2023-10-12 Thread patchwork-bot+linux-riscv
Hello:

This patch was applied to riscv/linux.git (fixes)
by Palmer Dabbelt :

On Tue,  3 Oct 2023 20:24:07 +0200 you wrote:
> ftrace creates entries for each syscall in the tracefs but has failed
> since commit 08d0ce30e0e4 ("riscv: Implement syscall wrappers") which
> prefixes all riscv syscalls with __riscv_.
> 
> So fix this by implementing arch_syscall_match_sym_name() which allows us
> to ignore this prefix.
> 
> [...]

Here is the summary with links:
  - [-fixes] riscv: Fix ftrace syscall handling which are now prefixed with 
__riscv_
https://git.kernel.org/riscv/c/a87e7d3e8832

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html





Re: [PATCH v2] trace: tracing_event_filter: fast path when no subsystem filters

2023-10-12 Thread Nick Lowell
On Wed, Oct 4, 2023 at 10:52 AM Steven Rostedt  wrote:
>
> On Wed, 4 Oct 2023 10:39:33 -0400
> Nick Lowell  wrote:
>
> > > [ cut here ]
> > >  WARNING: CPU: 5 PID: 944 at kernel/trace/trace_events_filter.c:2423 
> > > apply_subsystem_event_filter+0x18c/0x5e0
> > >  Modules linked in:
> > >  CPU: 5 PID: 944 Comm: trace-cmd Not tainted 
> > > 6.6.0-rc4-test-9-gff7cd7446fe5 #102
> > >  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> > > 1.16.2-debian-1.16.2-1 04/01/2014
> > >  RIP: 0010:apply_subsystem_event_filter+0x18c/0x5e0
> > >  Code: 44 24 08 00 00 00 00 48 8b 6d 00 4c 39 f5 75 bc 48 8b 44 24 18 4c 
> > > 8b 60 18 4c 89 e5 45 84 ff 75 14 48 85 ed 0f 84 37 ff ff ff <0f> 0b eb 10 
> > > e8 4b be fd ff eb b0 4d 85 e4 0f 84 a3 02 00 00 48 8b
> > >  RSP: 0018:9b4941607db8 EFLAGS: 00010286
> > >  RAX: 8b2780a77280 RBX: 8b2780a77400 RCX: 
> > >  RDX:  RSI: 8b2781c11c38 RDI: 8b2781c11c38
> > >  RBP: 8b28df449030 R08: 8b2781c11c38 R09: 
> > >  R10: 8b2781c11c38 R11:  R12: 8b28df449030
> > >  R13: aaf64de0 R14: aaf66bb8 R15: 
> > >  FS:  7fd221def3c0() GS:8b28f7d4() 
> > > knlGS:
> > >  CS:  0010 DS:  ES:  CR0: 80050033
> > >  CR2: 56117c93e160 CR3: 00010173a003 CR4: 00170ee0
> > >  Call Trace:
> > >   
> > >   ? apply_subsystem_event_filter+0x18c/0x5e0
> > >   ? __warn+0x81/0x130
> > >   ? apply_subsystem_event_filter+0x18c/0x5e0
> > >   ? report_bug+0x191/0x1c0
> > >   ? handle_bug+0x3c/0x80
> > >   ? exc_invalid_op+0x17/0x70
> > >   ? asm_exc_invalid_op+0x1a/0x20
> > >   ? apply_subsystem_event_filter+0x18c/0x5e0
> > >   ? apply_subsystem_event_filter+0x5b/0x5e0
> > >   ? __check_object_size+0x25b/0x2c0
> > >   subsystem_filter_write+0x41/0x70
> > >   vfs_write+0xf2/0x440
> > >   ? kmem_cache_free+0x22/0x350
> > >   ksys_write+0x6f/0xf0
> > >   do_syscall_64+0x3f/0xc0
> > >   entry_SYSCALL_64_after_hwframe+0x6e/0xd8
> > >  RIP: 0033:0x7fd221ee7ae0
> > >
> > > -- Steve
> >
> > Is this just informative, indicating that there are issues with how
> > filters are being used, or are you saying there is something else I
> > need to do before this patch is approved?
> > What version of trace-cmd is that using?
>
> Not sure if it matters, but the above was with trace-cmd v3.2.
>
> So, I guess we need to look a bit deeper at the change.
>
> > @@ -2411,7 +2418,12 @@ int apply_subsystem_event_filter(struct 
> > trace_subsystem_dir *dir,
> >   }
> >
> >   if (!strcmp(strstrip(filter_string), "0")) {
> > - filter_free_subsystem_preds(dir, tr);
> > + /* If nothing was freed, we do not need to sync */
> > + if (!filter_free_subsystem_preds(dir, tr)) {
> > + if(!(WARN_ON_ONCE(system->filter)))
> > + goto out_unlock;
>
> When do we want to skip the below?
>
> The original version just did the "goto out_unlock" before the
> "system->filter" check, and that would have caused a memory leak, or just
> left the "system->filter" around when unneeded.
>
> In other words, the patch is incorrect in general then.
>
> > + }
> > +
> >   remove_filter_string(system->filter);
> >   filter = system->filter;
> >   system->filter = NULL;
>
> I believe, what you really want here is simply:
>
> bool sync;
>
> [..]
>
> if (!strcmp(strstrip(filter_string), "0")) {
> +   sync = filter_free_subsystem_preds(dir, tr);
> remove_filter_string(system->filter);
> filter = system->filter;
> system->filter = NULL;
> -   /* Ensure all filters are no longer used */
> -   tracepoint_synchronize_unregister();
> +   if (sync) {
> +   /* Ensure all filters are no longer used */
> +   tracepoint_synchronize_unregister();
> +   }
> filter_free_subsystem_filters(dir, tr);
>

I really appreciate the continued feedback. I was able to reproduce it.
I think I'm understanding better but still need some help.
I am actually wondering if remove_filter_string(system->filter) should
also return bool as an OR'd input for sync.
Should it be something like this?

if (!strcmp(strstrip(filter_string), "0")) {
-   filter_free_subsystem_preds(dir, tr);
-   remove_filter_string(system->filter);
+  bool sync;
+
+  sync = filter_free_subsystem_preds(dir, tr);
+  sync = sync || remove_filter_string(system->filter);
filter = system->filter;
system->filter = NULL;
-   /* Ensure all filters are no longer used */
-   tracepoint_synchronize_unregister();
+  /* If nothing was freed, we do not need to sync 
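
For clarity, the fast path converged on in this thread amounts to roughly
the following sketch, assuming filter_free_subsystem_preds() is changed to
return whether anything was actually freed:

    if (!strcmp(strstrip(filter_string), "0")) {
            bool freed = filter_free_subsystem_preds(dir, tr);

            remove_filter_string(system->filter);
            filter = system->filter;
            system->filter = NULL;
            /* only sync if some filter was actually in use */
            if (freed)
                    tracepoint_synchronize_unregister();
            filter_free_subsystem_filters(dir, tr);
            /* rest of the function unchanged */
    }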

Re: [PATCH v2] module: Add CONFIG_MODULE_LOAD_IN_SEQUENCE option

2023-10-12 Thread kernel test robot
Hi Joey,

kernel test robot noticed the following build warnings:

[auto build test WARNING on mcgrof/modules-next]
[also build test WARNING on linus/master v6.6-rc5 next-20231012]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:
https://github.com/intel-lab-lkp/linux/commits/Joey-Jiao/module-Add-CONFIG_MODULE_LOAD_IN_SEQUENCE-option/20231011-154640
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git 
modules-next
patch link:
https://lore.kernel.org/r/20231011074438.6098-1-quic_jiangenj%40quicinc.com
patch subject: [PATCH v2] module: Add CONFIG_MODULE_LOAD_IN_SEQUENCE option
config: um-i386_defconfig 
(https://download.01.org/0day-ci/archive/20231013/202310130206.f778hunp-...@intel.com/config)
compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
reproduce (this is a W=1 build): 
(https://download.01.org/0day-ci/archive/20231013/202310130206.f778hunp-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202310130206.f778hunp-...@intel.com/

All warnings (new ones prefixed by >>):

   kernel/module/main.c: In function 'do_init_module':
   kernel/module/main.c:2627:12: error: invalid storage class for function 
'may_init_module'
static int may_init_module(void)
   ^~~
   kernel/module/main.c:2636:13: error: invalid storage class for function 
'finished_loading'
static bool finished_loading(const char *name)
^~~~
   kernel/module/main.c:2657:12: error: invalid storage class for function 
'module_patient_check_exists'
static int module_patient_check_exists(const char *name,
   ^~~
   kernel/module/main.c:2701:12: error: invalid storage class for function 
'add_unformed_module'
static int add_unformed_module(struct module *mod)
   ^~~
   kernel/module/main.c:2722:12: error: invalid storage class for function 
'complete_formation'
static int complete_formation(struct module *mod, struct load_info *info)
   ^~
   kernel/module/main.c:2755:12: error: invalid storage class for function 
'prepare_coming_module'
static int prepare_coming_module(struct module *mod)
   ^
   kernel/module/main.c:2773:12: error: invalid storage class for function 
'unknown_module_param_cb'
static int unknown_module_param_cb(char *param, char *val, const char 
*modname,
   ^~~
   kernel/module/main.c:2793:12: error: invalid storage class for function 
'early_mod_check'
static int early_mod_check(struct load_info *info, int flags)
   ^~~
   kernel/module/main.c:2829:12: error: invalid storage class for function 
'load_module'
static int load_module(struct load_info *info, const char __user *uargs,
   ^~~
>> kernel/module/main.c:3039:1: warning: 'alias' attribute ignored 
>> [-Wattributes]
SYSCALL_DEFINE3(init_module, void __user *, umod,
^~~
   In file included from kernel/module/main.c:26:0:
   include/linux/syscalls.h:247:21: error: invalid storage class for function 
'__do_sys_init_module'
 static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
^
   include/linux/syscalls.h:230:2: note: in expansion of macro 
'__SYSCALL_DEFINEx'
 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 ^
   include/linux/syscalls.h:221:36: note: in expansion of macro 
'SYSCALL_DEFINEx'
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
   ^~~
   kernel/module/main.c:3039:1: note: in expansion of macro 'SYSCALL_DEFINE3'
SYSCALL_DEFINE3(init_module, void __user *, umod,
^~~
   include/linux/syscalls.h:249:18: error: static declaration of 
'__se_sys_init_module' follows non-static declaration
 asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
 ^
   include/linux/syscalls.h:230:2: note: in expansion of macro 
'__SYSCALL_DEFINEx'
 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 ^
   include/linux/syscalls.h:221:36: note: in expansion of macro 
'SYSCALL_DEFINEx'
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
   ^~~
   kernel/module/main.c:3039:1: note: in expansion of macro 'SYSCALL_DEFINE3'
SYSCALL_DEFINE3(init_module, void __user *, umod,
^~~
   include/linux/syscalls.h:248:18: note: previous declaration of 
'__se_sys_init_module' was here
 asmlinkage long __se

Re: [PATCH v2] module: Add CONFIG_MODULE_LOAD_IN_SEQUENCE option

2023-10-12 Thread kernel test robot
Hi Joey,

kernel test robot noticed the following build errors:

[auto build test ERROR on mcgrof/modules-next]
[also build test ERROR on linus/master v6.6-rc5 next-20231012]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:
https://github.com/intel-lab-lkp/linux/commits/Joey-Jiao/module-Add-CONFIG_MODULE_LOAD_IN_SEQUENCE-option/20231011-154640
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git 
modules-next
patch link:
https://lore.kernel.org/r/20231011074438.6098-1-quic_jiangenj%40quicinc.com
patch subject: [PATCH v2] module: Add CONFIG_MODULE_LOAD_IN_SEQUENCE option
config: um-allnoconfig 
(https://download.01.org/0day-ci/archive/20231013/202310130236.lybpy0lh-...@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 
4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce (this is a W=1 build): 
(https://download.01.org/0day-ci/archive/20231013/202310130236.lybpy0lh-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202310130236.lybpy0lh-...@intel.com/

All errors (new ones prefixed by >>):

   In file included from kernel/module/main.c:14:
   In file included from include/linux/trace_events.h:9:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
 547 | val = __raw_readb(PCI_IOBASE + addr);
 |   ~~ ^
   include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
 560 | val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + 
addr));
 | ~~ ^
   include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from 
macro '__le16_to_cpu'
  37 | #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
 |   ^
   In file included from kernel/module/main.c:14:
   In file included from include/linux/trace_events.h:9:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
 573 | val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + 
addr));
 | ~~ ^
   include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from 
macro '__le32_to_cpu'
  35 | #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
 |   ^
   In file included from kernel/module/main.c:14:
   In file included from include/linux/trace_events.h:9:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
 584 | __raw_writeb(value, PCI_IOBASE + addr);
 | ~~ ^
   include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
 594 | __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + 
addr);
 |   ~~ ^
   include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
 604 | __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + 
addr);
 |   ~~ ^
   include/asm-generic/io.h:692:20: warning: performing pointer arit

Re: [PATCH v9 1/5] lib: objpool added: ring-array based lockless MPMC

2023-10-12 Thread wuqiang.matt

On 2023/10/12 22:02, Masami Hiramatsu (Google) wrote:

Hi Wuqiang,

On Mon, 9 Oct 2023 17:23:34 +0800
wuqiang  wrote:


Hello Masami,

Just got time for the new patch and noticed that ages[] was removed. ages[] is
introduced like a 2-phase commit to keep consistency and must be kept.

Thinking of the following 2 cases that two cpu nodes are operating the same
objpool_slot simultaneously:

Case 1:

NODE 1:  NODE 2:
push to an empty slotpop will get wrong value

try_cmpxchg_acquire(&slot->tail, &tail, next)
                                         try_cmpxchg_release(&slot->head, &head, head + 1)
                                         return slot->entries[head & slot->mask]
WRITE_ONCE(slot->entries[tail & slot->mask], obj)


Today, I rethink the algorithm.

For this case, we can just add a `commit` to the slot for committing the tail
commit instead of the ages[].

CPU1   CPU2
push to an empty slot  pop from the same slot

commit = tail = slot->tail;
next = tail + 1;
try_cmpxchg_acquire(&slot->tail, &tail, next);
                                       while (head != commit) -> NG1
WRITE_ONCE(slot->entries[tail & slot->mask], obj)
                                       while (head != commit) -> NG2
WRITE_ONCE(slot->commit, next);
                                       while (head != commit) -> OK
                                       try_cmpxchg_acquire(&slot->head, &head, next);
                                       return slot->entries[head & slot->mask]

So at the NG1 and NG2 timings, the pop will fail.

This does not handle the nested "push" case because the reserve-commit block
runs with interrupts disabled. This doesn't support NMI, but that is OK.
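
In C, the pop-side commit check sketched in the diagram might look like
this (a sketch of my reading; the helper name is an assumption):

---
/* Sketch only: pop must not cross a tail that has not been committed. */
static inline void *slot_pop_committed(struct objpool_slot *slot)
{
	uint32_t head, commit;

	do {
		head = READ_ONCE(slot->head);
		commit = smp_load_acquire(&slot->commit);
		if (head == commit)	/* NG1/NG2: reserved but not committed */
			return NULL;
	} while (!try_cmpxchg_acquire(&slot->head, &head, head + 1));

	return slot->entries[head & slot->mask];
}
---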


If 2 pushes are performed in a row, slot->commit will be 'tail + 2', so CPU2
won't meet the condition "head == commit".

Since slot->commit is always synced to slot->tail after a successful push,
should pop check "tail != commit" instead? In that case, when a push is
ongoing on the slot, pop() has to wait for its completion even if there are
objects in the same slot.


If we need to support such NMI or remove irq-disable, we can do following
(please remember the push operation only happens on the slot owned by that
  CPU. So this is per-cpu process)

---
do {
	commit = tail = slot->tail;
	next = tail + 1;
} while (!try_cmpxchg_acquire(&slot->tail, &tail, next));

WRITE_ONCE(slot->entries[tail & slot->mask], obj);

// At this point, "commit == slot->tail - 1" in this nest level.
// Only in the outermost (non-nested) case, "commit == slot->commit".
if (commit != slot->commit)
	return; // do nothing. it must be updated by the outermost one.

// catch up commit to tail.
do {
	commit = READ_ONCE(slot->tail);
	WRITE_ONCE(slot->commit, commit);
	// note that someone can interrupt this loop too.
} while (commit != READ_ONCE(slot->tail));
---


Yes, I got you: push can only happen on the local node, and the first try
has the right to extend 'commit'. It does resolve nested push attempts, and
preemption must be disabled.

The overflow/ABA issue can still happen if the cpu is interrupted for a
long time.
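
For the record, the wraparound window in question is roughly this
(illustrative numbers, not from the posted code):

---
/* head and tail are u32 sequences. If a reader loads head == H and then
 * stalls while other CPUs complete about 2^32 push/pop operations, the
 * sequence wraps back to H. The reader's try_cmpxchg on slot->head then
 * succeeds even though entries[H & mask] now holds a different object:
 * a classic ABA on the reused cell. */
---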


For the rethook this may be too much.

Thank you,




Case 2:

NODE 1:                                  NODE 2
push to slot w/ 1 obj                    pop will get wrong value

                                         try_cmpxchg_release(&slot->head, &head, head + 1)
try_cmpxchg_acquire(&slot->tail, &tail, next)
WRITE_ONCE(slot->entries[tail & slot->mask], obj)
                                         return slot->entries[head & slot->mask]


The pre-condition should be: CPU 1 tries to push to a full slot; in this case
tail = head + capacity, but tail & mask == head & mask.
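
A concrete instance of that precondition (illustrative numbers only):

---
/* capacity = 4, mask = 3. Full slot: head = 2, tail = head + capacity = 6.
 * 6 & 3 == 2 == head & mask, so the push at tail and the pop at head
 * address the same entries[] cell and can race as shown above. */
---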



Regards,
wuqiang

On 2023/9/25 17:42, Masami Hiramatsu (Google) wrote:

Hi Wuqiang,

On Tue,  5 Sep 2023 09:52:51 +0800
"wuqiang.matt"  wrote:


The object pool is a scalable implementation of a high performance queue
for object allocation and reclamation, such as kretprobe instances.

With leveraging percpu ring-array to mitigate the hot spot of memory
contention, it could deliver near-linear scalability for high parallel
scenarios. The objpool is best suited for following cases:
1) Memory allocation or reclamation are prohibited or too expensive
2) Consumers are of different priorities, such as irqs and threads

Limitations:
1) Maximum objects (capacity) is determined during pool initialization
 and can't be modified (extended) after objpool creation
2) The memory of objects won't be freed until objpool is finalized
3) Object allocation (pop) may fail after trying all cpu slots


I made a simplifying patch on this by (mainly) removing ages array.
I also rename local variable to use more readable names, like slot,
pool, and obj.

Here the results which I run the test_objpool.ko.

Original:
[   50.500235] Summary of testcases:
[   50.503139] duration: 1027135us  hits:   30628293  miss:        0  sync: percpu objpool
[   

Re: [PATCH v3] module: Add CONFIG_MODULE_DISABLE_INIT_FREE option

2023-10-12 Thread Luis Chamberlain
On Thu, Oct 12, 2023 at 09:50:27AM -0700, Luis Chamberlain wrote:
> > In the kernel, selecting the CONFIG_MODULE_DISABLE_INIT_FREE
> > option disables the asynchronous freeing of init sections.
> 
> No it does not.

I take it back, your patch effectively only does this.

  Luis



Re: [PATCH v4] module: Add CONFIG_MODULE_DISABLE_INIT_FREE option

2023-10-12 Thread Luis Chamberlain
On Thu, Oct 12, 2023 at 07:17:19AM +0530, Joey Jiao wrote:
>  
> +config MODULE_DISABLE_INIT_FREE
> + bool "Disable freeing of init sections"
> + default n
> + help
> +   By default, kernel will free init sections after module being fully
> +   loaded.
> +
> +   MODULE_DISABLE_INIT_FREE allows users to prevent the freeing of init
> +   sections. This option is particularly helpful for syzkaller fuzzing,
> +   ensuring that the module consistently loads into the same address
> +   across reboots.

How and why does not free'ing init help with syzkaller exactly? I don't
see the relationship between not free'ing init and ensuring the module
loads into the same address. There could be many things that could cause
an address to be allocated at another location, which a module can then
take. I cannot fathom how this simple toggle could ensure modules follow
the same address allocations across reboots. That seems like odd chance,
not something actually deterministic.

> +
>  endif # MODULES
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index 98fedfdb8db5..0f242b7b29fe 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -2593,7 +2593,8 @@ static noinline int do_init_module(struct module *mod)
>* be cleaned up needs to sync with the queued work - ie
>* rcu_barrier()
>*/
> -	if (llist_add(&freeinit->node, &init_free_list))
> +	if (llist_add(&freeinit->node, &init_free_list) &&
> +	    !IS_ENABLED(CONFIG_MODULE_DISABLE_INIT_FREE))
> 		schedule_work(&init_free_wq);





Re: [PATCH v3] module: Add CONFIG_MODULE_DISABLE_INIT_FREE option

2023-10-12 Thread Luis Chamberlain
On Thu, Oct 12, 2023 at 07:10:11AM +0530, Joey Jiao wrote:
> To facilitate syzkaller test, it's essential for the module to retain the same
> address across reboots.

Why?

> In userspace, the execution of modprobe commands must
> occur sequentially.

Why?

> In the kernel, selecting the CONFIG_MODULE_DISABLE_INIT_FREE
> option disables the asynchronous freeing of init sections.

No it does not.

> Signed-off-by: Joey Jiao 
> ---
>  kernel/module/Kconfig | 8 
>  kernel/module/main.c  | 5 +++--
>  2 files changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
> index 33a2e991f608..1cdbee4c51de 100644
> --- a/kernel/module/Kconfig
> +++ b/kernel/module/Kconfig
> @@ -389,4 +389,12 @@ config MODULES_TREE_LOOKUP
>   def_bool y
>   depends on PERF_EVENTS || TRACING || CFI_CLANG
>  
> +config MODULE_DISABLE_INIT_FREE
> + bool "Disable freeing of init sections"
> + default n
> + help
> +   Allows users to prevent the freeing of init sections. This option is
> +   particularly helpful for syzkaller fuzzing, ensuring that the module
> +   consistently loads into the same address across reboots.
> +
>  endif # MODULES
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index 98fedfdb8db5..a5210b90c078 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -2593,8 +2593,9 @@ static noinline int do_init_module(struct module *mod)
>* be cleaned up needs to sync with the queued work - ie
>* rcu_barrier()
>*/
> -	if (llist_add(&freeinit->node, &init_free_list))
> -		schedule_work(&init_free_wq);
> +	if (llist_add(&freeinit->node, &init_free_list) &&
> +	    !IS_ENABLED(CONFIG_MODULE_DISABLE_INIT_FREE))
> +		schedule_work(&init_free_wq);

llist_add() returns true if the list was empty prior to adding the
entry, so the functionality you are adding makes no sense with the
commit log in any way shape or form.
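
For reference, the semantics in play here (the kernel's llist API; the
module names below are only illustrative):

---
/* llist_add() returns true only when the list was empty before the add,
 * i.e. once per batch, not once per module load:
 *
 *   llist_add(modA) -> true   -> schedule_work(&init_free_wq)
 *   llist_add(modB) -> false     (worker has not emptied the list yet)
 *   llist_add(modC) -> false
 *
 * The single queued work then frees the init sections of A, B and C, so
 * suppressing only that one schedule_work() call does not stop entries
 * from being queued for freeing. */
---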

  Luis



Re: [PATCH v9 1/5] lib: objpool added: ring-array based lockless MPMC

2023-10-12 Thread wuqiang.matt

Hello Masami,

I've updated the objpool patch and did some functional testing for X64 and
ARM64. Later I'll do the performance testing and more regressions.

Here are the changelogs:
1) new struct objpool_node added to represent the real percpu ring array
   and struct objpool_slot now represents the expansion of objpool_node.
   ages[] and entries[] are now managed by objpool_slot (which is managed
   by objpool_head)
2) ages[] added back to objpool_try_add_slot and objpool_try_get_slot
3) unnecessary OBJPOOL_FLAG definitions are removed
4) unnecessary head/tail loading removed since try_cmpxchg_acquire and
   try_cmpxchg_release have inherent memory loading embedded
5) objpool_fini refined to make sure the extra refcount is released

The new version is attached in this mail for your review. And I will
prepare the full patch after the regression testings.

Best regards,
wuqiang


diff --git a/include/linux/objpool.h b/include/linux/objpool.h
new file mode 100644
index ..f3e066601df2
--- /dev/null
+++ b/include/linux/objpool.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_OBJPOOL_H
+#define _LINUX_OBJPOOL_H
+
+#include <linux/types.h>
+#include <linux/refcount.h>
+
+/*
+ * objpool: ring-array based lockless MPMC queue
+ *
+ * Copyright: wuqiang.m...@bytedance.com,mhira...@kernel.org
+ *
+ * The object pool is a scalable implementation of a high performance queue
+ * for object allocation and reclamation, such as kretprobe instances.
+ *
+ * By leveraging per-cpu ring-arrays to mitigate the hot spots of memory
+ * contention, it can deliver near-linear scalability for highly parallel
+ * scenarios. The objpool is best suited for the following cases:
+ * 1) Memory allocation or reclamation are prohibited or too expensive
+ * 2) Consumers are of different priorities, such as irqs and threads
+ *
+ * Limitations:
+ * 1) Maximum objects (capacity) is determined during pool initializing
+ * 2) The memory of objects won't be freed until the pool is finalized
+ * 3) Object allocation (pop) may fail after trying all cpu slots
+ */
+
+/**
+ * struct objpool_node - percpu ring array of objpool
+ * @head: head sequence of the local ring array
+ * @tail: tail sequence of the local ring array
+ *
+ * Represents a cpu-local array-based ring buffer; its size is specified
+ * during initialization of the object pool. On NUMA systems the percpu
+ * objpool node is allocated from local memory, and it is kept compact in
+ * contiguous memory: the objects assigned to a CPU are stored just after
+ * the body of objpool_node.
+ *
+ * The real size of the ring array is far smaller than the value range of
+ * head and tail, typed as uint32_t: [0, 2^32), so only the lower bits of
+ * head and tail are used as the actual position in the ring array. In
+ * general the ring array acts like a small sliding window, always moving
+ * forward in the loop of [0, 2^32).
+ */
+struct objpool_node {
+	uint32_t	head;
+	uint32_t	tail;
+} __packed;
+
+/**
+ * struct objpool_slot - the expansion of percpu objpool_node
+ * @node: the pointer of percpu objpool_node
+ * @ages: unique sequence number to avoid ABA
+ * @entries: object entries on this slot
+ */
+struct objpool_slot {
+	struct objpool_node	*node;
+	uint32_t		*ages;
+	void			**entries;
+};
+
+struct objpool_head;
+
+/*
+ * caller-specified callback for object initial setup, it's only called
+ * once for each object (just after the memory allocation of the object)
+ */
+typedef int (*objpool_init_obj_cb)(void *obj, void *context);
+
+/* caller-specified cleanup callback for objpool destruction */
+typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
+
+/**
+ * struct objpool_head - object pooling metadata
+ * @obj_size:   object size, aligned to sizeof(void *)
+ * @nr_objs:    total objs (to be pre-allocated with objpool)
+ * @nr_cpus:    local copy of nr_cpu_ids
+ * @capacity:   max objs that can be managed by one objpool_slot
+ * @gfp:        gfp flags for kmalloc & vmalloc
+ * @ref:        refcount of objpool
+ * @flags:      flags for objpool management
+ * @cpu_slots:  pointer to the array of objpool_slot
+ * @release:    resource cleanup callback
+ * @context:    caller-provided context
+ */
+struct objpool_head {
+	int			obj_size;
+	int			nr_objs;
+	int			nr_cpus;
+	int			capacity;
+	gfp_t			gfp;
+	refcount_t		ref;
+	unsigned long		flags;
+	struct objpool_slot	*cpu_slots;
+	objpool_fini_cb		release;
+	void			*context;
+};
+
+#define OBJPOOL_NR_OBJECT_MAX	(1UL << 24) /* maximum number of total objects */
+#define OBJPOOL_OBJECT_SIZE_MAX	(1UL << 16) /* maximum size of an object */
+
+/**
+ * objpool_init() - initialize objpool and pre-allocated objects
+ * @pool:the 

Re: [PATCH] KVM: deprecate KVM_WERROR in favor of general WERROR

2023-10-12 Thread Paolo Bonzini
On Wed, Oct 11, 2023 at 1:46 AM Sean Christopherson  wrote:
> > > The DRM_I915_WERROR config depends on EXPERT and !COMPILE_TEST, and to
> > > my knowledge this has never caused issues outside of i915 developers and
> > > CI.
> >
> > Ack, I think you do it right. I was trying to establish a precedent
> > so that we can delete these as soon as they cause an issue, not sooner.
>
> So isn't the underlying problem simply that KVM_WERROR is enabled by default 
> for
> some configurations?  If that's the case, then my proposal to make KVM_WERROR
> always off by default, and "depends on KVM && EXPERT && !KASAN", would make 
> this
> go away, no?

No objection to adding EXPERT. Adding W=1 when build-testing KVM patches is
a good idea; you will still get the occasional patch from someone who didn't
have it, but that's fine.

I added KVM_WERROR a relatively long time ago after a warning scrolled away
too quickly (a harmless one, but also a kind that honestly shouldn't have
made it to Linus). At the time there were still too many warnings to enable
WERROR globally, and I feel that now we're in the same boat with W=1. I
think we should keep KVM_WERROR (which was indeed based on DRM_I915_WERROR)
and maintainers should just add W=1 when build-testing KVM patches.

Paolo



Re: [PATCH v9 1/5] lib: objpool added: ring-array based lockless MPMC

2023-10-12 Thread Google
Hi Wuqiang,

On Mon, 9 Oct 2023 17:23:34 +0800
wuqiang  wrote:

> Hello Masami,
> 
> Just got time for the new patch and noticed that ages[] was removed. ages[]
> was introduced as a kind of 2-phase commit to keep consistency and must be
> kept.
> 
> Consider the following 2 cases, where two cpu nodes are operating on the
> same objpool_slot simultaneously:
> 
> Case 1:
> 
>    NODE 1:                                  NODE 2:
>    push to an empty slot                    pop will get wrong value
> 
>    try_cmpxchg_acquire(&slot->tail, &tail, next)
>                                             try_cmpxchg_release(&slot->head, &head, head + 1)
>                                             return slot->entries[head & slot->mask]
>    WRITE_ONCE(slot->entries[tail & slot->mask], obj)

Today, I rethink the algorithm.

For this case, we can just add a `commit` field to the slot for committing
the tail update instead of the ages[].

CPU1                                     CPU2
push to an empty slot                    pop from the same slot

commit = tail = slot->tail;
next = tail + 1;
try_cmpxchg_acquire(&slot->tail,
                    &tail, next);
                                         while (head != commit) -> NG1
WRITE_ONCE(slot->entries[tail & slot->mask],
           obj)
                                         while (head != commit) -> NG2
WRITE_ONCE(slot->commit, next);
                                         while (head != commit) -> OK
                                         try_cmpxchg_acquire(&slot->head,
                                                             &head, next);
                                         return slot->entries[head & slot->mask]

So at the NG1 and NG2 timings, the pop will fail.

This does not handle the nested "push" case because the reserve-commit block
runs with interrupts disabled. This doesn't support NMI, but that is OK.

If we need to support such NMI or remove irq-disable, we can do following
(please remember the push operation only happens on the slot owned by that
 CPU. So this is per-cpu process)

---
do {
	commit = tail = slot->tail;
	next = tail + 1;
} while (!try_cmpxchg_acquire(&slot->tail, &tail, next));

WRITE_ONCE(slot->entries[tail & slot->mask], obj);

// At this point, "commit == slot->tail - 1" in this nest level.
// Only in the outermost (non-nested) case, "commit == slot->commit".
if (commit != slot->commit)
	return; // do nothing. it must be updated by the outermost one.

// catch up commit to tail.
do {
	commit = READ_ONCE(slot->tail);
	WRITE_ONCE(slot->commit, commit);
	// note that someone can interrupt this loop too.
} while (commit != READ_ONCE(slot->tail));
---

For the rethook this may be too much.

Thank you,

> 
> 
> Case 2:
> 
>    NODE 1:                                  NODE 2
>    push to slot w/ 1 obj                    pop will get wrong value
> 
>                                             try_cmpxchg_release(&slot->head, &head, head + 1)
>    try_cmpxchg_acquire(&slot->tail, &tail, next)
>    WRITE_ONCE(slot->entries[tail & slot->mask], obj)
>                                             return slot->entries[head & slot->mask]
> 
> 
> Regards,
> wuqiang
> 
> On 2023/9/25 17:42, Masami Hiramatsu (Google) wrote:
> > Hi Wuqiang,
> > 
> > On Tue,  5 Sep 2023 09:52:51 +0800
> > "wuqiang.matt"  wrote:
> > 
> >> The object pool is a scalable implementation of a high performance queue
> >> for object allocation and reclamation, such as kretprobe instances.
> >>
> >> With leveraging percpu ring-array to mitigate the hot spot of memory
> >> contention, it could deliver near-linear scalability for high parallel
> >> scenarios. The objpool is best suited for following cases:
> >> 1) Memory allocation or reclamation are prohibited or too expensive
> >> 2) Consumers are of different priorities, such as irqs and threads
> >>
> >> Limitations:
> >> 1) Maximum objects (capacity) is determined during pool initialization
> >> and can't be modified (extended) after objpool creation
> >> 2) The memory of objects won't be freed until objpool is finalized
> >> 3) Object allocation (pop) may fail after trying all cpu slots
> > 
> > I made a simplifying patch on this by (mainly) removing ages array.
> > I also rename local variable to use more readable names, like slot,
> > pool, and obj.
> > 
> > Here the results which I run the test_objpool.ko.
> > 
> > Original:
> > [   50.500235] Summary of testcases:
> > [   50.503139] duration: 1027135us  hits:   30628293  miss:        0  sync: percpu objpool
> > [   50.510416] duration: 1047977us  hits:   30453023  miss:        0  sync: percpu objpool from vmalloc
> > [   50.517421] duration: 1047098us  hits:   31145034  miss:        0  sync & hrtimer: percpu objpool
> > [   50.524671] duration: 1053233us  hits:   30919640  miss:        0  sync & hrtimer: percpu objpool from vmalloc
> > [   50.531382] duration: 1055822us  hits:    3407221  miss:   830219  sync overrun: percpu objpool
> > [   50.538135] duration: 1055998us  hits:    3404624  miss:   854160  sync 

Re: [RFC PATCH bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

2023-10-12 Thread Steven Rostedt
On Thu, 12 Oct 2023 13:45:50 +0200
Artem Savkov  wrote:

> linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
> for lazy preemption")) that adds an extra member to struct trace_entry.
> This causes the offset of args field in struct trace_event_raw_sys_enter
> be different from the one in struct syscall_trace_enter:
> 
> struct trace_event_raw_sys_enter {
> struct trace_entry ent;  /* 012 */
> 
> /* XXX last struct has 3 bytes of padding */
> /* XXX 4 bytes hole, try to pack */
> 
> long int   id;   /*16 8 */
> long unsigned int  args[6];  /*2448 */
> /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
> char   __data[]; /*72 0 */
> 
> /* size: 72, cachelines: 2, members: 4 */
> /* sum members: 68, holes: 1, sum holes: 4 */
> /* paddings: 1, sum paddings: 3 */
> /* last cacheline: 8 bytes */
> };
> 
> struct syscall_trace_enter {
> struct trace_entry ent;  /* 012 */
> 
> /* XXX last struct has 3 bytes of padding */
> 
> intnr;   /*12 4 */
> long unsigned int  args[];   /*16 0 */
> 
> /* size: 16, cachelines: 1, members: 3 */
> /* paddings: 1, sum paddings: 3 */
> /* last cacheline: 16 bytes */
> };
> 
> This, in turn, causes perf_event_set_bpf_prog() to fail while running the
> bpf test_profiler testcase, because max_ctx_offset is calculated based on
> the former struct, while off is based on the latter:
> 
>   10488 if (is_tracepoint || is_syscall_tp) {
>   10489 int off = trace_event_get_offsets(event->tp_event);
>   10490
>   10491 if (prog->aux->max_ctx_offset > off)
>   10492 return -EACCES;
>   10493 }
> 
> What the bpf program actually gets is a pointer to struct syscall_tp_t,
> defined in kernel/trace/trace_syscalls.c. This patch fixes the problem by
> aligning struct syscall_tp_t with struct syscall_trace_(enter|exit) and
> changing the tests to use these structs to dereference the context.
> 
> Signed-off-by: Artem Savkov 

Thanks for doing a proper fix.

Acked-by: Steven Rostedt (Google) 

-- Steve



Re: [PATCH v5 12/18] x86/sgx: Add EPC OOM path to forcefully reclaim EPC

2023-10-12 Thread Haitao Huang

On Tue, 10 Oct 2023 19:51:17 -0500, Huang, Kai wrote:
[...]

(btw, even if you track VA/SECS pages in the unreclaimable list, given they
both have 'enclave' as the owner, do you still need SGX_EPC_OWNER_ENCL and
SGX_EPC_OWNER_PAGE?)


Let me think about it; there might also be a way to just track encl objects,
not unreclaimable pages.

I still don't get why we need to kill the VM rather than just removing
enough pages. Is it because the static allocation is not able to be
reclaimed?

If we always remove all vEPC pages/kill the VM, then we should not need to
track individual vEPC pages.


Thanks

Haitao


[RFC PATCH bpf-next] bpf: change syscall_nr type to int in struct syscall_tp_t

2023-10-12 Thread Artem Savkov
linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
for lazy preemption")) that adds an extra member to struct trace_entry.
This causes the offset of args field in struct trace_event_raw_sys_enter
be different from the one in struct syscall_trace_enter:

struct trace_event_raw_sys_enter {
struct trace_entry ent;  /* 012 */

/* XXX last struct has 3 bytes of padding */
/* XXX 4 bytes hole, try to pack */

long int   id;   /*16 8 */
long unsigned int  args[6];  /*2448 */
/* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
char   __data[]; /*72 0 */

/* size: 72, cachelines: 2, members: 4 */
/* sum members: 68, holes: 1, sum holes: 4 */
/* paddings: 1, sum paddings: 3 */
/* last cacheline: 8 bytes */
};

struct syscall_trace_enter {
struct trace_entry ent;  /* 012 */

/* XXX last struct has 3 bytes of padding */

intnr;   /*12 4 */
long unsigned int  args[];   /*16 0 */

/* size: 16, cachelines: 1, members: 3 */
/* paddings: 1, sum paddings: 3 */
/* last cacheline: 16 bytes */
};

This, in turn, causes perf_event_set_bpf_prog() to fail while running the
bpf test_profiler testcase, because max_ctx_offset is calculated based on
the former struct, while off is based on the latter:

  10488 if (is_tracepoint || is_syscall_tp) {
  10489 int off = trace_event_get_offsets(event->tp_event);
  10490
  10491 if (prog->aux->max_ctx_offset > off)
  10492 return -EACCES;
  10493 }

What the bpf program actually gets is a pointer to struct syscall_tp_t,
defined in kernel/trace/trace_syscalls.c. This patch fixes the problem by
aligning struct syscall_tp_t with struct syscall_trace_(enter|exit) and
changing the tests to use these structs to dereference the context.

Signed-off-by: Artem Savkov 
---
 kernel/trace/trace_syscalls.c| 4 ++--
 tools/testing/selftests/bpf/progs/profiler.inc.h | 2 +-
 tools/testing/selftests/bpf/progs/test_vmlinux.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index de753403cdafb..9c581d6da843a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -556,7 +556,7 @@ static int perf_call_bpf_enter(struct trace_event_call 
*call, struct pt_regs *re
 {
struct syscall_tp_t {
struct trace_entry ent;
-   unsigned long syscall_nr;
+   int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
int i;
@@ -661,7 +661,7 @@ static int perf_call_bpf_exit(struct trace_event_call 
*call, struct pt_regs *reg
 {
struct syscall_tp_t {
struct trace_entry ent;
-   unsigned long syscall_nr;
+   int syscall_nr;
unsigned long ret;
} __aligned(8) param;
 
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h 
b/tools/testing/selftests/bpf/progs/profiler.inc.h
index f799d87e87002..897061930cb76 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -609,7 +609,7 @@ ssize_t BPF_KPROBE(kprobe__proc_sys_write,
 }
 
 SEC("tracepoint/syscalls/sys_enter_kill")
-int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx)
+int tracepoint__syscalls__sys_enter_kill(struct syscall_trace_enter* ctx)
 {
struct bpf_func_stats_ctx stats_ctx;
 
diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c 
b/tools/testing/selftests/bpf/progs/test_vmlinux.c
index 4b8e37f7fd06c..78b23934d9f8f 100644
--- a/tools/testing/selftests/bpf/progs/test_vmlinux.c
+++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c
@@ -16,12 +16,12 @@ bool kprobe_called = false;
 bool fentry_called = false;
 
 SEC("tp/syscalls/sys_enter_nanosleep")
-int handle__tp(struct trace_event_raw_sys_enter *args)
+int handle__tp(struct syscall_trace_enter *args)
 {
struct __kernel_timespec *ts;
long tv_nsec;
 
-   if (args->id != __NR_nanosleep)
+   if (args->nr != __NR_nanosleep)
return 0;
 
ts = (void *)args->args[0];
-- 
2.41.0




Re: [PATCH] tracing/eprobe: drop unneeded breaks

2023-10-12 Thread Dan Carpenter
On Sat, Sep 30, 2023 at 06:19:02PM +0900, Masami Hiramatsu wrote:
> On Fri, 29 Sep 2023 13:37:08 +0200 (CEST)
> Julia Lawall  wrote:
> 
> > 
> > 
> > On Fri, 29 Sep 2023, Masami Hiramatsu  wrote:
> > 
> > > On Thu, 28 Sep 2023 12:43:34 +0200
> > > Julia Lawall  wrote:
> > >
> > > > Drop break after return.
> > > >
> > >
> > > Good catch! This looks good to me.
> > >
> > > Acked-by: Masami Hiramatsu (Google) 
> > >
> > > And
> > >
> > > Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events")
> > 
> > Thanks.  I didn't include that because it's not a bug.  But it does break
> > Coccinelle, which is how I noticed it.
> 
> OK, I got it. I thought it might cause a compiler warning because the
> 'break' never gets executed. (Maybe it is just a flow-control word,
> so it may not need a warning, but it is a bit strange.)

I don't think GCC warns about unreachable code, but yeah, in Smatch
unreachable break statements do not trigger a warning.  People like
to add extra break statements to switch statements.
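
The pattern the patch drops looks like this (illustrative case and helper
names, not the exact eprobe code):

---
switch (type) {
case FETCH_OP_FOO:
	return process_foo(buf);
	break;	/* unreachable: the return above always exits the function */
}
---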

regards,
dan carpenter




Re: [PATCH v6 0/2] leds: Add a driver for KTD202x

2023-10-12 Thread Lee Jones
On Mon, 02 Oct 2023 18:48:26 +0200, André Apitzsch wrote:
> Add the binding description and the corresponding driver for
> the Kinetic KTD2026 and KTD2027.
> 
> 

Applied, thanks!

[1/2] dt-bindings: leds: Add Kinetic KTD2026/2027 LED
  commit: 25766993d24a623c4ddcbd34a78fdc76026d9b40
[2/2] leds: add ktd202x driver
  commit: 4239b17b5de0dcd5900727be5597ba061acd00b8

--
Lee Jones [李琼斯]



Re: [PATCH v5 1/2] mm/memory_hotplug: split memmap_on_memory requests across memblocks

2023-10-12 Thread David Hildenbrand

On 12.10.23 07:53, Verma, Vishal L wrote:

On Mon, 2023-10-09 at 17:04 +0200, David Hildenbrand wrote:

On 07.10.23 10:55, Huang, Ying wrote:

Vishal Verma  writes:


@@ -2167,47 +2221,28 @@ static int __ref try_remove_memory(u64 start, u64 size)
 	if (rc)
 		return rc;
 
+	mem_hotplug_begin();
+
 	/*
-	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
-	 * the same granularity it was added - a single memory block.
+	 * For memmap_on_memory, the altmaps could have been added on
+	 * a per-memblock basis. Loop through the entire range if so,
+	 * and remove each memblock and its altmap.
 	 */
 	if (mhp_memmap_on_memory()) {


IIUC, even if mhp_memmap_on_memory() returns true, it's still possible
that the memmap is put in DRAM after [2/2]. So arch_remove_memory() is
called for each memory block unnecessarily. Can we detect this (via
altmap?) and call remove_memory_block_and_altmap() for the whole range?


Good point. We should handle memblock-per-memblock only if we have to
handle the altmap. Otherwise, just call a separate function that doesn't
care about altmaps -- e.g., called remove_memory_blocks_no_altmap().

We could simply walk all memory blocks and make sure either all have an
altmap or none has an altmap. If there is a mix, we should bail out with
WARN_ON_ONCE().


Ok I think I follow - based on both of these threads, here's my
understanding in an incremental diff from the original patches (may not
apply directly as I've already committed changes from the other bits of
feedback - but this should provide an idea of the direction) -

---

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 507291e44c0b..30addcb063b4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -2201,6 +2201,40 @@ static void __ref remove_memory_block_and_altmap(u64 start, u64 size)
 	}
 }
 
+static bool memblocks_have_altmaps(u64 start, u64 size)
+{
+	unsigned long memblock_size = memory_block_size_bytes();
+	u64 num_altmaps = 0, num_no_altmaps = 0;
+	struct memory_block *mem;
+	u64 cur_start;
+	int rc = 0;
+
+	if (!mhp_memmap_on_memory())
+		return false;


Probably can remove that, checked by the caller. (or drop the one in the 
caller)



+
+	for (cur_start = start; cur_start < start + size;
+	     cur_start += memblock_size) {
+		if (walk_memory_blocks(cur_start, memblock_size, &mem,
+				       test_has_altmap_cb))
+			num_altmaps++;
+		else
+			num_no_altmaps++;
+	}


You should do that without the outer loop, by doing the counting in the 
callback function instead.	
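
Something along these lines, assuming the altmap tracking this series adds
to struct memory_block (a sketch of the suggested direction, not a posted
patch):

---
struct altmap_count {
	u64 with_altmap;
	u64 without_altmap;
};

static int count_altmaps_cb(struct memory_block *mem, void *arg)
{
	struct altmap_count *c = arg;

	if (mem->altmap)	/* field assumed from this series */
		c->with_altmap++;
	else
		c->without_altmap++;
	return 0;
}

static bool memblocks_have_altmaps(u64 start, u64 size)
{
	struct altmap_count c = {};

	/* one walk over the whole range, no outer loop */
	walk_memory_blocks(start, size, &c, count_altmaps_cb);
	WARN_ON_ONCE(c.with_altmap && c.without_altmap);
	return c.with_altmap && !c.without_altmap;
}
---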



+
+	if (!num_altmaps && num_no_altmaps > 0)
+		return false;
+
+	if (!num_no_altmaps && num_altmaps > 0)
+		return true;
+
+	/*
+	 * If there is a mix of memblocks with and without altmaps,
+	 * something has gone very wrong. WARN and bail.
+	 */
+	WARN_ONCE(1, "memblocks have a mix of missing and present altmaps");


It would be better if we could even make try_remove_memory() fail in 
this case.



+	return false;
+}
+
  static int __ref try_remove_memory(u64 start, u64 size)
  {
int rc, nid = NUMA_NO_NODE;
@@ -2230,7 +2264,7 @@ static int __ref try_remove_memory(u64 start, u64 size)
 * a per-memblock basis. Loop through the entire range if so,
 * and remove each memblock and its altmap.
 */
-   if (mhp_memmap_on_memory()) {
+   if (mhp_memmap_on_memory() && memblocks_have_altmaps(start, size)) {
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
  
@@ -2239,7 +2273,8 @@ static int __ref try_remove_memory(u64 start, u64 size)

 			remove_memory_block_and_altmap(cur_start,
 						       memblock_size);


^ probably cleaner move the loop into remove_memory_block_and_altmap() 
and call it remove_memory_blocks_and_altmaps(start, size) instead.
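
i.e., something like (a sketch of that suggestion, not a posted patch):

---
static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
	unsigned long memblock_size = memory_block_size_bytes();
	u64 cur_start;

	for (cur_start = start; cur_start < start + size;
	     cur_start += memblock_size)
		remove_memory_block_and_altmap(cur_start, memblock_size);
}
---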



 	} else {
-		remove_memory_block_and_altmap(start, size);
+		remove_memory_block_devices(start, size);
+		arch_remove_memory(start, size, NULL);
 	}
  
  	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {




--
Cheers,

David / dhildenb




Re: [PATCH 3/5] parisc: remove broken vdso_install

2023-10-12 Thread Helge Deller

Hi Masahiro,

On 10/9/23 14:42, Masahiro Yamada wrote:

'make ARCH=parisc vdso_install' has never worked. It attempts to
descend into arch/parisc/kernel/vdso/, which does not exist.

The command just fails:

   scripts/Makefile.build:41: arch/parisc/kernel/vdso/Makefile: No such file or directory

The second line is also meaningless because parisc does not define
CONFIG_COMPAT_VDSO.

It appears that this code was copied from another architecture without
proper adaptation.


Yes.


Remove the broken code.

Signed-off-by: Masahiro Yamada 


Thanks for cleaning this up and making it consistent across the architectures.

Acked-by: Helge Deller   # parisc

In case you do a v2 version of the patch, would you add to arch/parisc/Makefile
(otherwise I can send a follow-up patch in the parisc git tree):

vdso-install-y   += arch/parisc/kernel/vdso32/vdso32.so
vdso-install-$(CONFIG_64BIT) += arch/parisc/kernel/vdso64/vdso64.so

Thanks!
Helge


Re: [PATCH 4/5] kbuild: unify vdso_install rules

2023-10-12 Thread Guo Ren
On Wed, Oct 11, 2023 at 8:53 PM Masahiro Yamada  wrote:
>
> On Wed, Oct 11, 2023 at 11:24 AM Guo Ren  wrote:
> >
> > On Mon, Oct 9, 2023 at 8:42 PM Masahiro Yamada  wrote:
>
> > > --- a/arch/riscv/Makefile
> > > +++ b/arch/riscv/Makefile
> > > @@ -131,12 +131,6 @@ endif
> > >  libs-y += arch/riscv/lib/
> > >  libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
> > >
> > > -PHONY += vdso_install
> > > -vdso_install:
> > > -   $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@
> > > -   $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
> > > -   $(build)=arch/riscv/kernel/compat_vdso compat_$@)
> > > -
> > >  ifeq ($(KBUILD_EXTMOD),)
> > >  ifeq ($(CONFIG_MMU),y)
> > >  prepare: vdso_prepare
> > > @@ -148,6 +142,9 @@ vdso_prepare: prepare0
> > >  endif
> > >  endif
> > >
> > > +vdso-install-y += arch/riscv/kernel/vdso/vdso.so.dbg
> > > +vdso-install-$(CONFIG_COMPAT) += arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg:../compat_vdso/compat_vdso.so
> > Why do we need ":../compat_vdso/compat_vdso.so" here?
>
>
>
>
> All architectures except riscv install vdso files
> to /lib/modules/$(uname -r)/vdso/.
>
>
>
> See the following code in arch/riscv/kernel/compat_vdso/Makefile:
>
>
> quiet_cmd_compat_vdso_install = INSTALL $@
>   cmd_compat_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/compat_vdso/$@
>
>
>
>
> Riscv copies the compat vdso to
> /lib/modules/$(uname -r)/compat_vdso/.
>
>
>
> This commit preserves the current installation path as-is.
>
> If the riscv maintainers agree, we can change the
> installation destination to /lib/modules/$(uname -r)/vdso/
> for consistency.
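
As I read the new rule (my reading of the patch, not authoritative), each
vdso-install-y entry is <source>[:<install name relative to $(MODLIB)/vdso>],
so the two riscv entries map to:

  vdso-install-y += arch/riscv/kernel/vdso/vdso.so.dbg
  #   -> /lib/modules/$(uname -r)/vdso/vdso.so
  vdso-install-$(CONFIG_COMPAT) += arch/riscv/kernel/compat_vdso/compat_vdso.so.dbg:../compat_vdso/compat_vdso.so
  #   -> /lib/modules/$(uname -r)/compat_vdso/compat_vdso.so
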
Yes, but it should be another patch. Thx for the clarification.

Reviewed-by: Guo Ren 

>
>
>
> --
> Best Regards
> Masahiro Yamada



-- 
Best Regards
 Guo Ren