[PATCH v3 2/2] scsi: ibmvfc: interface updates for future FPIN and MQ support
VIOS partitions with SLI-4 enabled Emulex adapters will be capable of driving IO in parallel through multiple work queues or channels, and with new hypervisor firmware that supports multiple interrupt sources an ibmvfc NPIV single initiator can be modified to exploit end to end channelization in a PowerVM environment. VIOS hosts will also be able to expose fabric performance impact notifications (FPIN) via a new asynchronous event to ibmvfc clients that advertise support via IBMVFC_CAN_HANDLE_FPIN in their capabilities flag during NPIV_LOGIN. This patch introduces three new Management Datagrams (MADs) for channelization support negotiation as well as the FPIN asynchronous event and FPIN status flags. Follow up work is required to plumb the ibmvfc client driver to use these new interfaces. Signed-off-by: Tyrel Datwyler --- v2 -> v3: Fixup checkpatch warnings about using __attribute__() v1 -> v2: Fixup compiler errors from neglected commit --amend --- drivers/scsi/ibmvscsi/ibmvfc.h | 66 +- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h index 6da23666f5be..e6e1c255a79c 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.h +++ b/drivers/scsi/ibmvscsi/ibmvfc.h @@ -124,6 +124,9 @@ enum ibmvfc_mad_types { IBMVFC_PASSTHRU = 0x0200, IBMVFC_TMF_MAD = 0x0100, IBMVFC_NPIV_LOGOUT = 0x0800, + IBMVFC_CHANNEL_ENQUIRY = 0x1000, + IBMVFC_CHANNEL_SETUP= 0x2000, + IBMVFC_CONNECTION_INFO = 0x4000, }; struct ibmvfc_mad_common { @@ -162,6 +165,8 @@ struct ibmvfc_npiv_login { __be32 max_cmds; __be64 capabilities; #define IBMVFC_CAN_MIGRATE 0x01 +#define IBMVFC_CAN_USE_CHANNELS0x02 +#define IBMVFC_CAN_HANDLE_FPIN 0x04 __be64 node_name; struct srp_direct_buf async; u8 partition_name[IBMVFC_MAX_NAME]; @@ -204,6 +209,7 @@ struct ibmvfc_npiv_login_resp { __be64 capabilities; #define IBMVFC_CAN_FLUSH_ON_HALT 0x08 #define IBMVFC_CAN_SUPPRESS_ABTS 0x10 +#define IBMVFC_CAN_SUPPORT_CHANNELS0x20 __be32 max_cmds; __be32 scsi_id_sz; 
__be64 max_dma_len; @@ -482,6 +488,52 @@ struct ibmvfc_passthru_mad { struct ibmvfc_passthru_fc_iu fc_iu; } __packed __aligned(8); +struct ibmvfc_channel_enquiry { + struct ibmvfc_mad_common common; + __be32 flags; +#define IBMVFC_NO_CHANNELS_TO_CRQ_SUPPORT 0x01 +#define IBMVFC_SUPPORT_VARIABLE_SUBQ_MSG 0x02 +#define IBMVFC_NO_N_TO_M_CHANNELS_SUPPORT 0x04 + __be32 num_scsi_subq_channels; + __be32 num_nvmeof_subq_channels; + __be32 num_scsi_vas_channels; + __be32 num_nvmeof_vas_channels; +} __packed __aligned(8); + +struct ibmvfc_channel_setup_mad { + struct ibmvfc_mad_common common; + struct srp_direct_buf buffer; +} __packed __aligned(8); + +#define IBMVFC_MAX_CHANNELS502 + +struct ibmvfc_channel_setup { + __be32 flags; +#define IBMVFC_CANCEL_CHANNELS 0x01 +#define IBMVFC_USE_BUFFER 0x02 +#define IBMVFC_CHANNELS_CANCELED 0x04 + __be32 reserved; + __be32 num_scsi_subq_channels; + __be32 num_nvmeof_subq_channels; + __be32 num_scsi_vas_channels; + __be32 num_nvmeof_vas_channels; + struct srp_direct_buf buffer; + __be64 reserved2[5]; + __be64 channel_handles[IBMVFC_MAX_CHANNELS]; +} __packed __aligned(8); + +struct ibmvfc_connection_info { + struct ibmvfc_mad_common common; + __be64 information_bits; +#define IBMVFC_NO_FC_IO_CHANNEL0x01 +#define IBMVFC_NO_PHYP_VAS 0x02 +#define IBMVFC_NO_PHYP_SUBQ0x04 +#define IBMVFC_PHYP_DEPRECATED_SUBQ0x08 +#define IBMVFC_PHYP_PRESERVED_SUBQ 0x10 +#define IBMVFC_PHYP_FULL_SUBQ 0x20 + __be64 reserved[16]; +} __packed __aligned(8); + struct ibmvfc_trace_start_entry { u32 xfer_len; } __packed; @@ -532,6 +584,7 @@ enum ibmvfc_async_event { IBMVFC_AE_HALT = 0x0400, IBMVFC_AE_RESUME= 0x0800, IBMVFC_AE_ADAPTER_FAILED= 0x1000, + IBMVFC_AE_FPIN = 0x2000, }; struct ibmvfc_async_desc { @@ -560,10 +613,18 @@ enum ibmvfc_ae_link_state { IBMVFC_AE_LS_LINK_DEAD = 0x08, }; +enum ibmvfc_ae_fpin_status { + IBMVFC_AE_FPIN_LINK_CONGESTED = 0x1, + IBMVFC_AE_FPIN_PORT_CONGESTED = 0x2, + IBMVFC_AE_FPIN_PORT_CLEARED = 0x3, + IBMVFC_AE_FPIN_PORT_DEGRADED= 
0x4, +}; + struct ibmvfc_async_crq { volatile u8 valid; u8 link_state; - u8 pad[2]; + u8 fpin_status; + u8 pad; __be32 pad2; volatile __be64 event; volatile __be64 scsi_id; @@ -590,6 +651,9 @@ union ibmvfc_iu { struct ibmvfc_tmf tmf; struct ibmvfc_cmd cmd;
[PATCH v3 1/2] scsi: ibmvfc: use compiler attribute defines instead of __attribute__()
Update ibmvfc.h structs to use the preferred __packed and __aligned() attribute macros defined in include/linux/compiler_attributes.h in place of __attribute__(). Signed-off-by: Tyrel Datwyler --- drivers/scsi/ibmvscsi/ibmvfc.h | 56 +- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h index 907889f1fa9d..6da23666f5be 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.h +++ b/drivers/scsi/ibmvscsi/ibmvfc.h @@ -133,16 +133,16 @@ struct ibmvfc_mad_common { __be16 status; __be16 length; __be64 tag; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_npiv_login_mad { struct ibmvfc_mad_common common; struct srp_direct_buf buffer; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_npiv_logout_mad { struct ibmvfc_mad_common common; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); #define IBMVFC_MAX_NAME 256 @@ -168,7 +168,7 @@ struct ibmvfc_npiv_login { u8 device_name[IBMVFC_MAX_NAME]; u8 drc_name[IBMVFC_MAX_NAME]; __be64 reserved2[2]; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_common_svc_parms { __be16 fcph_version; @@ -177,7 +177,7 @@ struct ibmvfc_common_svc_parms { __be16 bb_rcv_sz; /* upper nibble is BB_SC_N */ __be32 ratov; __be32 edtov; -}__attribute__((packed, aligned (4))); +} __packed __aligned(4); struct ibmvfc_service_parms { struct ibmvfc_common_svc_parms common; @@ -192,7 +192,7 @@ struct ibmvfc_service_parms { __be32 ext_len; __be32 reserved[30]; __be32 clk_sync_qos[2]; -}__attribute__((packed, aligned (4))); +} __packed __aligned(4); struct ibmvfc_npiv_login_resp { __be32 version; @@ -217,12 +217,12 @@ struct ibmvfc_npiv_login_resp { u8 drc_name[IBMVFC_MAX_NAME]; struct ibmvfc_service_parms service_parms; __be64 reserved2; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); union ibmvfc_npiv_login_data { struct ibmvfc_npiv_login login; struct ibmvfc_npiv_login_resp 
resp; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_discover_targets_buf { __be32 scsi_id[1]; @@ -239,7 +239,7 @@ struct ibmvfc_discover_targets { __be32 num_avail; __be32 num_written; __be64 reserved[2]; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); enum ibmvfc_fc_reason { IBMVFC_INVALID_ELS_CMD_CODE = 0x01, @@ -283,7 +283,7 @@ struct ibmvfc_port_login { struct ibmvfc_service_parms service_parms; struct ibmvfc_service_parms service_parms_change; __be64 reserved3[2]; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_prli_svc_parms { u8 type; @@ -303,7 +303,7 @@ struct ibmvfc_prli_svc_parms { #define IBMVFC_PRLI_TARGET_FUNC0x0010 #define IBMVFC_PRLI_READ_FCP_XFER_RDY_DISABLED 0x0002 #define IBMVFC_PRLI_WR_FCP_XFER_RDY_DISABLED 0x0001 -}__attribute__((packed, aligned (4))); +} __packed __aligned(4); struct ibmvfc_process_login { struct ibmvfc_mad_common common; @@ -314,7 +314,7 @@ struct ibmvfc_process_login { __be16 error; /* also fc_reason */ __be32 reserved2; __be64 reserved3[2]; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_query_tgt { struct ibmvfc_mad_common common; @@ -325,13 +325,13 @@ struct ibmvfc_query_tgt { __be16 fc_explain; __be16 fc_type; __be64 reserved[2]; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_implicit_logout { struct ibmvfc_mad_common common; __be64 old_scsi_id; __be64 reserved[2]; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_tmf { struct ibmvfc_mad_common common; @@ -348,7 +348,7 @@ struct ibmvfc_tmf { __be32 my_cancel_key; __be32 pad; __be64 reserved[2]; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); enum ibmvfc_fcp_rsp_info_codes { RSP_NO_FAILURE = 0x00, @@ -361,7 +361,7 @@ struct ibmvfc_fcp_rsp_info { u8 reserved[3]; u8 rsp_code; u8 reserved2[4]; -}__attribute__((packed, aligned (2))); +} __packed __aligned(2); enum 
ibmvfc_fcp_rsp_flags { FCP_BIDI_RSP= 0x80, @@ -377,7 +377,7 @@ enum ibmvfc_fcp_rsp_flags { union ibmvfc_fcp_rsp_data { struct ibmvfc_fcp_rsp_info info; u8 sense[SCSI_SENSE_BUFFERSIZE + sizeof(struct ibmvfc_fcp_rsp_info)]; -}__attribute__((packed, aligned (8))); +} __packed __aligned(8); struct ibmvfc_fcp_rsp { __be64 reserved; @@ -388,7 +388,7 @@ struct ibmvfc_fcp_rsp { __be32
[PATCH 5/5] powerpc/tau: Disable TAU between measurements
Enabling CONFIG_TAU_INT causes random crashes: Unrecoverable exception 1700 at c0009414 (msr=1000) Oops: Unrecoverable exception, sig: 6 [#1] BE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2 PowerMac Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0-pmac-00043-gd5f545e1a8593 #5 NIP: c0009414 LR: c0009414 CTR: c00116fc REGS: c0799eb8 TRAP: 1700 Not tainted (5.7.0-pmac-00043-gd5f545e1a8593) MSR: 1000 CR: 22000228 XER: 0100 GPR00: c0799f70 c076e300 0080 0291c0ac 00e0 c076e300 00049032 GPR08: 0001 c00116fc dfbd3200 007f80a8 GPR16: c075ce04 GPR24: c075ce04 dfff8880 c07b c075ce04 0008 0001 c079ef98 c079ef5c NIP [c0009414] arch_cpu_idle+0x24/0x6c LR [c0009414] arch_cpu_idle+0x24/0x6c Call Trace: [c0799f70] [0001] 0x1 (unreliable) [c0799f80] [c0060990] do_idle+0xd8/0x17c [c0799fa0] [c0060ba4] cpu_startup_entry+0x20/0x28 [c0799fb0] [c072d220] start_kernel+0x434/0x44c [c0799ff0] [3860] 0x3860 Instruction dump: 3d20c07b 7c0802a6 4e800421 7d2000a6 ---[ end trace 3a0c9b5cb216db6b ]--- Resolve this problem by disabling each THRMn comparator when handling the associated THRMn interrupt and by disabling the TAU entirely when updating THRMn thresholds. 
Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Tested-by: Stan Johnson Signed-off-by: Finn Thain --- arch/powerpc/kernel/tau_6xx.c | 65 +- arch/powerpc/platforms/Kconfig | 9 ++--- 2 files changed, 26 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index 614b5b272d9c6..0b4694b8d2482 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -42,8 +42,6 @@ static struct tau_temp static bool tau_int_enable; -#undef DEBUG - /* TODO: put these in a /proc interface, with some sanity checks, and maybe * dynamic adjustment to minimize # of interrupts */ /* configurable values for step size and how much to expand the window when @@ -67,42 +65,33 @@ static void set_thresholds(unsigned long cpu) static void TAUupdate(int cpu) { - unsigned thrm; - -#ifdef DEBUG - printk("TAUupdate "); -#endif + u32 thrm; + u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V; /* if both thresholds are crossed, the step_sizes cancel out * and the window winds up getting expanded twice. */ - if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */ - if(thrm & THRM1_TIN){ /* crossed low threshold */ - if (tau[cpu].low >= step_size){ - tau[cpu].low -= step_size; - tau[cpu].high -= (step_size - window_expand); - } - tau[cpu].grew = 1; -#ifdef DEBUG - printk("low threshold crossed "); -#endif + thrm = mfspr(SPRN_THRM1); + if ((thrm & bits) == bits) { + mtspr(SPRN_THRM1, 0); + + if (tau[cpu].low >= step_size) { + tau[cpu].low -= step_size; + tau[cpu].high -= (step_size - window_expand); } + tau[cpu].grew = 1; + pr_debug("%s: low threshold crossed\n", __func__); } - if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? 
*/ - if(thrm & THRM1_TIN){ /* crossed high threshold */ - if (tau[cpu].high <= 127-step_size){ - tau[cpu].low += (step_size - window_expand); - tau[cpu].high += step_size; - } - tau[cpu].grew = 1; -#ifdef DEBUG - printk("high threshold crossed "); -#endif + thrm = mfspr(SPRN_THRM2); + if ((thrm & bits) == bits) { + mtspr(SPRN_THRM2, 0); + + if (tau[cpu].high <= 127 - step_size) { + tau[cpu].low += (step_size - window_expand); + tau[cpu].high += step_size; } + tau[cpu].grew = 1; + pr_debug("%s: high threshold crossed\n", __func__); } - -#ifdef DEBUG - printk("grew = %d\n", tau[cpu].grew); -#endif } #ifdef CONFIG_TAU_INT @@ -127,17 +116,17 @@ void TAUException(struct pt_regs * regs) static void tau_timeout(void * info) { int cpu; - unsigned long flags; int size; int shrink; - /* disabling interrupts *should* be okay */ - local_irq_save(flags); cpu = smp_processor_id(); if (!tau_int_enable) TAUupdate(cpu); + /* Stop thermal sensor comparisons and interrupts */ + mtspr(SPRN_THRM3, 0); + size = tau[cpu].high - tau[cpu].low; if (size > min_window && !
[PATCH 4/5] powerpc/tau: Check processor type before enabling TAU interrupt
According to Freescale's documentation, MPC74XX processors have an erratum that prevents the TAU interrupt from working, so don't try to use it when running on those processors. Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Tested-by: Stan Johnson Signed-off-by: Finn Thain --- arch/powerpc/kernel/tau_6xx.c | 33 ++--- arch/powerpc/platforms/Kconfig | 5 ++--- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index b8d7e7d498e0a..614b5b272d9c6 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -40,6 +40,8 @@ static struct tau_temp unsigned char grew; } tau[NR_CPUS]; +static bool tau_int_enable; + #undef DEBUG /* TODO: put these in a /proc interface, with some sanity checks, and maybe @@ -54,22 +56,13 @@ static struct tau_temp static void set_thresholds(unsigned long cpu) { -#ifdef CONFIG_TAU_INT - /* -* setup THRM1, -* threshold, valid bit, enable interrupts, interrupt when below threshold -*/ - mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID); + u32 maybe_tie = tau_int_enable ? 
THRM1_TIE : 0; - /* setup THRM2, -* threshold, valid bit, enable interrupts, interrupt when above threshold -*/ - mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE); -#else - /* same thing but don't enable interrupts */ - mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID); - mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V); -#endif + /* setup THRM1, threshold, valid bit, interrupt when below threshold */ + mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID); + + /* setup THRM2, threshold, valid bit, interrupt when above threshold */ + mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie); } static void TAUupdate(int cpu) @@ -142,9 +135,8 @@ static void tau_timeout(void * info) local_irq_save(flags); cpu = smp_processor_id(); -#ifndef CONFIG_TAU_INT - TAUupdate(cpu); -#endif + if (!tau_int_enable) + TAUupdate(cpu); size = tau[cpu].high - tau[cpu].low; if (size > min_window && ! tau[cpu].grew) { @@ -225,6 +217,9 @@ static int __init TAU_init(void) return 1; } + tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) && +!strcmp(cur_cpu_spec->platform, "ppc750"); + tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1, 0); if (!tau_workq) return -ENOMEM; @@ -234,7 +229,7 @@ static int __init TAU_init(void) queue_work(tau_workq, _work); pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n", - IS_ENABLED(CONFIG_TAU_INT) ? "interrupts" : "workqueue", shrink_timer); + tau_int_enable ? "interrupts" : "workqueue", shrink_timer); tau_initialized = 1; return 0; diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index fb7515b4fa9c6..9fe36f0b54c1a 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -223,9 +223,8 @@ config TAU temperature within 2-4 degrees Celsius. This option shows the current on-die temperature in /proc/cpuinfo if the cpu supports it. 
- Unfortunately, on some chip revisions, this sensor is very inaccurate - and in many cases, does not work at all, so don't assume the cpu - temp is actually what /proc/cpuinfo says it is. + Unfortunately, this sensor is very inaccurate when uncalibrated, so + don't assume the cpu temp is actually what /proc/cpuinfo says it is. config TAU_INT bool "Interrupt driven TAU driver (DANGEROUS)" -- 2.26.2
[PATCH 2/5] powerpc/tau: Convert from timer to workqueue
Since commit 19dbdcb8039cf ("smp: Warn on function calls from softirq context") the Thermal Assist Unit driver causes a warning like the following when CONFIG_SMP is enabled. [ cut here ] WARNING: CPU: 0 PID: 0 at kernel/smp.c:428 smp_call_function_many_cond+0xf4/0x38c Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0-pmac #3 NIP: c00b37a8 LR: c00b3abc CTR: c001218c REGS: c0799c60 TRAP: 0700 Not tainted (5.7.0-pmac) MSR: 00029032 CR: 42000224 XER: GPR00: c00b3abc c0799d18 c076e300 c079ef5c c0011fec GPR08: 0100 0100 8000 42000224 c079d040 c079d044 GPR16: 0001 0004 c0799da0 c079f054 c07a c07a GPR24: c0011fec c079ef5c c079ef5c NIP [c00b37a8] smp_call_function_many_cond+0xf4/0x38c LR [c00b3abc] on_each_cpu+0x38/0x68 Call Trace: [c0799d18] [] 0x (unreliable) [c0799d68] [c00b3abc] on_each_cpu+0x38/0x68 [c0799d88] [c0096704] call_timer_fn.isra.26+0x20/0x7c [c0799d98] [c0096b40] run_timer_softirq+0x1d4/0x3fc [c0799df8] [c05b4368] __do_softirq+0x118/0x240 [c0799e58] [c0039c44] irq_exit+0xc4/0xcc [c0799e68] [c000ade8] timer_interrupt+0x1b0/0x230 [c0799ea8] [c0013520] ret_from_except+0x0/0x14 --- interrupt: 901 at arch_cpu_idle+0x24/0x6c LR = arch_cpu_idle+0x24/0x6c [c0799f70] [0001] 0x1 (unreliable) [c0799f80] [c0060990] do_idle+0xd8/0x17c [c0799fa0] [c0060ba8] cpu_startup_entry+0x24/0x28 [c0799fb0] [c072d220] start_kernel+0x434/0x44c [c0799ff0] [3860] 0x3860 Instruction dump: 8129f204 2f89 40beff98 3d20c07a 8929eec4 2f89 40beff88 0fe0 8122 552805de 550802ef 4182ff84 <0fe0> 3860 7f65db78 7f44d378 ---[ end trace 34a886e47819c2eb ]--- Don't call on_each_cpu() from a timer callback, call it from a worker thread instead. 
Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Tested-by: Stan Johnson Signed-off-by: Finn Thain --- arch/powerpc/kernel/tau_6xx.c | 38 +-- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index 976d5bc1b5176..268205cc347da 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -13,13 +13,14 @@ */ #include -#include #include #include #include #include #include #include +#include +#include #include #include @@ -39,8 +40,6 @@ static struct tau_temp unsigned char grew; } tau[NR_CPUS]; -struct timer_list tau_timer; - #undef DEBUG /* TODO: put these in a /proc interface, with some sanity checks, and maybe @@ -50,7 +49,7 @@ struct timer_list tau_timer; #define step_size 2 /* step size when temp goes out of range */ #define window_expand 1 /* expand the window by this much */ /* configurable values for shrinking the window */ -#define shrink_timer 2*HZ/* period between shrinking the window */ +#define shrink_timer 2000/* period between shrinking the window */ #define min_window 2 /* minimum window size, degrees C */ static void set_thresholds(unsigned long cpu) @@ -187,14 +186,18 @@ static void tau_timeout(void * info) local_irq_restore(flags); } -static void tau_timeout_smp(struct timer_list *unused) -{ +static struct workqueue_struct *tau_workq; - /* schedule ourselves to be run again */ - mod_timer(_timer, jiffies + shrink_timer) ; +static void tau_work_func(struct work_struct *work) +{ + msleep(shrink_timer); on_each_cpu(tau_timeout, NULL, 0); + /* schedule ourselves to be run again */ + queue_work(tau_workq, work); } +DECLARE_WORK(tau_work, tau_work_func); + /* * setup the TAU * @@ -227,21 +230,16 @@ static int __init TAU_init(void) return 1; } - - /* first, set up the window shrinking timer */ - timer_setup(_timer, tau_timeout_smp, 0); - tau_timer.expires = jiffies + shrink_timer; - add_timer(_timer); + tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1, 0); + if 
(!tau_workq) + return -ENOMEM; on_each_cpu(TAU_init_smp, NULL, 0); - printk("Thermal assist unit "); -#ifdef CONFIG_TAU_INT - printk("using interrupts, "); -#else - printk("using timers, "); -#endif - printk("shrink_timer: %d jiffies\n", shrink_timer); + queue_work(tau_workq, _work); + + pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n", + IS_ENABLED(CONFIG_TAU_INT) ? "interrupts" : "workqueue", shrink_timer); tau_initialized = 1; return 0; -- 2.26.2
[PATCH 3/5] powerpc/tau: Remove duplicated set_thresholds() call
The commentary at the call site seems to disagree with the code. The conditional prevents calling set_thresholds() via the exception handler, which appears to crash. Perhaps that's because it immediately triggers another TAU exception. Anyway, calling set_thresholds() from TAUupdate() is redundant because tau_timeout() does so. Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Tested-by: Stan Johnson Signed-off-by: Finn Thain --- arch/powerpc/kernel/tau_6xx.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index 268205cc347da..b8d7e7d498e0a 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -110,11 +110,6 @@ static void TAUupdate(int cpu) #ifdef DEBUG printk("grew = %d\n", tau[cpu].grew); #endif - -#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */ - set_thresholds(cpu); -#endif - } #ifdef CONFIG_TAU_INT -- 2.26.2
[PATCH 1/5] powerpc/tau: Use appropriate temperature sample interval
According to the MPC750 Users Manual, the SITV value in Thermal Management Register 3 is 13 bits long. The present code calculates the SITV value as 60 * 500 cycles. This would overflow to give 10 us on a 500 MHz CPU rather than the intended 60 us. (But according to the Microprocessor Datasheet, there is also a factor of 266 that has to be applied to this value on certain parts i.e. speed sort above 266 MHz.) Always use the maximum cycle count, as recommended by the Datasheet. Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Tested-by: Stan Johnson Signed-off-by: Finn Thain --- arch/powerpc/include/asm/reg.h | 2 +- arch/powerpc/kernel/tau_6xx.c | 12 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 88e6c78100d9b..c750afc62887c 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -815,7 +815,7 @@ #define THRM1_TIN (1 << 31) #define THRM1_TIV (1 << 30) #define THRM1_THRES(x) ((x&0x7f)<<23) -#define THRM3_SITV(x) ((x&0x3fff)<<1) +#define THRM3_SITV(x) ((x & 0x1fff) << 1) #define THRM1_TID (1<<2) #define THRM1_TIE (1<<1) #define THRM1_V(1<<0) diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index e2ab8a111b693..976d5bc1b5176 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -178,15 +178,11 @@ static void tau_timeout(void * info) * complex sleep code needs to be added. One mtspr every time * tau_timeout is called is probably not a big deal. * -* Enable thermal sensor and set up sample interval timer -* need 20 us to do the compare.. until a nice 'cpu_speed' function -* call is implemented, just assume a 500 mhz clock. It doesn't really -* matter if we take too long for a compare since it's all interrupt -* driven anyway. -* -* use a extra long time.. (60 us @ 500 mhz) +* The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet" +* recommends that "the maximum value be set in THRM3 under all +* conditions." 
*/ - mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E); + mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E); local_irq_restore(flags); } -- 2.26.2
[PATCH 0/5] powerpc/tau: TAU driver fixes
This patch series fixes various bugs in the Thermal Assist Unit driver. It was tested on 266 MHz and 292 MHz PowerBook G3 laptops. Finn Thain (5): powerpc/tau: Use appropriate temperature sample interval powerpc/tau: Convert from timer to workqueue powerpc/tau: Remove duplicated set_thresholds() call powerpc/tau: Check processor type before enabling TAU interrupt powerpc/tau: Disable TAU between measurements arch/powerpc/include/asm/reg.h | 2 +- arch/powerpc/kernel/tau_6xx.c | 147 + arch/powerpc/platforms/Kconfig | 14 +--- 3 files changed, 62 insertions(+), 101 deletions(-) -- 2.26.2
[Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8
https://bugzilla.kernel.org/show_bug.cgi?id=208181 Erhard F. (erhar...@mailbox.org) changed: What|Removed |Added Status|NEW |RESOLVED Resolution|--- |OBSOLETE --- Comment #19 from Erhard F. (erhar...@mailbox.org) --- I noticed that I covered the "do_IRQ: stack overflow: " problem already in bug #207129 so closing this one as suggested before. -- You are receiving this mail because: You are watching the assignee of the bug.
[Bug 205099] KASAN hit at raid6_pq: BUG: Unable to handle kernel data access at 0x00f0fd0d
https://bugzilla.kernel.org/show_bug.cgi?id=205099 Erhard F. (erhar...@mailbox.org) changed: What|Removed |Added Attachment #288413|0 |1 is obsolete|| --- Comment #31 from Erhard F. (erhar...@mailbox.org) --- Created attachment 292347 --> https://bugzilla.kernel.org/attachment.cgi?id=292347=edit kernel .config (5.9-rc3, OUTLINE KASAN, PowerMac G4 DP) Does happen even if RAID support is not actively selected in the config as btrfs pulls in RAID6_PQ on its own. # CONFIG_DM_RAID is not set CONFIG_RAID6_PQ=m -- You are receiving this mail because: You are watching the assignee of the bug.
[Bug 205099] KASAN hit at raid6_pq: BUG: Unable to handle kernel data access at 0x00f0fd0d
https://bugzilla.kernel.org/show_bug.cgi?id=205099 Erhard F. (erhar...@mailbox.org) changed: What|Removed |Added Attachment #287625|0 |1 is obsolete|| Attachment #288411|0 |1 is obsolete|| --- Comment #30 from Erhard F. (erhar...@mailbox.org) --- Created attachment 292345 --> https://bugzilla.kernel.org/attachment.cgi?id=292345=edit dmesg (5.9-rc3, OUTLINE KASAN, PowerMac G4 DP) Re-tested with v5.9-rc3 out of curiosity. Not much change here, the bug shows up with OUTLINE KASAN but not with INLINE KASAN, everything else being equal: == BUG: KASAN: user-memory-access in raid6_altivec8_gen_syndrome_real+0x2b0/0x480 [raid6_pq] Read of size 4 at addr 5764b118 by task modprobe/126 CPU: 1 PID: 126 Comm: modprobe Tainted: GW 5.9.0-rc3-PowerMacG4 #2 Call Trace: [e32cb7b8] [c0517aac] dump_stack+0xc4/0xf8 (unreliable) [e32cb7e8] [c026e73c] kasan_report+0x16c/0x170 [e32cb828] [b02004e0] raid6_altivec8_gen_syndrome_real+0x2b0/0x480 [raid6_pq] [e32cba18] [b02006fc] raid6_altivec8_gen_syndrome+0x4c/0x88 [raid6_pq] [e32cba38] [b021a42c] init_module+0x42c/0x590 [raid6_pq] [e32cbb08] [c00058a0] do_one_initcall+0xb8/0x3dc [e32cbbd8] [c011c0fc] do_init_module+0xa8/0x2c4 [e32cbc08] [c011f02c] load_module+0x2b98/0x2d4c [e32cbe18] [c011f448] sys_finit_module+0x100/0x138 [e32cbf38] [c001a1cc] ret_from_syscall+0x0/0x34 --- interrupt: c01 at 0x3d2068 LR = 0x506104 == BUG: Unable to handle kernel data access on read at 0x5764b118 Faulting instruction address: 0xb02004e0 Oops: Kernel access of bad area, sig: 11 [#1] -- You are receiving this mail because: You are watching the assignee of the bug.
[Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8
https://bugzilla.kernel.org/show_bug.cgi?id=208181 --- Comment #18 from Erhard F. (erhar...@mailbox.org) --- Created attachment 292339 --> https://bugzilla.kernel.org/attachment.cgi?id=292339=edit kernel .config (5.9-rc3, PowerMac G4 DP) -- You are receiving this mail because: You are watching the assignee of the bug.
[Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8
https://bugzilla.kernel.org/show_bug.cgi?id=208181 --- Comment #17 from Erhard F. (erhar...@mailbox.org) --- Created attachment 292337 --> https://bugzilla.kernel.org/attachment.cgi?id=292337=edit dmesg (5.9-rc3, INLINE KASAN, PowerMac G4 DP) Re-tried with 5.9-rc3 (inline KASAN). The original problem (stack-out-of-bounds in strcmp+0x58/0xd8) is gone, but still problems with stack usage when doing larger build jobs: [...] [ 1929.683510] do_IRQ: stack overflow: 1696 [ 1929.690727] CPU: 1 PID: 735 Comm: mount.nfs Tainted: GW 5.9.0-rc3-PowerMacG4 #1 [ 1929.697847] Call Trace: [ 1929.704633] [d0ca4670] [c0a75518] dump_stack+0xfc/0x130 (unreliable) [ 1929.711507] [d0ca46a0] [c000b094] do_IRQ+0x128/0x180 [ 1929.717998] [d0ca46d0] [c002e560] ret_from_except+0x0/0x14 [ 1929.724652] --- interrupt: 501 at _raw_spin_unlock_irqrestore+0x3c/0xa4 LR = _raw_spin_unlock_irqrestore+0x38/0xa4 [ 1929.738722] [d0ca47b8] [c0a6dc90] stack_depot_save+0x20c/0x390 [ 1929.746132] [d0ca4818] [c04d4b70] kasan_save_stack+0x40/0x48 [ 1929.753675] [d0ca4928] [c04d4b9c] kasan_set_track+0x24/0x30 [ 1929.761298] [d0ca4938] [c04d710c] kasan_set_free_info+0x28/0x3c [ 1929.769073] [d0ca4948] [c04d4f74] __kasan_slab_free+0x104/0x118 [ 1929.776983] [d0ca4968] [c04ce800] slab_free_freelist_hook+0xec/0x17c [ 1929.785111] [d0ca49a8] [c04d3468] kmem_cache_free+0x58/0x2a0 [ 1929.793391] [d0ca49f8] [c11b251c] packet_rcv+0xb9c/0xbb4 [ 1929.801797] [d0ca4a48] [c0dbfd98] dev_queue_xmit_nit+0x6e4/0x748 [ 1929.810434] [d0ca4ab8] [c0dcaf80] dev_hard_start_xmit+0xec/0x880 [ 1929.819207] [d0ca4b18] [c0ea4814] sch_direct_xmit+0x1f8/0x818 [ 1929.828111] [d0ca4bf8] [c0dcc884] __dev_queue_xmit+0xed4/0x136c [ 1929.837202] [d0ca4d28] [c0f256dc] ip_finish_output2+0xfcc/0x1028 [ 1929.846472] [d0ca4d88] [c0f2d848] __ip_queue_xmit+0xde0/0x1018 [ 1929.855892] [d0ca4df8] [c0f929d8] __tcp_transmit_skb+0x2550/0x2cb8 [ 1929.865486] [d0ca4ee8] [c0f98470] tcp_write_xmit+0x1d28/0x3498 [ 1929.875216] [d0ca4f78] [c0f99c8c] 
__tcp_push_pending_frames+0xac/0x1c4 [ 1929.885189] [d0ca4f98] [c0f5a970] tcp_sendmsg_locked+0x1c50/0x2294 [ 1929.895338] [d0ca5098] [c0f5afe4] tcp_sendmsg+0x30/0x48 [ 1929.905564] [d0ca50b8] [c0d598b0] sock_sendmsg_nosec+0xf4/0x10c [ 1929.916463] [d0ca50d8] [b0a31840] xprt_sock_sendmsg+0x2c0/0x6e8 [sunrpc] [ 1929.927494] [d0ca51b8] [b0a34ce8] xs_tcp_send_request+0x360/0x580 [sunrpc] [ 1929.938699] [d0ca52e8] [b0a2eae8] xprt_transmit+0x4f8/0xe30 [sunrpc] [ 1929.950044] [d0ca5368] [b0a1dcd8] call_transmit+0x238/0x25c [sunrpc] [ 1929.961450] [d0ca5388] [b0a6641c] __rpc_execute+0x35c/0xbf8 [sunrpc] [ 1929.972996] [d0ca5448] [b0a21d18] rpc_run_task+0x790/0x79c [sunrpc] [ 1929.984850] [d0ca5498] [b1282e50] nfs4_call_sync_custom+0x14/0x80 [nfsv4] [ 1929.996821] [d0ca54b8] [b128302c] nfs4_do_call_sync+0x170/0x1a8 [nfsv4] [ 1930.008922] [d0ca55a8] [b12b3570] nfs4_proc_lookup_common+0x314/0xc54 [nfsv4] [ 1930.020820] [d0ca5758] [b12b4244] nfs4_proc_lookup+0x158/0x2f0 [nfsv4] [ 1930.032753] [d0ca57f8] [b0b49544] nfs_lookup+0x2ac/0x9ac [nfs] [ 1930.044062] [d0ca5838] [c052c984] __lookup_slow+0x278/0x2a8 [ 1930.055461] [d0ca5958] [c05340a0] walk_component+0x288/0x30c [ 1930.066816] [d0ca5a08] [c0534e5c] path_lookupat.isra.0+0x1b8/0x438 [ 1930.078282] [d0ca5a48] [c05372a0] filename_lookup+0x144/0x1c4 [ 1930.089834] [d0ca5b98] [c05373fc] vfs_path_lookup+0x94/0xc0 [ 1930.101389] [d0ca5c18] [c05714b8] mount_subtree+0x1c4/0x250 [ 1930.113267] [d0ca5ca8] [b12e1b2c] do_nfs4_mount+0x570/0x7fc [nfsv4] [ 1930.125298] [d0ca5d68] [b12e202c] nfs4_try_get_tree+0xfc/0x16c [nfsv4] [ 1930.137200] [d0ca5d88] [c050e434] vfs_get_tree+0xf8/0x398 [ 1930.149133] [d0ca5db8] [c056f968] path_mount+0x1074/0x113c [ 1930.161107] [d0ca5e78] [c056fad8] do_mount+0xa8/0xe4 [ 1930.173109] [d0ca5f08] [c0570054] sys_mount+0xa8/0xb8 [ 1930.185160] [d0ca5f38] [c002e1cc] ret_from_syscall+0x0/0x34 [ 1930.197313] --- interrupt: c01 at 0x8b5754 LR = 0xac0be0 [ 1930.222896] Kernel panic - not syncing: corrupted stack 
end detected inside scheduler But feel free to close this bug if appropriate as the original issue is solved. -- You are receiving this mail because: You are watching the assignee of the bug.
RE: remove the last set_fs() in common code, and remove it for x86 and powerpc v3
From: Alexey Dobriyan > Sent: 04 September 2020 18:58 > > On Fri, Sep 04, 2020 at 08:00:24AM +0200, Ingo Molnar wrote: > > * Christoph Hellwig wrote: > > > this series removes the last set_fs() used to force a kernel address > > > space for the uaccess code in the kernel read/write/splice code, and then > > > stops implementing the address space overrides entirely for x86 and > > > powerpc. > > > > Cool! For the x86 bits: > > > > Acked-by: Ingo Molnar > > set_fs() is older than some kernel hackers! > > $ cd linux-0.11/ > $ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3 > ./include/asm/segment.h:61:extern inline void set_fs(unsigned long val) > ./include/asm/segment.h-62-{ > ./include/asm/segment.h-63- __asm__("mov %0,%%fs"::"a" ((unsigned > short) val)); > ./include/asm/segment.h-64-} What is this strange %fs register you are talking about. Figure 2-4 only has CS, DS, SS and ES. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
Re: [PATCH 0/1] powerpc/numa: do not skip node 0 in lookup table
I discussed this a bit with Aneesh Kumar in IBM internal Slack, a few weeks ago, and he informed me that this patch does not make sense with the design used by the kernel. The kernel will assume that, for node 0, all associativity domains must also be zeroed. This is why node 0 is skipped when creating the distance table. This of course has consequences for QEMU, so based on that, I've adapted the QEMU implementation to not touch node 0. Daniel On 8/14/20 5:34 PM, Daniel Henrique Barboza wrote: Hi, This is a simple fix that I made while testing NUMA changes I'm making in QEMU [1]. Setting any non-zero value to the associativity of NUMA node 0 has no impact in the output of 'numactl' because the distance_lookup_table is never initialized for node 0. Seeing through the LOPAPR spec and git history I found no technical reason to skip node 0, which makes me believe this is a bug that got under the radar up until now because no one attempted to set node 0 associativity like I'm doing now. 
For anyone wishing to give it a spin, using the QEMU build in [1] and experimenting with NUMA distances, such as: sudo ./qemu-system-ppc64 -machine pseries-5.2,accel=kvm,usb=off,dump-guest-core=off -m 65536 -overcommit mem-lock=off -smp 4,sockets=4,cores=1,threads=1 -rtc base=utc -display none -vga none -nographic -boot menu=on -device spapr-pci-host-bridge,index=1,id=pci.1 -device spapr-pci-host-bridge,index=2,id=pci.2 -device spapr-pci-host-bridge,index=3,id=pci.3 -device spapr-pci-host-bridge,index=4,id=pci.4 -device qemu-xhci,id=usb,bus=pci.0,addr=0x2 -drive file=/home/danielhb/f32.qcow2,format=qcow2,if=none,id=drive-virtio-disk0 -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x3,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -device usb-kbd,id=input0,bus=usb.0,port=1 -device usb-mouse,id=input1,bus=usb.0,port=2 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -msg timestamp=on \ -numa node,nodeid=0,cpus=0 -numa node,nodeid=1,cpus=1 \ -numa node,nodeid=2,cpus=2 -numa node,nodeid=3,cpus=3 \ -numa dist,src=0,dst=1,val=80 -numa dist,src=0,dst=2,val=80 \ -numa dist,src=0,dst=3,val=80 -numa dist,src=1,dst=2,val=80 \ -numa dist,src=1,dst=3,val=80 -numa dist,src=2,dst=3,val=80 The current kernel code will ignore the associativity of node 0, and numactl will output this: node distances: node 0 1 2 3 0: 10 160 160 160 1: 160 10 80 80 2: 160 80 10 80 3: 160 80 80 10 With this patch: node distances: node 0 1 2 3 0: 10 160 160 160 1: 160 10 80 40 2: 160 80 10 20 3: 160 40 20 10 If anyone wonders, this patch has no conflict with the proposed NUMA changes in [2] because Aneesh isn't changing this line. [1] https://github.com/danielhb/qemu/tree/spapr_numa_v1 [2] https://patchwork.ozlabs.org/project/linuxppc-dev/patch/2020073916.243569-1-aneesh.ku...@linux.ibm.com/ Daniel Henrique Barboza (1): powerpc/numa: do not skip node 0 when init lookup table arch/powerpc/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3
On Fri, Sep 4, 2020 at 10:58 AM Alexey Dobriyan wrote: > > set_fs() is older than some kernel hackers! > > $ cd linux-0.11/ > $ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3 Oh, it's older than that. It was there (as set_fs) in 0.10, and may even predate that. But sadly, I don't have tar-balls for 0.02 and 0.03, so can't check. The actual use of %fs as the user space segment is already there in 0.01, but there was no 'set_fs()'. That was a simpler and more direct time, and "get_fs()" looked like this back then: #define _fs() ({ \ register unsigned short __res; \ __asm__("mov %%fs,%%ax":"=a" (__res):); \ __res;}) and all the setting was basically part of the kernel entry asm and. Lovely. Linus
Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3
On Fri, Sep 04, 2020 at 08:00:24AM +0200, Ingo Molnar wrote: > * Christoph Hellwig wrote: > > this series removes the last set_fs() used to force a kernel address > > space for the uaccess code in the kernel read/write/splice code, and then > > stops implementing the address space overrides entirely for x86 and > > powerpc. > > Cool! For the x86 bits: > > Acked-by: Ingo Molnar set_fs() is older than some kernel hackers! $ cd linux-0.11/ $ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3 ./include/asm/segment.h:61:extern inline void set_fs(unsigned long val) ./include/asm/segment.h-62-{ ./include/asm/segment.h-63- __asm__("mov %0,%%fs"::"a" ((unsigned short) val)); ./include/asm/segment.h-64-}
[PATCH] kbuild: preprocess module linker script
There was a request to preprocess the module linker script like we do for the vmlinux one (https://lkml.org/lkml/2020/8/21/512). The difference between vmlinux.lds and module.lds is that the latter is needed for external module builds, thus must be cleaned up by 'make mrproper' instead of 'make clean' (also, it must be created by 'make modules_prepare'). You cannot put it in arch/*/kernel/ because 'make clean' descends into it. I moved arch/*/kernel/module.lds to arch/*/include/asm/module.lds.h, which is included from scripts/module.lds.S. scripts/module.lds is fine because 'make clean' keeps all the build artifacts under scripts/. You can add arch-specific sections in <asm/module.lds.h>. Signed-off-by: Masahiro Yamada Tested-by: Jessica Yu --- Makefile | 1 - arch/arm/Makefile | 4 .../{kernel/module.lds => include/asm/module.lds.h}| 2 ++ arch/arm64/Makefile| 4 .../{kernel/module.lds => include/asm/module.lds.h}| 2 ++ arch/ia64/Makefile | 1 - arch/ia64/{module.lds => include/asm/module.lds.h} | 0 arch/m68k/Makefile | 1 - .../{kernel/module.lds => include/asm/module.lds.h}| 0 arch/powerpc/Makefile | 1 - .../{kernel/module.lds => include/asm/module.lds.h}| 0 arch/riscv/Makefile| 3 --- .../{kernel/module.lds => include/asm/module.lds.h}| 3 ++- arch/um/include/asm/Kbuild | 1 + include/asm-generic/Kbuild | 1 + include/asm-generic/module.lds.h | 10 ++ scripts/.gitignore | 1 + scripts/Makefile | 2 ++ scripts/Makefile.modfinal | 5 ++--- scripts/{module-common.lds => module.lds.S}| 3 +++ scripts/package/builddeb | 2 +- 21 files changed, 27 insertions(+), 20 deletions(-) rename arch/arm/{kernel/module.lds => include/asm/module.lds.h} (72%) rename arch/arm64/{kernel/module.lds => include/asm/module.lds.h} (76%) rename arch/ia64/{module.lds => include/asm/module.lds.h} (100%) rename arch/m68k/{kernel/module.lds => include/asm/module.lds.h} (100%) rename arch/powerpc/{kernel/module.lds => include/asm/module.lds.h} (100%) rename arch/riscv/{kernel/module.lds => include/asm/module.lds.h} (84%) 
create mode 100644 include/asm-generic/module.lds.h rename scripts/{module-common.lds => module.lds.S} (93%) diff --git a/Makefile b/Makefile index 9cac6fde3479..3d9b56c6b47e 100644 --- a/Makefile +++ b/Makefile @@ -506,7 +506,6 @@ KBUILD_CFLAGS_KERNEL := KBUILD_AFLAGS_MODULE := -DMODULE KBUILD_CFLAGS_MODULE := -DMODULE KBUILD_LDFLAGS_MODULE := -export KBUILD_LDS_MODULE := $(srctree)/scripts/module-common.lds KBUILD_LDFLAGS := CLANG_FLAGS := diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 4e877354515f..a0cb15de9677 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -16,10 +16,6 @@ LDFLAGS_vmlinux += --be8 KBUILD_LDFLAGS_MODULE += --be8 endif -ifeq ($(CONFIG_ARM_MODULE_PLTS),y) -KBUILD_LDS_MODULE += $(srctree)/arch/arm/kernel/module.lds -endif - GZFLAGS:=-9 #KBUILD_CFLAGS +=-pipe diff --git a/arch/arm/kernel/module.lds b/arch/arm/include/asm/module.lds.h similarity index 72% rename from arch/arm/kernel/module.lds rename to arch/arm/include/asm/module.lds.h index 79cb6af565e5..0e7cb4e314b4 100644 --- a/arch/arm/kernel/module.lds +++ b/arch/arm/include/asm/module.lds.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifdef CONFIG_ARM_MODULE_PLTS SECTIONS { .plt : { BYTE(0) } .init.plt : { BYTE(0) } } +#endif diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 55bc8546d9c7..232547ec07d8 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -115,10 +115,6 @@ endif CHECKFLAGS += -D__aarch64__ -ifeq ($(CONFIG_ARM64_MODULE_PLTS),y) -KBUILD_LDS_MODULE += $(srctree)/arch/arm64/kernel/module.lds -endif - ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y) KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY CC_FLAGS_FTRACE := -fpatchable-function-entry=2 diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/include/asm/module.lds.h similarity index 76% rename from arch/arm64/kernel/module.lds rename to arch/arm64/include/asm/module.lds.h index 22e36a21c113..691f15af788e 100644 --- a/arch/arm64/kernel/module.lds +++ 
b/arch/arm64/include/asm/module.lds.h @@ -1,5 +1,7 @@ +#ifdef CONFIG_ARM64_MODULE_PLTS SECTIONS { .plt (NOLOAD) : { BYTE(0) } .init.plt (NOLOAD) : { BYTE(0) } .text.ftrace_trampoline (NOLOAD) : { BYTE(0) } } +#endif diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 2876a7df1b0a..703b1c4f6d12 100644 --- a/arch/ia64/Makefile
[PATCH 3/3] powerpc/uaccess: Remove __put_user_asm() and __put_user_asm2()
__put_user_asm() and __put_user_asm2() are not used anymore. Remove them. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 41 -- 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 96d1c144f92b..26781b044932 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -151,42 +151,6 @@ static inline int __access_ok(unsigned long addr, unsigned long size, extern long __put_user_bad(void); -/* - * We don't tell gcc that we are accessing memory, but this is OK - * because we do not write to any memory gcc knows about, so there - * are no aliasing issues. - */ -#define __put_user_asm(x, addr, err, op) \ - __asm__ __volatile__( \ - "1: " op "%U2%X2 %1,%2 # put_user\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: li %0,%3\n" \ - " b 2b\n" \ - ".previous\n" \ - EX_TABLE(1b, 3b)\ - : "=r" (err)\ - : "r" (x), "m<>" (*addr), "i" (-EFAULT), "0" (err)) - -#ifdef __powerpc64__ -#define __put_user_asm2(x, ptr, retval)\ - __put_user_asm(x, ptr, retval, "std") -#else /* __powerpc64__ */ -#define __put_user_asm2(x, addr, err) \ - __asm__ __volatile__( \ - "1: stw%X2 %1,%2\n" \ - "2: stw%X2 %L1,%L2\n" \ - "3:\n" \ - ".section .fixup,\"ax\"\n" \ - "4: li %0,%3\n" \ - " b 3b\n" \ - ".previous\n" \ - EX_TABLE(1b, 4b)\ - EX_TABLE(2b, 4b)\ - : "=r" (err)\ - : "r" (x), "m" (*addr), "i" (-EFAULT), "0" (err)) -#endif /* __powerpc64__ */ - #define __put_user_size_allowed(x, ptr, size, retval) \ do { \ __label__ __pu_failed; \ @@ -249,6 +213,11 @@ do { \ }) +/* + * We don't tell gcc that we are accessing memory, but this is OK + * because we do not write to any memory gcc knows about, so there + * are no aliasing issues. + */ #define __put_user_asm_goto(x, addr, label, op)\ asm volatile goto( \ "1: " op "%U1%X1 %0,%1 # put_user\n" \ -- 2.25.0
[PATCH 2/3] powerpc/uaccess: Switch __patch_instruction() to __put_user_asm_goto()
__patch_instruction() is the only user of __put_user_asm() outside of asm/uaccess.h Switch to the new __put_user_asm_goto() to enable retirement of __put_user_asm() in a later patch. Signed-off-by: Christophe Leroy --- arch/powerpc/lib/code-patching.c | 17 +++-- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 8c3934ea6220..2333625b5e31 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -21,21 +21,18 @@ static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr, struct ppc_inst *patch_addr) { - int err = 0; - - if (!ppc_inst_prefixed(instr)) { - __put_user_asm(ppc_inst_val(instr), patch_addr, err, "stw"); - } else { - __put_user_asm(ppc_inst_as_u64(instr), patch_addr, err, "std"); - } - - if (err) - return err; + if (!ppc_inst_prefixed(instr)) + __put_user_asm_goto(ppc_inst_val(instr), patch_addr, failed, "stw"); + else + __put_user_asm_goto(ppc_inst_as_u64(instr), patch_addr, failed, "std"); asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr), "r" (exec_addr)); return 0; + +failed: + return -EFAULT; } int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr) -- 2.25.0
[PATCH 1/3] powerpc/uaccess: Switch __put_user_size_allowed() to __put_user_asm_goto()
__put_user_asm_goto() provides more flexibility to GCC and avoids using a local variable to tell if the write succeeded or not. GCC can then avoid implementing a cmp in the fast path. See the difference for a small function like the PPC64 version of save_general_regs() in arch/powerpc/kernel/signal_32.c: Before the patch (unreachable nop removed): 0c10 <.save_general_regs>: c10: 39 20 00 2c li r9,44 c14: 39 40 00 00 li r10,0 c18: 7d 29 03 a6 mtctr r9 c1c: 38 c0 00 00 li r6,0 c20: 48 00 00 14 b c34 <.save_general_regs+0x24> c30: 42 40 00 40 bdz c70 <.save_general_regs+0x60> c34: 28 2a 00 27 cmpldi r10,39 c38: 7c c8 33 78 mr r8,r6 c3c: 79 47 1f 24 rldicr r7,r10,3,60 c40: 39 20 00 01 li r9,1 c44: 41 82 00 0c beq c50 <.save_general_regs+0x40> c48: 7d 23 38 2a ldx r9,r3,r7 c4c: 79 29 00 20 clrldi r9,r9,32 c50: 91 24 00 00 stw r9,0(r4) c54: 2c 28 00 00 cmpdi r8,0 c58: 39 4a 00 01 addir10,r10,1 c5c: 38 84 00 04 addir4,r4,4 c60: 41 82 ff d0 beq c30 <.save_general_regs+0x20> c64: 38 60 ff f2 li r3,-14 c68: 4e 80 00 20 blr c70: 38 60 00 00 li r3,0 c74: 4e 80 00 20 blr <.fixup>: cc: 39 00 ff f2 li r8,-14 d0: 48 00 00 00 b d0 <.fixup+0xd0> d0: R_PPC64_REL24 .text+0xc54 After the patch: 1490 <.save_general_regs>: 1490: 39 20 00 2c li r9,44 1494: 39 40 00 00 li r10,0 1498: 7d 29 03 a6 mtctr r9 149c: 60 00 00 00 nop 14a0: 28 2a 00 27 cmpldi r10,39 14a4: 79 48 1f 24 rldicr r8,r10,3,60 14a8: 39 20 00 01 li r9,1 14ac: 41 82 00 0c beq 14b8 <.save_general_regs+0x28> 14b0: 7d 23 40 2a ldx r9,r3,r8 14b4: 79 29 00 20 clrldi r9,r9,32 14b8: 91 24 00 00 stw r9,0(r4) 14bc: 39 4a 00 01 addir10,r10,1 14c0: 38 84 00 04 addir4,r4,4 14c4: 42 00 ff dc bdnz14a0 <.save_general_regs+0x10> 14c8: 38 60 00 00 li r3,0 14cc: 4e 80 00 20 blr 14d0: 38 60 ff f2 li r3,-14 14d4: 4e 80 00 20 blr Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h 
index a5cfe867fbdc..96d1c144f92b 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -189,14 +189,14 @@ extern long __put_user_bad(void); #define __put_user_size_allowed(x, ptr, size, retval) \ do { \ + __label__ __pu_failed; \ + \ retval = 0; \ - switch (size) { \ - case 1: __put_user_asm(x, ptr, retval, "stb"); break; \ - case 2: __put_user_asm(x, ptr, retval, "sth"); break; \ - case 4: __put_user_asm(x, ptr, retval, "stw"); break; \ - case 8: __put_user_asm2(x, ptr, retval); break; \ - default: __put_user_bad();\ - } \ + __put_user_size_goto(x, ptr, size, __pu_failed);\ + break; \ + \ +__pu_failed: \ + retval = -EFAULT; \ } while (0) #define __put_user_size(x, ptr, size, retval) \ -- 2.25.0
[PATCH] powerpc/uaccess: Add pre-update addressing to __put_user_asm_goto()
Enable pre-update addressing mode in __put_user_asm_goto() Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 7c2427f237e1..a5cfe867fbdc 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -254,7 +254,7 @@ do { \ "1: " op "%U1%X1 %0,%1 # put_user\n" \ EX_TABLE(1b, %l2) \ : \ - : "r" (x), "m" (*addr) \ + : "r" (x), "m<>" (*addr)\ : \ : label) -- 2.25.0
Re: ptrace_syscall_32 is failing
Andy, On Wed, Sep 02 2020 at 09:49, Andy Lutomirski wrote: > On Wed, Sep 2, 2020 at 1:29 AM Thomas Gleixner wrote: >> >> But you might tell me where exactly you want to inject the SIGTRAP in >> the syscall exit code flow. > > It would be a bit complicated. Definitely after any signals from the > syscall are delivered. Right now, I think that we don't deliver a > SIGTRAP on the instruction boundary after SYSCALL while > single-stepping. (I think we used to, but only sometimes, and now we > are at least consistent.) This is because IRET will not trap if it > starts with TF clear and ends up setting it. (I asked Intel to > document this, and I think they finally did, although I haven't gotten > around to reading the new docs. Certainly the old docs as of a year > or two ago had no description whatsoever of how TF changes worked.) > > Deciding exactly *when* a trap should occur would be nontrivial -- we > can't trap on sigreturn() from a SIGTRAP, for example. > > So this isn't fully worked out. Oh well. >> >> I don't think we want that in general. The current variant is perfectly >> >> fine for everything except the 32bit fast syscall nonsense. Also >> >> irqentry_entry/exit is not equivalent to the syscall_enter/exit >> >> counterparts. >> > >> > If there are any architectures in which actual work is needed to >> > figure out whether something is a syscall in the first place, they'll >> > want to do the usual kernel entry work before the syscall entry work. >> >> That's low level entry code which does not require RCU, lockdep, tracing >> or whatever muck we setup before actual work can be done. >> >> arch_asm_entry() >> ... >> arch_c_entry(cause) { >> switch(cause) { >> case EXCEPTION: arch_c_exception(...); >> case SYSCALL: arch_c_syscall(...); >> ... >> } > > You're assuming that figuring out the cause doesn't need the kernel > entry code to run first. 
In the case of the 32-bit vDSO fast > syscalls, we arguably don't know whether an entry is a syscall until > we have done a user memory access. Logically, we're doing: > > if (get_user() < 0) { > /* Not a syscall. This is actually a silly operation that sets AX = > -EFAULT and returns. Do not audit or invoke ptrace. */ > } else { > /* This actually is a syscall. */ > } Yes, that's what I've addressed with providing split interfaces. >> You really want to differentiate between exception and syscall >> entry/exit. >> > > Why do we want to distinguish between exception and syscall > entry/exit? For the enter part, AFAICS the exception case boils down > to enter_from_user_mode() and the syscall case is: > > enter_from_user_mode(regs); > instrumentation_begin(); > > local_irq_enable(); > ti_work = READ_ONCE(current_thread_info()->flags); > if (ti_work & SYSCALL_ENTER_WORK) > syscall = syscall_trace_enter(regs, syscall, ti_work); > instrumentation_end(); > > Which would decompose quite nicely as a regular (non-syscall) entry > plus the syscall part later. There is a difference between syscall entry and exception entry at least in my view: syscall: enter_from_user_mode(regs); local_irq_enable(); exception: enter_from_user_mode(regs); >> we'd have: >> >> arch_c_entry() >> irqentry_enter(); >> local_irq_enble(); >> nr = syscall_enter_from_user_mode_work(); >> ... >> >> which enforces two calls for sane entries and more code in arch/ > > This is why I still like my: > > arch_c_entry() > irqentry_enter_from_user_mode(); > generic_syscall(); > exit... So what we have now (with my patch applied) is either: 1) arch_c_entry() nr = syscall_enter_from_user_mode(); arch_handle_syscall(nr); syscall_exit_to_user_mode(); or for that extra 32bit fast syscall thing: 2) arch_c_entry() syscall_enter_from_user_mode_prepare(); arch_do_stuff(); nr = syscall_enter_from_user_mode_work(); arch_handle_syscall(nr); syscall_exit_to_user_mode(); So for sane cases you just use #1. 
Ideally we'd not need arch_handle_syscall(nr) at all, but that does not work with multiple ABIs supported, i.e. the compat muck. The only way we could make that work is to have: syscall_enter_exit(regs, mode) nr = syscall_enter_from_user_mode(); arch_handle_syscall(mode, nr); syscall_exit_to_user_mode(); and then arch_c_entry() becomes: syscall_enter_exit(regs, mode); which means that arch_handle_syscall() would have to evaluate the mode and chose the appropriate syscall table. Not sure whether that's a win. Thanks, tglx
RE: [PATCH 12/14] x86: remove address space overrides using set_fs()
From: Linus Torvalds > Sent: 04 September 2020 00:26 > > On Thu, Sep 3, 2020 at 2:30 PM David Laight wrote: > > > > A non-canonical (is that the right term) address between the highest > > valid user address and the lowest valid kernel address (7ffe to fffe?) > > will fault anyway. > > Yes. > > But we actually warn against that fault, because it's been a good way > to catch places that didn't use the proper "access_ok()" pattern. > > See ex_handler_uaccess() and the > > WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in > user access. Non-canonical address?"); > > warning. It's been good for randomized testing - a missing range check > on a user address will often hit this. > > Of course, you should never see it in real life (and hopefully not in > testing either any more). But belt-and-suspenders.. That could still be effective, just pick an address limit that is appropriate for the one access_ok() is using. Even if access_ok() uses 1<<63 there are plenty of addresses above it that fault. But the upper limit for 5-level page tables could be used all the time. One option is to test '(address | length) < (3<<62)' in access_ok(). That is also moderately suitable for masking invalid addresses to 0. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
Re: [PATCH 12/14] x86: remove address space overrides using set_fs()
On Fri, Sep 04, 2020 at 08:38:13AM +0200, Christoph Hellwig wrote: > > Wait a sec... how is that supposed to build with X86_5LEVEL? Do you mean > > > > #define LOAD_TASK_SIZE_MINUS_N(n) \ > > ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \ > > __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), > > X86_FEATURE_LA57 > > > > there? > > Don't ask me about the how, but it builds and works with X86_5LEVEL, > and the style is copied from elsewhere.. Actually, it doesn't any more. Looks like the change to pass the n parameter as suggested by Linus broke the previously working version.
Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes
On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote: > This patch series includes fixes for debug_vm_pgtable test code so that > they follow page table updates rules correctly. The first two patches > introduce > changes w.r.t ppc64. The patches are included in this series for > completeness. We can > merge them via ppc64 tree if required. > > Hugetlb test is disabled on ppc64 because that needs larger change to satisfy > page table update rules. > > These tests are broken w.r.t page table update rules and results in kernel > crash as below. > > [ 21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304! > cpu 0x0: Vector: 700 (Program Check) at [c00c6d1e76c0] > pc: c009a5ec: assert_pte_locked+0x14c/0x380 > lr: c05c: pte_update+0x11c/0x190 > sp: c00c6d1e7950 >msr: 82029033 > current = 0xc00c6d172c80 > paca= 0xc3ba irqmask: 0x03 irq_happened: 0x01 > pid = 1, comm = swapper/0 > kernel BUG at arch/powerpc/mm/pgtable.c:304! > [link register ] c05c pte_update+0x11c/0x190 > [c00c6d1e7950] 0001 (unreliable) > [c00c6d1e79b0] c05eee14 pte_update+0x44/0x190 > [c00c6d1e7a10] c1a2ca9c pte_advanced_tests+0x160/0x3d8 > [c00c6d1e7ab0] c1a2d4fc debug_vm_pgtable+0x7e8/0x1338 > [c00c6d1e7ba0] c00116ec do_one_initcall+0xac/0x5f0 > [c00c6d1e7c80] c19e4fac kernel_init_freeable+0x4dc/0x5a4 > [c00c6d1e7db0] c0012474 kernel_init+0x24/0x160 > [c00c6d1e7e20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c > > With DEBUG_VM disabled > > [ 20.530152] BUG: Kernel NULL pointer dereference on read at 0x > [ 20.530183] Faulting instruction address: 0xc00df330 > cpu 0x33: Vector: 380 (Data SLB Access) at [c00c6d19f700] > pc: c00df330: memset+0x68/0x104 > lr: c009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0 > sp: c00c6d19f990 >msr: 82009033 >dar: 0 > current = 0xc00c6d177480 > paca= 0xc0001ec4f400 irqmask: 0x03 irq_happened: 0x01 > pid = 1, comm = swapper/0 > [link register ] c009f6d8 hash__pmdp_huge_get_and_clear+0xe8/0x1b0 > [c00c6d19f990] c009f748 hash__pmdp_huge_get_and_clear+0x158/0x1b0 > (unreliable) > 
[c00c6d19fa10] c19ebf30 pmd_advanced_tests+0x1f0/0x378 > [c00c6d19fab0] c19ed088 debug_vm_pgtable+0x79c/0x1244 > [c00c6d19fba0] c00116ec do_one_initcall+0xac/0x5f0 > [c00c6d19fc80] c19a4fac kernel_init_freeable+0x4dc/0x5a4 > [c00c6d19fdb0] c0012474 kernel_init+0x24/0x160 > [c00c6d19fe20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c > > Changes from v3: > * Address review feedback > * Move page table depost and withdraw patch after adding pmdlock to avoid > bisect failure. This version - Builds on x86, arm64, s390, arc, powerpc and riscv (defconfig with DEBUG_VM_PGTABLE) - Runs on arm64 and x86 without any regression, at least nothing that I have noticed - Will be great if this could get tested on s390, arc, riscv, ppc32 platforms as well + linux-riscv + linux-snps-...@lists.infradead.org + linux-s...@vger.kernel.org + Gerald Schaefer + Vineet Gupta There is still an open git bisect issue on arm64 platform which ideally should be fixed. - Anshuman
Re: [PATCH 12/14] x86: remove address space overrides using set_fs()
On Fri, Sep 04, 2020 at 03:55:10AM +0100, Al Viro wrote: > On Thu, Sep 03, 2020 at 04:22:40PM +0200, Christoph Hellwig wrote: > > > diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S > > index c8a85b512796e1..94f7be4971ed04 100644 > > --- a/arch/x86/lib/getuser.S > > +++ b/arch/x86/lib/getuser.S > > @@ -35,10 +35,19 @@ > > #include > > #include > > > > +#ifdef CONFIG_X86_5LEVEL > > +#define LOAD_TASK_SIZE_MINUS_N(n) \ > > + ALTERNATIVE "mov $((1 << 47) - 4096 - (n)),%rdx", \ > > + "mov $((1 << 56) - 4096 - (n)),%rdx", X86_FEATURE_LA57 > > +#else > > +#define LOAD_TASK_SIZE_MINUS_N(n) \ > > + mov $(TASK_SIZE_MAX - (n)),%_ASM_DX > > +#endif > > Wait a sec... how is that supposed to build with X86_5LEVEL? Do you mean > > #define LOAD_TASK_SIZE_MINUS_N(n) \ > ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \ > __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), > X86_FEATURE_LA57 > > there? Don't ask me about the how, but it builds and works with X86_5LEVEL, and the style is copied from elsewhere..
Re: [PATCH v4 12/13] mm/debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64
On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote: > The seems to be missing quite a lot of details w.r.t allocating > the correct pgtable_t page (huge_pte_alloc()), holding the right > lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA. > > ppc64 do have runtime checks within CONFIG_DEBUG_VM for most of these. > Hence disable the test on ppc64. > > Signed-off-by: Aneesh Kumar K.V > --- > mm/debug_vm_pgtable.c | 4 > 1 file changed, 4 insertions(+) > > diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c > index b53903fdee85..9afa1354326b 100644 > --- a/mm/debug_vm_pgtable.c > +++ b/mm/debug_vm_pgtable.c > @@ -811,6 +811,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, > pgprot_t prot) > #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ > } > > +#ifndef CONFIG_PPC_BOOK3S_64 > static void __init hugetlb_advanced_tests(struct mm_struct *mm, > struct vm_area_struct *vma, > pte_t *ptep, unsigned long pfn, > @@ -853,6 +854,7 @@ static void __init hugetlb_advanced_tests(struct > mm_struct *mm, > pte = huge_ptep_get(ptep); > WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte))); > } > +#endif > #else /* !CONFIG_HUGETLB_PAGE */ > static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { } > static void __init hugetlb_advanced_tests(struct mm_struct *mm, > @@ -1065,7 +1067,9 @@ static int __init debug_vm_pgtable(void) > pud_populate_tests(mm, pudp, saved_pmdp); > spin_unlock(ptl); > > +#ifndef CONFIG_PPC_BOOK3S_64 > hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); > +#endif > > spin_lock(>page_table_lock); > p4d_clear_tests(mm, p4dp); > Is it still required now that DEBUG_VM_PGTABLE has been dropped from powerpc or you would like to re-enabled it back ? https://lore.kernel.org/linuxppc-dev/159913592797.5893.5829441560236719450.b4...@ellerman.id.au/T/#m6d890e2fe84cf180cb875fae5f791e9c83db8d30
Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()
On Thu, 2020-09-03 at 14:41 +1000, Alexey Kardashevskiy wrote: > I am new to this, so I am trying to understand how a memory page mapped > > as DMA, and used for something else could be a problem. > > From the device perspective, there is PCI space and everything from 0 > till 1<<64 is accessible and what is that mapped to - the device does > not know. PHB's IOMMU is the thing to notice invalid access and raise > EEH but PHB only knows about PCI->physical memory mapping (with IOMMU > pages) but nothing about the host kernel pages. Does this help? Thanks, According to our conversation on Slack: 1- There is a problem if a hypervisor gives to its VMs contiguous memory blocks that are not aligned to IOMMU pages, because then an iommu_map_page() could map some memory in this VM and some memory in other VM / process. 2- To guarantee this, we should have system pagesize >= iommu_pagesize One way to get (2) is by doing this in enable_ddw(): if ((query.page_size & 4) && PAGE_SHIFT >= 24) { page_shift = 24; /* 16MB */ } else if ((query.page_size & 2) && PAGE_SHIFT >= 16 ) { page_shift = 16; /* 64kB */ } else if (query.page_size & 1 && PAGE_SHIFT >= 12) { page_shift = 12; /* 4kB */ [...] Another way of solving this, would be adding in LoPAR documentation that the blocksize of contiguous memory the hypervisor gives a VM should always be aligned to IOMMU pagesize offered. I think the best approach would be first sending the above patch, which is faster, and then get working into adding that to documentation, so hypervisors guarantee this. If this gets into the docs, we can revert the patch. What do you think? Best regards!
Re: [PATCH v4 11/13] mm/debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries
On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote: > pmd_clear() should not be used to clear pmd level pte entries. > > Signed-off-by: Aneesh Kumar K.V > --- > mm/debug_vm_pgtable.c | 7 --- > 1 file changed, 4 insertions(+), 3 deletions(-) > > diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c > index 26023d990bd0..b53903fdee85 100644 > --- a/mm/debug_vm_pgtable.c > +++ b/mm/debug_vm_pgtable.c > @@ -196,6 +196,8 @@ static void __init pmd_advanced_tests(struct mm_struct > *mm, > pmd = READ_ONCE(*pmdp); > WARN_ON(pmd_young(pmd)); > > + /* Clear the pte entries */ > + pmdp_huge_get_and_clear(mm, vaddr, pmdp); > pgtable = pgtable_trans_huge_withdraw(mm, pmdp); > } > > @@ -319,6 +321,8 @@ static void __init pud_advanced_tests(struct mm_struct > *mm, > pudp_test_and_clear_young(vma, vaddr, pudp); > pud = READ_ONCE(*pudp); > WARN_ON(pud_young(pud)); > + > + pudp_huge_get_and_clear(mm, vaddr, pudp); > } > > static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) > @@ -442,8 +446,6 @@ static void __init pud_populate_tests(struct mm_struct > *mm, pud_t *pudp, >* This entry points to next level page table page. >* Hence this must not qualify as pud_bad(). >*/ > - pmd_clear(pmdp); > - pud_clear(pudp); > pud_populate(mm, pudp, pmdp); > pud = READ_ONCE(*pudp); > WARN_ON(pud_bad(pud)); > @@ -575,7 +577,6 @@ static void __init pmd_populate_tests(struct mm_struct > *mm, pmd_t *pmdp, >* This entry points to next level page table page. >* Hence this must not qualify as pmd_bad(). >*/ > - pmd_clear(pmdp); > pmd_populate(mm, pmdp, pgtable); > pmd = READ_ONCE(*pmdp); > WARN_ON(pmd_bad(pmd)); > Why pxxp_huge_get_and_clear() cannot be called inside pxx_populate_tests() functions itself ? Nonetheless, this does not seem to cause any problem.
Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3
* Christoph Hellwig wrote: > Hi all, > > this series removes the last set_fs() used to force a kernel address > space for the uaccess code in the kernel read/write/splice code, and then > stops implementing the address space overrides entirely for x86 and > powerpc. Cool! For the x86 bits: Acked-by: Ingo Molnar Thanks, Ingo