[PATCH v3 2/2] scsi: ibmvfc: interface updates for future FPIN and MQ support

2020-09-04 Thread Tyrel Datwyler
VIOS partitions with SLI-4 enabled Emulex adapters will be capable of
driving IO in parallel through multiple work queues or channels, and
with new hypervisor firmware that supports multiple interrupt sources
an ibmvfc NPIV single initiator can be modified to exploit end to end
channelization in a PowerVM environment.

VIOS hosts will also be able to expose fabric performance impact
notifications (FPIN) via a new asynchronous event to ibmvfc clients that
advertise support via IBMVFC_CAN_HANDLE_FPIN in their capabilities flag
during NPIV_LOGIN.

This patch introduces three new Management Datagrams (MADs) for
channelization support negotiation as well as the FPIN asynchronous
event and FPIN status flags. Follow up work is required to plumb the
ibmvfc client driver to use these new interfaces.

Signed-off-by: Tyrel Datwyler 
---
v2 -> v3:
Fixup checkpatch warnings about using __attribute__()
v1 -> v2:
Fixup compiler errors from neglected commit --amend

---
 drivers/scsi/ibmvscsi/ibmvfc.h | 66 +-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 6da23666f5be..e6e1c255a79c 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -124,6 +124,9 @@ enum ibmvfc_mad_types {
IBMVFC_PASSTHRU = 0x0200,
IBMVFC_TMF_MAD  = 0x0100,
IBMVFC_NPIV_LOGOUT  = 0x0800,
+   IBMVFC_CHANNEL_ENQUIRY  = 0x1000,
+   IBMVFC_CHANNEL_SETUP= 0x2000,
+   IBMVFC_CONNECTION_INFO  = 0x4000,
 };
 
 struct ibmvfc_mad_common {
@@ -162,6 +165,8 @@ struct ibmvfc_npiv_login {
__be32 max_cmds;
__be64 capabilities;
 #define IBMVFC_CAN_MIGRATE 0x01
+#define IBMVFC_CAN_USE_CHANNELS0x02
+#define IBMVFC_CAN_HANDLE_FPIN 0x04
__be64 node_name;
struct srp_direct_buf async;
u8 partition_name[IBMVFC_MAX_NAME];
@@ -204,6 +209,7 @@ struct ibmvfc_npiv_login_resp {
__be64 capabilities;
 #define IBMVFC_CAN_FLUSH_ON_HALT   0x08
 #define IBMVFC_CAN_SUPPRESS_ABTS   0x10
+#define IBMVFC_CAN_SUPPORT_CHANNELS0x20
__be32 max_cmds;
__be32 scsi_id_sz;
__be64 max_dma_len;
@@ -482,6 +488,52 @@ struct ibmvfc_passthru_mad {
struct ibmvfc_passthru_fc_iu fc_iu;
 } __packed __aligned(8);
 
+struct ibmvfc_channel_enquiry {
+   struct ibmvfc_mad_common common;
+   __be32 flags;
+#define IBMVFC_NO_CHANNELS_TO_CRQ_SUPPORT  0x01
+#define IBMVFC_SUPPORT_VARIABLE_SUBQ_MSG   0x02
+#define IBMVFC_NO_N_TO_M_CHANNELS_SUPPORT  0x04
+   __be32 num_scsi_subq_channels;
+   __be32 num_nvmeof_subq_channels;
+   __be32 num_scsi_vas_channels;
+   __be32 num_nvmeof_vas_channels;
+} __packed __aligned(8);
+
+struct ibmvfc_channel_setup_mad {
+   struct ibmvfc_mad_common common;
+   struct srp_direct_buf buffer;
+} __packed __aligned(8);
+
+#define IBMVFC_MAX_CHANNELS502
+
+struct ibmvfc_channel_setup {
+   __be32 flags;
+#define IBMVFC_CANCEL_CHANNELS 0x01
+#define IBMVFC_USE_BUFFER  0x02
+#define IBMVFC_CHANNELS_CANCELED   0x04
+   __be32 reserved;
+   __be32 num_scsi_subq_channels;
+   __be32 num_nvmeof_subq_channels;
+   __be32 num_scsi_vas_channels;
+   __be32 num_nvmeof_vas_channels;
+   struct srp_direct_buf buffer;
+   __be64 reserved2[5];
+   __be64 channel_handles[IBMVFC_MAX_CHANNELS];
+} __packed __aligned(8);
+
+struct ibmvfc_connection_info {
+   struct ibmvfc_mad_common common;
+   __be64 information_bits;
+#define IBMVFC_NO_FC_IO_CHANNEL0x01
+#define IBMVFC_NO_PHYP_VAS 0x02
+#define IBMVFC_NO_PHYP_SUBQ0x04
+#define IBMVFC_PHYP_DEPRECATED_SUBQ0x08
+#define IBMVFC_PHYP_PRESERVED_SUBQ 0x10
+#define IBMVFC_PHYP_FULL_SUBQ  0x20
+   __be64 reserved[16];
+} __packed __aligned(8);
+
 struct ibmvfc_trace_start_entry {
u32 xfer_len;
 } __packed;
@@ -532,6 +584,7 @@ enum ibmvfc_async_event {
IBMVFC_AE_HALT  = 0x0400,
IBMVFC_AE_RESUME= 0x0800,
IBMVFC_AE_ADAPTER_FAILED= 0x1000,
+   IBMVFC_AE_FPIN  = 0x2000,
 };
 
 struct ibmvfc_async_desc {
@@ -560,10 +613,18 @@ enum ibmvfc_ae_link_state {
IBMVFC_AE_LS_LINK_DEAD  = 0x08,
 };
 
+enum ibmvfc_ae_fpin_status {
+   IBMVFC_AE_FPIN_LINK_CONGESTED   = 0x1,
+   IBMVFC_AE_FPIN_PORT_CONGESTED   = 0x2,
+   IBMVFC_AE_FPIN_PORT_CLEARED = 0x3,
+   IBMVFC_AE_FPIN_PORT_DEGRADED= 0x4,
+};
+
 struct ibmvfc_async_crq {
volatile u8 valid;
u8 link_state;
-   u8 pad[2];
+   u8 fpin_status;
+   u8 pad;
__be32 pad2;
volatile __be64 event;
volatile __be64 scsi_id;
@@ -590,6 +651,9 @@ union ibmvfc_iu {
struct ibmvfc_tmf tmf;
struct ibmvfc_cmd cmd;

[PATCH v3 1/2] scsi: ibmvfc: use compiler attribute defines instead of __attribute__()

2020-09-04 Thread Tyrel Datwyler
Update ibmvfc.h structs to use the preferred  __packed and __aligned()
attribute macros defined in include/linux/compiler_attributes.h in place
of __attribute__().

Signed-off-by: Tyrel Datwyler 
---
 drivers/scsi/ibmvscsi/ibmvfc.h | 56 +-
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 907889f1fa9d..6da23666f5be 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -133,16 +133,16 @@ struct ibmvfc_mad_common {
__be16 status;
__be16 length;
__be64 tag;
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_npiv_login_mad {
struct ibmvfc_mad_common common;
struct srp_direct_buf buffer;
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_npiv_logout_mad {
struct ibmvfc_mad_common common;
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 #define IBMVFC_MAX_NAME 256
 
@@ -168,7 +168,7 @@ struct ibmvfc_npiv_login {
u8 device_name[IBMVFC_MAX_NAME];
u8 drc_name[IBMVFC_MAX_NAME];
__be64 reserved2[2];
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_common_svc_parms {
__be16 fcph_version;
@@ -177,7 +177,7 @@ struct ibmvfc_common_svc_parms {
__be16 bb_rcv_sz; /* upper nibble is BB_SC_N */
__be32 ratov;
__be32 edtov;
-}__attribute__((packed, aligned (4)));
+} __packed __aligned(4);
 
 struct ibmvfc_service_parms {
struct ibmvfc_common_svc_parms common;
@@ -192,7 +192,7 @@ struct ibmvfc_service_parms {
__be32 ext_len;
__be32 reserved[30];
__be32 clk_sync_qos[2];
-}__attribute__((packed, aligned (4)));
+} __packed __aligned(4);
 
 struct ibmvfc_npiv_login_resp {
__be32 version;
@@ -217,12 +217,12 @@ struct ibmvfc_npiv_login_resp {
u8 drc_name[IBMVFC_MAX_NAME];
struct ibmvfc_service_parms service_parms;
__be64 reserved2;
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 union ibmvfc_npiv_login_data {
struct ibmvfc_npiv_login login;
struct ibmvfc_npiv_login_resp resp;
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_discover_targets_buf {
__be32 scsi_id[1];
@@ -239,7 +239,7 @@ struct ibmvfc_discover_targets {
__be32 num_avail;
__be32 num_written;
__be64 reserved[2];
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 enum ibmvfc_fc_reason {
IBMVFC_INVALID_ELS_CMD_CODE = 0x01,
@@ -283,7 +283,7 @@ struct ibmvfc_port_login {
struct ibmvfc_service_parms service_parms;
struct ibmvfc_service_parms service_parms_change;
__be64 reserved3[2];
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_prli_svc_parms {
u8 type;
@@ -303,7 +303,7 @@ struct ibmvfc_prli_svc_parms {
 #define IBMVFC_PRLI_TARGET_FUNC0x0010
 #define IBMVFC_PRLI_READ_FCP_XFER_RDY_DISABLED 0x0002
 #define IBMVFC_PRLI_WR_FCP_XFER_RDY_DISABLED   0x0001
-}__attribute__((packed, aligned (4)));
+} __packed __aligned(4);
 
 struct ibmvfc_process_login {
struct ibmvfc_mad_common common;
@@ -314,7 +314,7 @@ struct ibmvfc_process_login {
__be16 error;   /* also fc_reason */
__be32 reserved2;
__be64 reserved3[2];
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_query_tgt {
struct ibmvfc_mad_common common;
@@ -325,13 +325,13 @@ struct ibmvfc_query_tgt {
__be16 fc_explain;
__be16 fc_type;
__be64 reserved[2];
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_implicit_logout {
struct ibmvfc_mad_common common;
__be64 old_scsi_id;
__be64 reserved[2];
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_tmf {
struct ibmvfc_mad_common common;
@@ -348,7 +348,7 @@ struct ibmvfc_tmf {
__be32 my_cancel_key;
__be32 pad;
__be64 reserved[2];
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 enum ibmvfc_fcp_rsp_info_codes {
RSP_NO_FAILURE  = 0x00,
@@ -361,7 +361,7 @@ struct ibmvfc_fcp_rsp_info {
u8 reserved[3];
u8 rsp_code;
u8 reserved2[4];
-}__attribute__((packed, aligned (2)));
+} __packed __aligned(2);
 
 enum ibmvfc_fcp_rsp_flags {
FCP_BIDI_RSP= 0x80,
@@ -377,7 +377,7 @@ enum ibmvfc_fcp_rsp_flags {
 union ibmvfc_fcp_rsp_data {
struct ibmvfc_fcp_rsp_info info;
u8 sense[SCSI_SENSE_BUFFERSIZE + sizeof(struct ibmvfc_fcp_rsp_info)];
-}__attribute__((packed, aligned (8)));
+} __packed __aligned(8);
 
 struct ibmvfc_fcp_rsp {
__be64 reserved;
@@ -388,7 +388,7 @@ struct ibmvfc_fcp_rsp {
__be32 

[PATCH 5/5] powerpc/tau: Disable TAU between measurements

2020-09-04 Thread Finn Thain
Enabling CONFIG_TAU_INT causes random crashes:

Unrecoverable exception 1700 at c0009414 (msr=1000)
Oops: Unrecoverable exception, sig: 6 [#1]
BE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2 PowerMac
Modules linked in:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0-pmac-00043-gd5f545e1a8593 #5
NIP:  c0009414 LR: c0009414 CTR: c00116fc
REGS: c0799eb8 TRAP: 1700   Not tainted  (5.7.0-pmac-00043-gd5f545e1a8593)
MSR:  1000   CR: 22000228  XER: 0100

GPR00:  c0799f70 c076e300 0080 0291c0ac 00e0 c076e300 00049032
GPR08: 0001 c00116fc  dfbd3200  007f80a8  
GPR16:        c075ce04
GPR24: c075ce04 dfff8880 c07b c075ce04 0008 0001 c079ef98 c079ef5c
NIP [c0009414] arch_cpu_idle+0x24/0x6c
LR [c0009414] arch_cpu_idle+0x24/0x6c
Call Trace:
[c0799f70] [0001] 0x1 (unreliable)
[c0799f80] [c0060990] do_idle+0xd8/0x17c
[c0799fa0] [c0060ba4] cpu_startup_entry+0x20/0x28
[c0799fb0] [c072d220] start_kernel+0x434/0x44c
[c0799ff0] [3860] 0x3860
Instruction dump:
   3d20c07b    7c0802a6
   4e800421    7d2000a6
---[ end trace 3a0c9b5cb216db6b ]---

Resolve this problem by disabling each THRMn comparator when handling
the associated THRMn interrupt and by disabling the TAU entirely when
updating THRMn thresholds.

Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2")
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 arch/powerpc/kernel/tau_6xx.c  | 65 +-
 arch/powerpc/platforms/Kconfig |  9 ++---
 2 files changed, 26 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index 614b5b272d9c6..0b4694b8d2482 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -42,8 +42,6 @@ static struct tau_temp
 
 static bool tau_int_enable;
 
-#undef DEBUG
-
 /* TODO: put these in a /proc interface, with some sanity checks, and maybe
  * dynamic adjustment to minimize # of interrupts */
 /* configurable values for step size and how much to expand the window when
@@ -67,42 +65,33 @@ static void set_thresholds(unsigned long cpu)
 
 static void TAUupdate(int cpu)
 {
-   unsigned thrm;
-
-#ifdef DEBUG
-   printk("TAUupdate ");
-#endif
+   u32 thrm;
+   u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;
 
/* if both thresholds are crossed, the step_sizes cancel out
 * and the window winds up getting expanded twice. */
-   if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */
-   if(thrm & THRM1_TIN){ /* crossed low threshold */
-   if (tau[cpu].low >= step_size){
-   tau[cpu].low -= step_size;
-   tau[cpu].high -= (step_size - window_expand);
-   }
-   tau[cpu].grew = 1;
-#ifdef DEBUG
-   printk("low threshold crossed ");
-#endif
+   thrm = mfspr(SPRN_THRM1);
+   if ((thrm & bits) == bits) {
+   mtspr(SPRN_THRM1, 0);
+
+   if (tau[cpu].low >= step_size) {
+   tau[cpu].low -= step_size;
+   tau[cpu].high -= (step_size - window_expand);
}
+   tau[cpu].grew = 1;
+   pr_debug("%s: low threshold crossed\n", __func__);
}
-   if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */
-   if(thrm & THRM1_TIN){ /* crossed high threshold */
-   if (tau[cpu].high <= 127-step_size){
-   tau[cpu].low += (step_size - window_expand);
-   tau[cpu].high += step_size;
-   }
-   tau[cpu].grew = 1;
-#ifdef DEBUG
-   printk("high threshold crossed ");
-#endif
+   thrm = mfspr(SPRN_THRM2);
+   if ((thrm & bits) == bits) {
+   mtspr(SPRN_THRM2, 0);
+
+   if (tau[cpu].high <= 127 - step_size) {
+   tau[cpu].low += (step_size - window_expand);
+   tau[cpu].high += step_size;
}
+   tau[cpu].grew = 1;
+   pr_debug("%s: high threshold crossed\n", __func__);
}
-
-#ifdef DEBUG
-   printk("grew = %d\n", tau[cpu].grew);
-#endif
 }
 
 #ifdef CONFIG_TAU_INT
@@ -127,17 +116,17 @@ void TAUException(struct pt_regs * regs)
 static void tau_timeout(void * info)
 {
int cpu;
-   unsigned long flags;
int size;
int shrink;
 
-   /* disabling interrupts *should* be okay */
-   local_irq_save(flags);
cpu = smp_processor_id();
 
if (!tau_int_enable)
TAUupdate(cpu);
 
+   /* Stop thermal sensor comparisons and interrupts */
+   mtspr(SPRN_THRM3, 0);
+
size = tau[cpu].high - tau[cpu].low;
if (size > min_window && ! 

[PATCH 4/5] powerpc/tau: Check processor type before enabling TAU interrupt

2020-09-04 Thread Finn Thain
According to Freescale's documentation, MPC74XX processors have an
erratum that prevents the TAU interrupt from working, so don't try to
use it when running on those processors.

Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2")
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 arch/powerpc/kernel/tau_6xx.c  | 33 ++---
 arch/powerpc/platforms/Kconfig |  5 ++---
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index b8d7e7d498e0a..614b5b272d9c6 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -40,6 +40,8 @@ static struct tau_temp
unsigned char grew;
 } tau[NR_CPUS];
 
+static bool tau_int_enable;
+
 #undef DEBUG
 
 /* TODO: put these in a /proc interface, with some sanity checks, and maybe
@@ -54,22 +56,13 @@ static struct tau_temp
 
 static void set_thresholds(unsigned long cpu)
 {
-#ifdef CONFIG_TAU_INT
-   /*
-* setup THRM1,
-* threshold, valid bit, enable interrupts, interrupt when below 
threshold
-*/
-   mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | 
THRM1_TID);
+   u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0;
 
-   /* setup THRM2,
-* threshold, valid bit, enable interrupts, interrupt when above 
threshold
-*/
-   mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE);
-#else
-   /* same thing but don't enable interrupts */
-   mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID);
-   mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V);
-#endif
+   /* setup THRM1, threshold, valid bit, interrupt when below threshold */
+   mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | 
THRM1_TID);
+
+   /* setup THRM2, threshold, valid bit, interrupt when above threshold */
+   mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie);
 }
 
 static void TAUupdate(int cpu)
@@ -142,9 +135,8 @@ static void tau_timeout(void * info)
local_irq_save(flags);
cpu = smp_processor_id();
 
-#ifndef CONFIG_TAU_INT
-   TAUupdate(cpu);
-#endif
+   if (!tau_int_enable)
+   TAUupdate(cpu);
 
size = tau[cpu].high - tau[cpu].low;
if (size > min_window && ! tau[cpu].grew) {
@@ -225,6 +217,9 @@ static int __init TAU_init(void)
return 1;
}
 
+   tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) &&
+!strcmp(cur_cpu_spec->platform, "ppc750");
+
tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1, 0);
if (!tau_workq)
return -ENOMEM;
@@ -234,7 +229,7 @@ static int __init TAU_init(void)
queue_work(tau_workq, _work);
 
pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
-   IS_ENABLED(CONFIG_TAU_INT) ? "interrupts" : "workqueue", 
shrink_timer);
+   tau_int_enable ? "interrupts" : "workqueue", shrink_timer);
tau_initialized = 1;
 
return 0;
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index fb7515b4fa9c6..9fe36f0b54c1a 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -223,9 +223,8 @@ config TAU
  temperature within 2-4 degrees Celsius. This option shows the current
  on-die temperature in /proc/cpuinfo if the cpu supports it.
 
- Unfortunately, on some chip revisions, this sensor is very inaccurate
- and in many cases, does not work at all, so don't assume the cpu
- temp is actually what /proc/cpuinfo says it is.
+ Unfortunately, this sensor is very inaccurate when uncalibrated, so
+ don't assume the cpu temp is actually what /proc/cpuinfo says it is.
 
 config TAU_INT
bool "Interrupt driven TAU driver (DANGEROUS)"
-- 
2.26.2



[PATCH 2/5] powerpc/tau: Convert from timer to workqueue

2020-09-04 Thread Finn Thain
Since commit 19dbdcb8039cf ("smp: Warn on function calls from softirq
context") the Thermal Assist Unit driver causes a warning like the
following when CONFIG_SMP is enabled.

[ cut here ]
WARNING: CPU: 0 PID: 0 at kernel/smp.c:428 
smp_call_function_many_cond+0xf4/0x38c
Modules linked in:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0-pmac #3
NIP:  c00b37a8 LR: c00b3abc CTR: c001218c
REGS: c0799c60 TRAP: 0700   Not tainted  (5.7.0-pmac)
MSR:  00029032   CR: 42000224  XER: 

GPR00: c00b3abc c0799d18 c076e300 c079ef5c c0011fec   
GPR08: 0100 0100 8000  42000224  c079d040 c079d044
GPR16: 0001  0004 c0799da0 c079f054 c07a c07a 
GPR24: c0011fec  c079ef5c c079ef5c    
NIP [c00b37a8] smp_call_function_many_cond+0xf4/0x38c
LR [c00b3abc] on_each_cpu+0x38/0x68
Call Trace:
[c0799d18] [] 0x (unreliable)
[c0799d68] [c00b3abc] on_each_cpu+0x38/0x68
[c0799d88] [c0096704] call_timer_fn.isra.26+0x20/0x7c
[c0799d98] [c0096b40] run_timer_softirq+0x1d4/0x3fc
[c0799df8] [c05b4368] __do_softirq+0x118/0x240
[c0799e58] [c0039c44] irq_exit+0xc4/0xcc
[c0799e68] [c000ade8] timer_interrupt+0x1b0/0x230
[c0799ea8] [c0013520] ret_from_except+0x0/0x14
--- interrupt: 901 at arch_cpu_idle+0x24/0x6c
LR = arch_cpu_idle+0x24/0x6c
[c0799f70] [0001] 0x1 (unreliable)
[c0799f80] [c0060990] do_idle+0xd8/0x17c
[c0799fa0] [c0060ba8] cpu_startup_entry+0x24/0x28
[c0799fb0] [c072d220] start_kernel+0x434/0x44c
[c0799ff0] [3860] 0x3860
Instruction dump:
8129f204 2f89 40beff98 3d20c07a 8929eec4 2f89 40beff88 0fe0
8122 552805de 550802ef 4182ff84 <0fe0> 3860 7f65db78 7f44d378
---[ end trace 34a886e47819c2eb ]---

Don't call on_each_cpu() from a timer callback, call it from a worker
thread instead.

Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2")
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 arch/powerpc/kernel/tau_6xx.c | 38 +--
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index 976d5bc1b5176..268205cc347da 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -13,13 +13,14 @@
  */
 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -39,8 +40,6 @@ static struct tau_temp
unsigned char grew;
 } tau[NR_CPUS];
 
-struct timer_list tau_timer;
-
 #undef DEBUG
 
 /* TODO: put these in a /proc interface, with some sanity checks, and maybe
@@ -50,7 +49,7 @@ struct timer_list tau_timer;
 #define step_size  2   /* step size when temp goes out of 
range */
 #define window_expand  1   /* expand the window by this much */
 /* configurable values for shrinking the window */
-#define shrink_timer   2*HZ/* period between shrinking the window */
+#define shrink_timer   2000/* period between shrinking the window */
 #define min_window 2   /* minimum window size, degrees C */
 
 static void set_thresholds(unsigned long cpu)
@@ -187,14 +186,18 @@ static void tau_timeout(void * info)
local_irq_restore(flags);
 }
 
-static void tau_timeout_smp(struct timer_list *unused)
-{
+static struct workqueue_struct *tau_workq;
 
-   /* schedule ourselves to be run again */
-   mod_timer(_timer, jiffies + shrink_timer) ;
+static void tau_work_func(struct work_struct *work)
+{
+   msleep(shrink_timer);
on_each_cpu(tau_timeout, NULL, 0);
+   /* schedule ourselves to be run again */
+   queue_work(tau_workq, work);
 }
 
+DECLARE_WORK(tau_work, tau_work_func);
+
 /*
  * setup the TAU
  *
@@ -227,21 +230,16 @@ static int __init TAU_init(void)
return 1;
}
 
-
-   /* first, set up the window shrinking timer */
-   timer_setup(_timer, tau_timeout_smp, 0);
-   tau_timer.expires = jiffies + shrink_timer;
-   add_timer(_timer);
+   tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1, 0);
+   if (!tau_workq)
+   return -ENOMEM;
 
on_each_cpu(TAU_init_smp, NULL, 0);
 
-   printk("Thermal assist unit ");
-#ifdef CONFIG_TAU_INT
-   printk("using interrupts, ");
-#else
-   printk("using timers, ");
-#endif
-   printk("shrink_timer: %d jiffies\n", shrink_timer);
+   queue_work(tau_workq, _work);
+
+   pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
+   IS_ENABLED(CONFIG_TAU_INT) ? "interrupts" : "workqueue", 
shrink_timer);
tau_initialized = 1;
 
return 0;
-- 
2.26.2



[PATCH 3/5] powerpc/tau: Remove duplicated set_thresholds() call

2020-09-04 Thread Finn Thain
The commentary at the call site seems to disagree with the code. The
conditional prevents calling set_thresholds() via the exception handler,
which appears to crash. Perhaps that's because it immediately triggers
another TAU exception. Anyway, calling set_thresholds() from TAUupdate()
is redundant because tau_timeout() does so.

Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2")
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 arch/powerpc/kernel/tau_6xx.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index 268205cc347da..b8d7e7d498e0a 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -110,11 +110,6 @@ static void TAUupdate(int cpu)
 #ifdef DEBUG
printk("grew = %d\n", tau[cpu].grew);
 #endif
-
-#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */
-   set_thresholds(cpu);
-#endif
-
 }
 
 #ifdef CONFIG_TAU_INT
-- 
2.26.2



[PATCH 1/5] powerpc/tau: Use appropriate temperature sample interval

2020-09-04 Thread Finn Thain
According to the MPC750 Users Manual, the SITV value in Thermal
Management Register 3 is 13 bits long. The present code calculates the
SITV value as 60 * 500 cycles. This would overflow to give 10 us on
a 500 MHz CPU rather than the intended 60 us. (But according to the
Microprocessor Datasheet, there is also a factor of 266 that has to be
applied to this value on certain parts i.e. speed sort above 266 MHz.)
Always use the maximum cycle count, as recommended by the Datasheet.

Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2")
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 arch/powerpc/include/asm/reg.h |  2 +-
 arch/powerpc/kernel/tau_6xx.c  | 12 
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 88e6c78100d9b..c750afc62887c 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -815,7 +815,7 @@
 #define THRM1_TIN  (1 << 31)
 #define THRM1_TIV  (1 << 30)
 #define THRM1_THRES(x) ((x&0x7f)<<23)
-#define THRM3_SITV(x)  ((x&0x3fff)<<1)
+#define THRM3_SITV(x)  ((x & 0x1fff) << 1)
 #define THRM1_TID  (1<<2)
 #define THRM1_TIE  (1<<1)
 #define THRM1_V(1<<0)
diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index e2ab8a111b693..976d5bc1b5176 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -178,15 +178,11 @@ static void tau_timeout(void * info)
 * complex sleep code needs to be added. One mtspr every time
 * tau_timeout is called is probably not a big deal.
 *
-* Enable thermal sensor and set up sample interval timer
-* need 20 us to do the compare.. until a nice 'cpu_speed' function
-* call is implemented, just assume a 500 mhz clock. It doesn't really
-* matter if we take too long for a compare since it's all interrupt
-* driven anyway.
-*
-* use a extra long time.. (60 us @ 500 mhz)
+* The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
+* recommends that "the maximum value be set in THRM3 under all
+* conditions."
 */
-   mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E);
+   mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
 
local_irq_restore(flags);
 }
-- 
2.26.2



[PATCH 0/5] powerpc/tau: TAU driver fixes

2020-09-04 Thread Finn Thain
This patch series fixes various bugs in the Thermal Assist Unit driver.
It was tested on 266 MHz and 292 MHz PowerBook G3 laptops.


Finn Thain (5):
  powerpc/tau: Use appropriate temperature sample interval
  powerpc/tau: Convert from timer to workqueue
  powerpc/tau: Remove duplicated set_thresholds() call
  powerpc/tau: Check processor type before enabling TAU interrupt
  powerpc/tau: Disable TAU between measurements

 arch/powerpc/include/asm/reg.h |   2 +-
 arch/powerpc/kernel/tau_6xx.c  | 147 +
 arch/powerpc/platforms/Kconfig |  14 +---
 3 files changed, 62 insertions(+), 101 deletions(-)

-- 
2.26.2



[Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8

2020-09-04 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=208181

Erhard F. (erhar...@mailbox.org) changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution|--- |OBSOLETE

--- Comment #19 from Erhard F. (erhar...@mailbox.org) ---
I noticed that I covered the "do_IRQ: stack overflow: " problem already in
bug #207129 so closing this one as suggested before.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[Bug 205099] KASAN hit at raid6_pq: BUG: Unable to handle kernel data access at 0x00f0fd0d

2020-09-04 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205099

Erhard F. (erhar...@mailbox.org) changed:

   What|Removed |Added

 Attachment #288413|0   |1
is obsolete||

--- Comment #31 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 292347
  --> https://bugzilla.kernel.org/attachment.cgi?id=292347&action=edit
kernel .config (5.9-rc3, OUTLINE KASAN, PowerMac G4 DP)

Does happen even if RAID support is not actively selected in the config as
btrfs pulls in RAID6_PQ on its own.

# CONFIG_DM_RAID is not set
CONFIG_RAID6_PQ=m

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[Bug 205099] KASAN hit at raid6_pq: BUG: Unable to handle kernel data access at 0x00f0fd0d

2020-09-04 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205099

Erhard F. (erhar...@mailbox.org) changed:

   What|Removed |Added

 Attachment #287625|0   |1
is obsolete||
 Attachment #288411|0   |1
is obsolete||

--- Comment #30 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 292345
  --> https://bugzilla.kernel.org/attachment.cgi?id=292345&action=edit
dmesg (5.9-rc3, OUTLINE KASAN, PowerMac G4 DP)

Re-tested with v5.9-rc3 out of curiosity. Not much change here, the bug shows
up with OUTLINE KASAN but not with INLINE KASAN, everything else being equal:

==
BUG: KASAN: user-memory-access in raid6_altivec8_gen_syndrome_real+0x2b0/0x480
[raid6_pq]
Read of size 4 at addr 5764b118 by task modprobe/126

CPU: 1 PID: 126 Comm: modprobe Tainted: GW 5.9.0-rc3-PowerMacG4
#2
Call Trace:
[e32cb7b8] [c0517aac] dump_stack+0xc4/0xf8 (unreliable)
[e32cb7e8] [c026e73c] kasan_report+0x16c/0x170
[e32cb828] [b02004e0] raid6_altivec8_gen_syndrome_real+0x2b0/0x480 [raid6_pq]
[e32cba18] [b02006fc] raid6_altivec8_gen_syndrome+0x4c/0x88 [raid6_pq]
[e32cba38] [b021a42c] init_module+0x42c/0x590 [raid6_pq]
[e32cbb08] [c00058a0] do_one_initcall+0xb8/0x3dc
[e32cbbd8] [c011c0fc] do_init_module+0xa8/0x2c4
[e32cbc08] [c011f02c] load_module+0x2b98/0x2d4c
[e32cbe18] [c011f448] sys_finit_module+0x100/0x138
[e32cbf38] [c001a1cc] ret_from_syscall+0x0/0x34
--- interrupt: c01 at 0x3d2068
LR = 0x506104
==
BUG: Unable to handle kernel data access on read at 0x5764b118
Faulting instruction address: 0xb02004e0
Oops: Kernel access of bad area, sig: 11 [#1]

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8

2020-09-04 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=208181

--- Comment #18 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 292339
  --> https://bugzilla.kernel.org/attachment.cgi?id=292339&action=edit
kernel .config (5.9-rc3, PowerMac G4 DP)

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8

2020-09-04 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=208181

--- Comment #17 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 292337
  --> https://bugzilla.kernel.org/attachment.cgi?id=292337&action=edit
dmesg (5.9-rc3, INLINE KASAN, PowerMac G4 DP)

Re-tried with 5.9-rc3 (inline KASAN). The original problem (stack-out-of-bounds
in strcmp+0x58/0xd8) is gone, but still problems with stack usage when doing
larger build jobs:

[...]
[ 1929.683510] do_IRQ: stack overflow: 1696
[ 1929.690727] CPU: 1 PID: 735 Comm: mount.nfs Tainted: GW
5.9.0-rc3-PowerMacG4 #1
[ 1929.697847] Call Trace:
[ 1929.704633] [d0ca4670] [c0a75518] dump_stack+0xfc/0x130 (unreliable)
[ 1929.711507] [d0ca46a0] [c000b094] do_IRQ+0x128/0x180
[ 1929.717998] [d0ca46d0] [c002e560] ret_from_except+0x0/0x14
[ 1929.724652] --- interrupt: 501 at _raw_spin_unlock_irqrestore+0x3c/0xa4
   LR = _raw_spin_unlock_irqrestore+0x38/0xa4
[ 1929.738722] [d0ca47b8] [c0a6dc90] stack_depot_save+0x20c/0x390
[ 1929.746132] [d0ca4818] [c04d4b70] kasan_save_stack+0x40/0x48
[ 1929.753675] [d0ca4928] [c04d4b9c] kasan_set_track+0x24/0x30
[ 1929.761298] [d0ca4938] [c04d710c] kasan_set_free_info+0x28/0x3c
[ 1929.769073] [d0ca4948] [c04d4f74] __kasan_slab_free+0x104/0x118
[ 1929.776983] [d0ca4968] [c04ce800] slab_free_freelist_hook+0xec/0x17c
[ 1929.785111] [d0ca49a8] [c04d3468] kmem_cache_free+0x58/0x2a0
[ 1929.793391] [d0ca49f8] [c11b251c] packet_rcv+0xb9c/0xbb4
[ 1929.801797] [d0ca4a48] [c0dbfd98] dev_queue_xmit_nit+0x6e4/0x748
[ 1929.810434] [d0ca4ab8] [c0dcaf80] dev_hard_start_xmit+0xec/0x880
[ 1929.819207] [d0ca4b18] [c0ea4814] sch_direct_xmit+0x1f8/0x818
[ 1929.828111] [d0ca4bf8] [c0dcc884] __dev_queue_xmit+0xed4/0x136c
[ 1929.837202] [d0ca4d28] [c0f256dc] ip_finish_output2+0xfcc/0x1028
[ 1929.846472] [d0ca4d88] [c0f2d848] __ip_queue_xmit+0xde0/0x1018
[ 1929.855892] [d0ca4df8] [c0f929d8] __tcp_transmit_skb+0x2550/0x2cb8
[ 1929.865486] [d0ca4ee8] [c0f98470] tcp_write_xmit+0x1d28/0x3498
[ 1929.875216] [d0ca4f78] [c0f99c8c] __tcp_push_pending_frames+0xac/0x1c4
[ 1929.885189] [d0ca4f98] [c0f5a970] tcp_sendmsg_locked+0x1c50/0x2294
[ 1929.895338] [d0ca5098] [c0f5afe4] tcp_sendmsg+0x30/0x48
[ 1929.905564] [d0ca50b8] [c0d598b0] sock_sendmsg_nosec+0xf4/0x10c
[ 1929.916463] [d0ca50d8] [b0a31840] xprt_sock_sendmsg+0x2c0/0x6e8 [sunrpc]
[ 1929.927494] [d0ca51b8] [b0a34ce8] xs_tcp_send_request+0x360/0x580 [sunrpc]
[ 1929.938699] [d0ca52e8] [b0a2eae8] xprt_transmit+0x4f8/0xe30 [sunrpc]
[ 1929.950044] [d0ca5368] [b0a1dcd8] call_transmit+0x238/0x25c [sunrpc]
[ 1929.961450] [d0ca5388] [b0a6641c] __rpc_execute+0x35c/0xbf8 [sunrpc]
[ 1929.972996] [d0ca5448] [b0a21d18] rpc_run_task+0x790/0x79c [sunrpc]
[ 1929.984850] [d0ca5498] [b1282e50] nfs4_call_sync_custom+0x14/0x80 [nfsv4]
[ 1929.996821] [d0ca54b8] [b128302c] nfs4_do_call_sync+0x170/0x1a8 [nfsv4]
[ 1930.008922] [d0ca55a8] [b12b3570] nfs4_proc_lookup_common+0x314/0xc54
[nfsv4]
[ 1930.020820] [d0ca5758] [b12b4244] nfs4_proc_lookup+0x158/0x2f0 [nfsv4]
[ 1930.032753] [d0ca57f8] [b0b49544] nfs_lookup+0x2ac/0x9ac [nfs]
[ 1930.044062] [d0ca5838] [c052c984] __lookup_slow+0x278/0x2a8
[ 1930.055461] [d0ca5958] [c05340a0] walk_component+0x288/0x30c
[ 1930.066816] [d0ca5a08] [c0534e5c] path_lookupat.isra.0+0x1b8/0x438
[ 1930.078282] [d0ca5a48] [c05372a0] filename_lookup+0x144/0x1c4
[ 1930.089834] [d0ca5b98] [c05373fc] vfs_path_lookup+0x94/0xc0
[ 1930.101389] [d0ca5c18] [c05714b8] mount_subtree+0x1c4/0x250
[ 1930.113267] [d0ca5ca8] [b12e1b2c] do_nfs4_mount+0x570/0x7fc [nfsv4]
[ 1930.125298] [d0ca5d68] [b12e202c] nfs4_try_get_tree+0xfc/0x16c [nfsv4]
[ 1930.137200] [d0ca5d88] [c050e434] vfs_get_tree+0xf8/0x398
[ 1930.149133] [d0ca5db8] [c056f968] path_mount+0x1074/0x113c
[ 1930.161107] [d0ca5e78] [c056fad8] do_mount+0xa8/0xe4
[ 1930.173109] [d0ca5f08] [c0570054] sys_mount+0xa8/0xb8
[ 1930.185160] [d0ca5f38] [c002e1cc] ret_from_syscall+0x0/0x34
[ 1930.197313] --- interrupt: c01 at 0x8b5754
   LR = 0xac0be0
[ 1930.222896] Kernel panic - not syncing: corrupted stack end detected inside
scheduler


But feel free to close this bug if appropriate as the original issue is solved.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

RE: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-04 Thread David Laight
From: Alexey Dobriyan
> Sent: 04 September 2020 18:58
> 
> On Fri, Sep 04, 2020 at 08:00:24AM +0200, Ingo Molnar wrote:
> > * Christoph Hellwig  wrote:
> > > this series removes the last set_fs() used to force a kernel address
> > > space for the uaccess code in the kernel read/write/splice code, and then
> > > stops implementing the address space overrides entirely for x86 and
> > > powerpc.
> >
> > Cool! For the x86 bits:
> >
> >   Acked-by: Ingo Molnar 
> 
> set_fs() is older than some kernel hackers!
> 
>   $ cd linux-0.11/
>   $ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3
>   ./include/asm/segment.h:61:extern inline void set_fs(unsigned long val)
>   ./include/asm/segment.h-62-{
>   ./include/asm/segment.h-63- __asm__("mov %0,%%fs"::"a" ((unsigned 
> short) val));
>   ./include/asm/segment.h-64-}

What is this strange %fs register you are talking about.
Figure 2-4 only has CS, DS, SS and ES.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)


Re: [PATCH 0/1] powerpc/numa: do not skip node 0 in lookup table

2020-09-04 Thread Daniel Henrique Barboza

I discussed this a bit with Aneesh Kumar in IBM internal Slack, a few weeks
ago, and he informed me that this patch does not make sense with the
design used by the kernel. The kernel will assume that, for node 0, all
associativity domains must also be zeroed. This is why node 0 is skipped
when creating the distance table.

This of course has consequences for QEMU, so based on that, I've adapted
the QEMU implementation to not touch node 0.



Daniel

On 8/14/20 5:34 PM, Daniel Henrique Barboza wrote:

Hi,

This is a simple fix that I made while testing NUMA changes
I'm making in QEMU [1]. Setting any non-zero value to the
associativity of NUMA node 0 has no impact in the output
of 'numactl' because the distance_lookup_table is never
initialized for node 0.

Seeing through the LOPAPR spec and git history I found no
technical reason to skip node 0, which makes me believe this is
a bug that got under the radar up until now because no one
attempted to set node 0 associativity like I'm doing now.

For anyone wishing to give it a spin, using the QEMU build
in [1] and experimenting with NUMA distances, such as:

sudo ./qemu-system-ppc64 -machine 
pseries-5.2,accel=kvm,usb=off,dump-guest-core=off -m 65536 -overcommit 
mem-lock=off -smp 4,sockets=4,cores=1,threads=1 -rtc base=utc -display none 
-vga none -nographic -boot menu=on -device 
spapr-pci-host-bridge,index=1,id=pci.1 -device 
spapr-pci-host-bridge,index=2,id=pci.2 -device 
spapr-pci-host-bridge,index=3,id=pci.3 -device 
spapr-pci-host-bridge,index=4,id=pci.4 -device 
qemu-xhci,id=usb,bus=pci.0,addr=0x2 -drive 
file=/home/danielhb/f32.qcow2,format=qcow2,if=none,id=drive-virtio-disk0 
-device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x3,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
 -device usb-kbd,id=input0,bus=usb.0,port=1 -device 
usb-mouse,id=input1,bus=usb.0,port=2 -device 
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -msg timestamp=on \
-numa node,nodeid=0,cpus=0 -numa node,nodeid=1,cpus=1 \
-numa node,nodeid=2,cpus=2 -numa node,nodeid=3,cpus=3 \
-numa dist,src=0,dst=1,val=80 -numa dist,src=0,dst=2,val=80 \
-numa dist,src=0,dst=3,val=80 -numa dist,src=1,dst=2,val=80 \
-numa dist,src=1,dst=3,val=80 -numa dist,src=2,dst=3,val=80

The current kernel code will ignore the associativity of
node 0, and numactl will output this:

node distances:
node   0   1   2   3
   0:  10  160  160  160
   1:  160  10  80  80
   2:  160  80  10  80
   3:  160  80  80  10

With this patch:

node distances:
node   0   1   2   3
   0:  10  160  160  160
   1:  160  10  80  40
   2:  160  80  10  20
   3:  160  40  20  10


If anyone wonders, this patch has no conflict with the proposed
NUMA changes in [2] because Aneesh isn't changing this line.


[1] https://github.com/danielhb/qemu/tree/spapr_numa_v1
[2] 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/2020073916.243569-1-aneesh.ku...@linux.ibm.com/


Daniel Henrique Barboza (1):
   powerpc/numa: do not skip node 0 when init lookup table

  arch/powerpc/mm/numa.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)



Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-04 Thread Linus Torvalds
On Fri, Sep 4, 2020 at 10:58 AM Alexey Dobriyan  wrote:
>
> set_fs() is older than some kernel hackers!
>
> $ cd linux-0.11/
> $ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3

Oh, it's older than that. It was there (as set_fs) in 0.10, and may
even predate that. But sadly, I don't have tar-balls for 0.02 and
0.03, so can't check.

The actual use of %fs as the user space segment is already there in
0.01, but there was no 'set_fs()'. That was a simpler and more direct
time, and "get_fs()" looked like this back then:

  #define _fs() ({ \
  register unsigned short __res; \
  __asm__("mov %%fs,%%ax":"=a" (__res):); \
  __res;})

and all the setting was basically part of the kernel entry asm and. Lovely.

 Linus


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-04 Thread Alexey Dobriyan
On Fri, Sep 04, 2020 at 08:00:24AM +0200, Ingo Molnar wrote:
> * Christoph Hellwig  wrote:
> > this series removes the last set_fs() used to force a kernel address
> > space for the uaccess code in the kernel read/write/splice code, and then
> > stops implementing the address space overrides entirely for x86 and
> > powerpc.
> 
> Cool! For the x86 bits:
> 
>   Acked-by: Ingo Molnar 

set_fs() is older than some kernel hackers!

$ cd linux-0.11/
$ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3
./include/asm/segment.h:61:extern inline void set_fs(unsigned long val)
./include/asm/segment.h-62-{
./include/asm/segment.h-63- __asm__("mov %0,%%fs"::"a" ((unsigned 
short) val));
./include/asm/segment.h-64-}


[PATCH] kbuild: preprocess module linker script

2020-09-04 Thread Masahiro Yamada
There was a request to preprocess the module linker script like we do
for the vmlinux one (https://lkml.org/lkml/2020/8/21/512).

The difference between vmlinux.lds and module.lds is that the latter
is needed for external module builds, thus must be cleaned up by
'make mrproper' instead of 'make clean' (also, it must be created by
'make modules_prepare').

You cannot put it in arch/*/kernel/ because 'make clean' descends into
it. I moved arch/*/kernel/module.lds to arch/*/include/asm/module.lds.h,
which is included from scripts/module.lds.S.

scripts/module.lds is fine because 'make clean' keeps all the build
artifacts under scripts/.

You can add arch-specific sections in <asm/module.lds.h>.

Signed-off-by: Masahiro Yamada 
Tested-by: Jessica Yu 
---

 Makefile   |  1 -
 arch/arm/Makefile  |  4 
 .../{kernel/module.lds => include/asm/module.lds.h}|  2 ++
 arch/arm64/Makefile|  4 
 .../{kernel/module.lds => include/asm/module.lds.h}|  2 ++
 arch/ia64/Makefile |  1 -
 arch/ia64/{module.lds => include/asm/module.lds.h} |  0
 arch/m68k/Makefile |  1 -
 .../{kernel/module.lds => include/asm/module.lds.h}|  0
 arch/powerpc/Makefile  |  1 -
 .../{kernel/module.lds => include/asm/module.lds.h}|  0
 arch/riscv/Makefile|  3 ---
 .../{kernel/module.lds => include/asm/module.lds.h}|  3 ++-
 arch/um/include/asm/Kbuild |  1 +
 include/asm-generic/Kbuild |  1 +
 include/asm-generic/module.lds.h   | 10 ++
 scripts/.gitignore |  1 +
 scripts/Makefile   |  2 ++
 scripts/Makefile.modfinal  |  5 ++---
 scripts/{module-common.lds => module.lds.S}|  3 +++
 scripts/package/builddeb   |  2 +-
 21 files changed, 27 insertions(+), 20 deletions(-)
 rename arch/arm/{kernel/module.lds => include/asm/module.lds.h} (72%)
 rename arch/arm64/{kernel/module.lds => include/asm/module.lds.h} (76%)
 rename arch/ia64/{module.lds => include/asm/module.lds.h} (100%)
 rename arch/m68k/{kernel/module.lds => include/asm/module.lds.h} (100%)
 rename arch/powerpc/{kernel/module.lds => include/asm/module.lds.h} (100%)
 rename arch/riscv/{kernel/module.lds => include/asm/module.lds.h} (84%)
 create mode 100644 include/asm-generic/module.lds.h
 rename scripts/{module-common.lds => module.lds.S} (93%)

diff --git a/Makefile b/Makefile
index 9cac6fde3479..3d9b56c6b47e 100644
--- a/Makefile
+++ b/Makefile
@@ -506,7 +506,6 @@ KBUILD_CFLAGS_KERNEL :=
 KBUILD_AFLAGS_MODULE  := -DMODULE
 KBUILD_CFLAGS_MODULE  := -DMODULE
 KBUILD_LDFLAGS_MODULE :=
-export KBUILD_LDS_MODULE := $(srctree)/scripts/module-common.lds
 KBUILD_LDFLAGS :=
 CLANG_FLAGS :=
 
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 4e877354515f..a0cb15de9677 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -16,10 +16,6 @@ LDFLAGS_vmlinux  += --be8
 KBUILD_LDFLAGS_MODULE  += --be8
 endif
 
-ifeq ($(CONFIG_ARM_MODULE_PLTS),y)
-KBUILD_LDS_MODULE  += $(srctree)/arch/arm/kernel/module.lds
-endif
-
 GZFLAGS:=-9
 #KBUILD_CFLAGS +=-pipe
 
diff --git a/arch/arm/kernel/module.lds b/arch/arm/include/asm/module.lds.h
similarity index 72%
rename from arch/arm/kernel/module.lds
rename to arch/arm/include/asm/module.lds.h
index 79cb6af565e5..0e7cb4e314b4 100644
--- a/arch/arm/kernel/module.lds
+++ b/arch/arm/include/asm/module.lds.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_ARM_MODULE_PLTS
 SECTIONS {
.plt : { BYTE(0) }
.init.plt : { BYTE(0) }
 }
+#endif
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 55bc8546d9c7..232547ec07d8 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -115,10 +115,6 @@ endif
 
 CHECKFLAGS += -D__aarch64__
 
-ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
-KBUILD_LDS_MODULE  += $(srctree)/arch/arm64/kernel/module.lds
-endif
-
 ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y)
   KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
   CC_FLAGS_FTRACE := -fpatchable-function-entry=2
diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/include/asm/module.lds.h
similarity index 76%
rename from arch/arm64/kernel/module.lds
rename to arch/arm64/include/asm/module.lds.h
index 22e36a21c113..691f15af788e 100644
--- a/arch/arm64/kernel/module.lds
+++ b/arch/arm64/include/asm/module.lds.h
@@ -1,5 +1,7 @@
+#ifdef CONFIG_ARM64_MODULE_PLTS
 SECTIONS {
.plt (NOLOAD) : { BYTE(0) }
.init.plt (NOLOAD) : { BYTE(0) }
.text.ftrace_trampoline (NOLOAD) : { BYTE(0) }
 }
+#endif
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index 2876a7df1b0a..703b1c4f6d12 100644
--- a/arch/ia64/Makefile

[PATCH 3/3] powerpc/uaccess: Remove __put_user_asm() and __put_user_asm2()

2020-09-04 Thread Christophe Leroy
__put_user_asm() and __put_user_asm2() are not used anymore.

Remove them.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 41 --
 1 file changed, 5 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 96d1c144f92b..26781b044932 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -151,42 +151,6 @@ static inline int __access_ok(unsigned long addr, unsigned 
long size,
 
 extern long __put_user_bad(void);
 
-/*
- * We don't tell gcc that we are accessing memory, but this is OK
- * because we do not write to any memory gcc knows about, so there
- * are no aliasing issues.
- */
-#define __put_user_asm(x, addr, err, op)   \
-   __asm__ __volatile__(   \
-   "1: " op "%U2%X2 %1,%2  # put_user\n"   \
-   "2:\n"  \
-   ".section .fixup,\"ax\"\n"  \
-   "3: li %0,%3\n" \
-   "   b 2b\n" \
-   ".previous\n"   \
-   EX_TABLE(1b, 3b)\
-   : "=r" (err)\
-   : "r" (x), "m<>" (*addr), "i" (-EFAULT), "0" (err))
-
-#ifdef __powerpc64__
-#define __put_user_asm2(x, ptr, retval)\
- __put_user_asm(x, ptr, retval, "std")
-#else /* __powerpc64__ */
-#define __put_user_asm2(x, addr, err)  \
-   __asm__ __volatile__(   \
-   "1: stw%X2 %1,%2\n" \
-   "2: stw%X2 %L1,%L2\n"   \
-   "3:\n"  \
-   ".section .fixup,\"ax\"\n"  \
-   "4: li %0,%3\n" \
-   "   b 3b\n" \
-   ".previous\n"   \
-   EX_TABLE(1b, 4b)\
-   EX_TABLE(2b, 4b)\
-   : "=r" (err)\
-   : "r" (x), "m" (*addr), "i" (-EFAULT), "0" (err))
-#endif /* __powerpc64__ */
-
 #define __put_user_size_allowed(x, ptr, size, retval)  \
 do {   \
__label__ __pu_failed;  \
@@ -249,6 +213,11 @@ do {   
\
 })
 
 
+/*
+ * We don't tell gcc that we are accessing memory, but this is OK
+ * because we do not write to any memory gcc knows about, so there
+ * are no aliasing issues.
+ */
 #define __put_user_asm_goto(x, addr, label, op)\
asm volatile goto(  \
"1: " op "%U1%X1 %0,%1  # put_user\n"   \
-- 
2.25.0



[PATCH 2/3] powerpc/uaccess: Switch __patch_instruction() to __put_user_asm_goto()

2020-09-04 Thread Christophe Leroy
__patch_instruction() is the only user of __put_user_asm() outside
of asm/uaccess.h

Switch to the new __put_user_asm_goto() to enable retirement of
__put_user_asm() in a later patch.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/lib/code-patching.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 8c3934ea6220..2333625b5e31 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -21,21 +21,18 @@
 static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst 
instr,
   struct ppc_inst *patch_addr)
 {
-   int err = 0;
-
-   if (!ppc_inst_prefixed(instr)) {
-   __put_user_asm(ppc_inst_val(instr), patch_addr, err, "stw");
-   } else {
-   __put_user_asm(ppc_inst_as_u64(instr), patch_addr, err, "std");
-   }
-
-   if (err)
-   return err;
+   if (!ppc_inst_prefixed(instr))
+   __put_user_asm_goto(ppc_inst_val(instr), patch_addr, failed, 
"stw");
+   else
+   __put_user_asm_goto(ppc_inst_as_u64(instr), patch_addr, failed, 
"std");
 
asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr),
"r" (exec_addr));
 
return 0;
+
+failed:
+   return -EFAULT;
 }
 
 int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
-- 
2.25.0



[PATCH 1/3] powerpc/uaccess: Switch __put_user_size_allowed() to __put_user_asm_goto()

2020-09-04 Thread Christophe Leroy
__put_user_asm_goto() provides more flexibility to GCC and avoids using
a local variable to tell if the write succeeded or not.
GCC can then avoid implementing a cmp in the fast path.

See the difference for a small function like the PPC64 version of
save_general_regs() in arch/powerpc/kernel/signal_32.c:

Before the patch (unreachable nop removed):

0c10 <.save_general_regs>:
 c10:   39 20 00 2c li  r9,44
 c14:   39 40 00 00 li  r10,0
 c18:   7d 29 03 a6 mtctr   r9
 c1c:   38 c0 00 00 li  r6,0
 c20:   48 00 00 14 b   c34 <.save_general_regs+0x24>
 c30:   42 40 00 40 bdz c70 <.save_general_regs+0x60>
 c34:   28 2a 00 27 cmpldi  r10,39
 c38:   7c c8 33 78 mr  r8,r6
 c3c:   79 47 1f 24 rldicr  r7,r10,3,60
 c40:   39 20 00 01 li  r9,1
 c44:   41 82 00 0c beq c50 <.save_general_regs+0x40>
 c48:   7d 23 38 2a ldx r9,r3,r7
 c4c:   79 29 00 20 clrldi  r9,r9,32
 c50:   91 24 00 00 stw r9,0(r4)
 c54:   2c 28 00 00 cmpdi   r8,0
 c58:   39 4a 00 01 addir10,r10,1
 c5c:   38 84 00 04 addir4,r4,4
 c60:   41 82 ff d0 beq c30 <.save_general_regs+0x20>
 c64:   38 60 ff f2 li  r3,-14
 c68:   4e 80 00 20 blr
 c70:   38 60 00 00 li  r3,0
 c74:   4e 80 00 20 blr

 <.fixup>:
  cc:   39 00 ff f2 li  r8,-14
  d0:   48 00 00 00 b   d0 <.fixup+0xd0>
d0: R_PPC64_REL24   .text+0xc54

After the patch:

1490 <.save_general_regs>:
1490:   39 20 00 2c li  r9,44
1494:   39 40 00 00 li  r10,0
1498:   7d 29 03 a6 mtctr   r9
149c:   60 00 00 00 nop
14a0:   28 2a 00 27 cmpldi  r10,39
14a4:   79 48 1f 24 rldicr  r8,r10,3,60
14a8:   39 20 00 01 li  r9,1
14ac:   41 82 00 0c beq 14b8 <.save_general_regs+0x28>
14b0:   7d 23 40 2a ldx r9,r3,r8
14b4:   79 29 00 20 clrldi  r9,r9,32
14b8:   91 24 00 00 stw r9,0(r4)
14bc:   39 4a 00 01 addir10,r10,1
14c0:   38 84 00 04 addir4,r4,4
14c4:   42 00 ff dc bdnz14a0 <.save_general_regs+0x10>
14c8:   38 60 00 00 li  r3,0
14cc:   4e 80 00 20 blr
14d0:   38 60 ff f2 li  r3,-14
14d4:   4e 80 00 20 blr

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index a5cfe867fbdc..96d1c144f92b 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -189,14 +189,14 @@ extern long __put_user_bad(void);
 
 #define __put_user_size_allowed(x, ptr, size, retval)  \
 do {   \
+   __label__ __pu_failed;  \
+   \
retval = 0; \
-   switch (size) { \
- case 1: __put_user_asm(x, ptr, retval, "stb"); break; \
- case 2: __put_user_asm(x, ptr, retval, "sth"); break; \
- case 4: __put_user_asm(x, ptr, retval, "stw"); break; \
- case 8: __put_user_asm2(x, ptr, retval); break;   \
- default: __put_user_bad();\
-   }   \
+   __put_user_size_goto(x, ptr, size, __pu_failed);\
+   break;  \
+   \
+__pu_failed:   \
+   retval = -EFAULT;   \
 } while (0)
 
 #define __put_user_size(x, ptr, size, retval)  \
-- 
2.25.0



[PATCH] powerpc/uaccess: Add pre-update addressing to __put_user_asm_goto()

2020-09-04 Thread Christophe Leroy
Enable pre-update addressing mode in __put_user_asm_goto()

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 7c2427f237e1..a5cfe867fbdc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -254,7 +254,7 @@ do {
\
"1: " op "%U1%X1 %0,%1  # put_user\n"   \
EX_TABLE(1b, %l2)   \
:   \
-   : "r" (x), "m" (*addr)  \
+   : "r" (x), "m<>" (*addr)\
:   \
: label)
 
-- 
2.25.0



Re: ptrace_syscall_32 is failing

2020-09-04 Thread Thomas Gleixner
Andy,

On Wed, Sep 02 2020 at 09:49, Andy Lutomirski wrote:
> On Wed, Sep 2, 2020 at 1:29 AM Thomas Gleixner  wrote:
>>
>> But you might tell me where exactly you want to inject the SIGTRAP in
>> the syscall exit code flow.
>
> It would be a bit complicated.  Definitely after any signals from the
> syscall are delivered.  Right now, I think that we don't deliver a
> SIGTRAP on the instruction boundary after SYSCALL while
> single-stepping.  (I think we used to, but only sometimes, and now we
> are at least consistent.)  This is because IRET will not trap if it
> starts with TF clear and ends up setting it.  (I asked Intel to
> document this, and I think they finally did, although I haven't gotten
> around to reading the new docs.  Certainly the old docs as of a year
> or two ago had no description whatsoever of how TF changes worked.)
>
> Deciding exactly *when* a trap should occur would be nontrivial -- we
> can't trap on sigreturn() from a SIGTRAP, for example.
>
> So this isn't fully worked out.

Oh well.

>> >> I don't think we want that in general. The current variant is perfectly
>> >> fine for everything except the 32bit fast syscall nonsense. Also
>> >> irqentry_entry/exit is not equivalent to the syscall_enter/exit
>> >> counterparts.
>> >
>> > If there are any architectures in which actual work is needed to
>> > figure out whether something is a syscall in the first place, they'll
>> > want to do the usual kernel entry work before the syscall entry work.
>>
>> That's low level entry code which does not require RCU, lockdep, tracing
>> or whatever muck we setup before actual work can be done.
>>
>> arch_asm_entry()
>>   ...
>>   arch_c_entry(cause) {
>> switch(cause) {
>>   case EXCEPTION: arch_c_exception(...);
>>   case SYSCALL: arch_c_syscall(...);
>>   ...
>> }
>
> You're assuming that figuring out the cause doesn't need the kernel
> entry code to run first.  In the case of the 32-bit vDSO fast
> syscalls, we arguably don't know whether an entry is a syscall until
> we have done a user memory access.  Logically, we're doing:
>
> if (get_user() < 0) {
>   /* Not a syscall.  This is actually a silly operation that sets AX =
> -EFAULT and returns.  Do not audit or invoke ptrace. */
> } else {
>   /* This actually is a syscall. */
> }

Yes, that's what I've addressed with providing split interfaces.

>> You really want to differentiate between exception and syscall
>> entry/exit.
>>
>
> Why do we want to distinguish between exception and syscall
> entry/exit?  For the enter part, AFAICS the exception case boils down
> to enter_from_user_mode() and the syscall case is:
>
> enter_from_user_mode(regs);
> instrumentation_begin();
>
> local_irq_enable();
> ti_work = READ_ONCE(current_thread_info()->flags);
> if (ti_work & SYSCALL_ENTER_WORK)
> syscall = syscall_trace_enter(regs, syscall, ti_work);
> instrumentation_end();
>
> Which would decompose quite nicely as a regular (non-syscall) entry
> plus the syscall part later.

There is a difference between syscall entry and exception entry at least
in my view:

syscall:
enter_from_user_mode(regs);
local_irq_enable();

exception:
enter_from_user_mode(regs);

>> we'd have:
>>
>>   arch_c_entry()
>>  irqentry_enter();
>>  local_irq_enble();
>>  nr = syscall_enter_from_user_mode_work();
>>  ...
>>
>> which enforces two calls for sane entries and more code in arch/
>
> This is why I still like my:
>
> arch_c_entry()
>   irqentry_enter_from_user_mode();
>   generic_syscall();
>   exit...

So what we have now (with my patch applied) is either:

1) arch_c_entry()
nr = syscall_enter_from_user_mode();
arch_handle_syscall(nr);
syscall_exit_to_user_mode();

or for that extra 32bit fast syscall thing:

2) arch_c_entry()
syscall_enter_from_user_mode_prepare();
arch_do_stuff();
nr = syscall_enter_from_user_mode_work();
arch_handle_syscall(nr);
syscall_exit_to_user_mode();

So for sane cases you just use #1.

Ideally we'd not need arch_handle_syscall(nr) at all, but that does not
work with multiple ABIs supported, i.e. the compat muck.

The only way we could make that work is to have:

syscall_enter_exit(regs, mode)
  nr = syscall_enter_from_user_mode();
  arch_handle_syscall(mode, nr);
  syscall_exit_to_user_mode();

and then arch_c_entry() becomes:

syscall_enter_exit(regs, mode);

which means that arch_handle_syscall() would have to evaluate the mode
and chose the appropriate syscall table. Not sure whether that's a win.

Thanks,

tglx




RE: [PATCH 12/14] x86: remove address space overrides using set_fs()

2020-09-04 Thread David Laight
From: Linus Torvalds
> Sent: 04 September 2020 00:26
> 
> On Thu, Sep 3, 2020 at 2:30 PM David Laight  wrote:
> >
> > A non-canonical (is that the right term) address between the highest
> > valid user address and the lowest valid kernel address (7ffe to fffe?)
> > will fault anyway.
> 
> Yes.
> 
> But we actually warn against that fault, because it's been a good way
> to catch places that didn't use the proper "access_ok()" pattern.
> 
> See ex_handler_uaccess() and the
> 
> WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in
> user access. Non-canonical address?");
> 
> warning. It's been good for randomized testing - a missing range check
> on a user address will often hit this.
> 
> Of course, you should never see it in real life (and hopefully not in
> testing either any more). But belt-and-suspenders..

That could still be effective, just pick an address limit that is
appropriate for the one access_ok() is using.

Even if access_ok() uses 1<<63 there are plenty of addresses above it that 
fault.
But the upper limit for 5-level page tables could be used all the time.

One option is to test '(address | length) < (3<<62)' in access_ok().
That is also moderately suitable for masking invalid addresses to 0.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)


Re: [PATCH 12/14] x86: remove address space overrides using set_fs()

2020-09-04 Thread Christoph Hellwig
On Fri, Sep 04, 2020 at 08:38:13AM +0200, Christoph Hellwig wrote:
> > Wait a sec... how is that supposed to build with X86_5LEVEL?  Do you mean
> > 
> > #define LOAD_TASK_SIZE_MINUS_N(n) \
> > ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
> > __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), 
> > X86_FEATURE_LA57
> > 
> > there?
> 
> Don't ask me about the how, but it builds and works with X86_5LEVEL,
> and the style is copied from elsewhere..

Actually, it doesn't any more.  Looks like the change to pass the n
parameter as suggested by Linus broke the previously working version.


Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes

2020-09-04 Thread Anshuman Khandual



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> This patch series includes fixes for debug_vm_pgtable test code so that
> they follow page table updates rules correctly. The first two patches 
> introduce
> changes w.r.t ppc64. The patches are included in this series for 
> completeness. We can
> merge them via ppc64 tree if required.
> 
> Hugetlb test is disabled on ppc64 because that needs larger change to satisfy
> page table update rules.
> 
> These tests are broken w.r.t page table update rules and results in kernel
> crash as below. 
> 
> [   21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304!
> cpu 0x0: Vector: 700 (Program Check) at [c00c6d1e76c0]
> pc: c009a5ec: assert_pte_locked+0x14c/0x380
> lr: c05c: pte_update+0x11c/0x190
> sp: c00c6d1e7950
>msr: 82029033
>   current = 0xc00c6d172c80
>   paca= 0xc3ba   irqmask: 0x03   irq_happened: 0x01
> pid   = 1, comm = swapper/0
> kernel BUG at arch/powerpc/mm/pgtable.c:304!
> [link register   ] c05c pte_update+0x11c/0x190
> [c00c6d1e7950] 0001 (unreliable)
> [c00c6d1e79b0] c05eee14 pte_update+0x44/0x190
> [c00c6d1e7a10] c1a2ca9c pte_advanced_tests+0x160/0x3d8
> [c00c6d1e7ab0] c1a2d4fc debug_vm_pgtable+0x7e8/0x1338
> [c00c6d1e7ba0] c00116ec do_one_initcall+0xac/0x5f0
> [c00c6d1e7c80] c19e4fac kernel_init_freeable+0x4dc/0x5a4
> [c00c6d1e7db0] c0012474 kernel_init+0x24/0x160
> [c00c6d1e7e20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c
> 
> With DEBUG_VM disabled
> 
> [   20.530152] BUG: Kernel NULL pointer dereference on read at 0x
> [   20.530183] Faulting instruction address: 0xc00df330
> cpu 0x33: Vector: 380 (Data SLB Access) at [c00c6d19f700]
> pc: c00df330: memset+0x68/0x104
> lr: c009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0
> sp: c00c6d19f990
>msr: 82009033
>dar: 0
>   current = 0xc00c6d177480
>   paca= 0xc0001ec4f400   irqmask: 0x03   irq_happened: 0x01
> pid   = 1, comm = swapper/0
> [link register   ] c009f6d8 hash__pmdp_huge_get_and_clear+0xe8/0x1b0
> [c00c6d19f990] c009f748 hash__pmdp_huge_get_and_clear+0x158/0x1b0 
> (unreliable)
> [c00c6d19fa10] c19ebf30 pmd_advanced_tests+0x1f0/0x378
> [c00c6d19fab0] c19ed088 debug_vm_pgtable+0x79c/0x1244
> [c00c6d19fba0] c00116ec do_one_initcall+0xac/0x5f0
> [c00c6d19fc80] c19a4fac kernel_init_freeable+0x4dc/0x5a4
> [c00c6d19fdb0] c0012474 kernel_init+0x24/0x160
> [c00c6d19fe20] c000cbd0 ret_from_kernel_thread+0x5c/0x6c
> 
> Changes from v3:
> * Address review feedback
> * Move page table deposit and withdraw patch after adding pmdlock to avoid 
> bisect failure.

This version

- Builds on x86, arm64, s390, arc, powerpc and riscv (defconfig with 
DEBUG_VM_PGTABLE)
- Runs on arm64 and x86 without any regression, at least nothing that I have 
noticed
- Will be great if this could get tested on s390, arc, riscv, ppc32 platforms 
as well

+ linux-riscv 
+ linux-snps-...@lists.infradead.org 
+ linux-s...@vger.kernel.org
+ Gerald Schaefer 
+ Vineet Gupta 

There is still an open git bisect issue on arm64 platform which ideally should 
be fixed.

- Anshuman


Re: [PATCH 12/14] x86: remove address space overrides using set_fs()

2020-09-04 Thread Christoph Hellwig
On Fri, Sep 04, 2020 at 03:55:10AM +0100, Al Viro wrote:
> On Thu, Sep 03, 2020 at 04:22:40PM +0200, Christoph Hellwig wrote:
> 
> > diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
> > index c8a85b512796e1..94f7be4971ed04 100644
> > --- a/arch/x86/lib/getuser.S
> > +++ b/arch/x86/lib/getuser.S
> > @@ -35,10 +35,19 @@
> >  #include 
> >  #include 
> >  
> > +#ifdef CONFIG_X86_5LEVEL
> > +#define LOAD_TASK_SIZE_MINUS_N(n) \
> > +   ALTERNATIVE "mov $((1 << 47) - 4096 - (n)),%rdx", \
> > +   "mov $((1 << 56) - 4096 - (n)),%rdx", X86_FEATURE_LA57
> > +#else
> > +#define LOAD_TASK_SIZE_MINUS_N(n) \
> > +   mov $(TASK_SIZE_MAX - (n)),%_ASM_DX
> > +#endif
> 
> Wait a sec... how is that supposed to build with X86_5LEVEL?  Do you mean
> 
> #define LOAD_TASK_SIZE_MINUS_N(n) \
>   ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
>   __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), 
> X86_FEATURE_LA57
> 
> there?

Don't ask me about the how, but it builds and works with X86_5LEVEL,
and the style is copied from elsewhere..


Re: [PATCH v4 12/13] mm/debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64

2020-09-04 Thread Anshuman Khandual



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> The test seems to be missing quite a lot of details w.r.t allocating
> the correct pgtable_t page (huge_pte_alloc()), holding the right
> lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA.
> 
> ppc64 do have runtime checks within CONFIG_DEBUG_VM for most of these.
> Hence disable the test on ppc64.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index b53903fdee85..9afa1354326b 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -811,6 +811,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, 
> pgprot_t prot)
>  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
>  }
>  
> +#ifndef CONFIG_PPC_BOOK3S_64
>  static void __init hugetlb_advanced_tests(struct mm_struct *mm,
> struct vm_area_struct *vma,
> pte_t *ptep, unsigned long pfn,
> @@ -853,6 +854,7 @@ static void __init hugetlb_advanced_tests(struct 
> mm_struct *mm,
>   pte = huge_ptep_get(ptep);
>   WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
>  }
> +#endif
>  #else  /* !CONFIG_HUGETLB_PAGE */
>  static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
>  static void __init hugetlb_advanced_tests(struct mm_struct *mm,
> @@ -1065,7 +1067,9 @@ static int __init debug_vm_pgtable(void)
>   pud_populate_tests(mm, pudp, saved_pmdp);
>   spin_unlock(ptl);
>  
> +#ifndef CONFIG_PPC_BOOK3S_64
>   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> +#endif
>  
>   spin_lock(>page_table_lock);
>   p4d_clear_tests(mm, p4dp);
> 

Is it still required now that DEBUG_VM_PGTABLE has been dropped from powerpc,
or would you like to re-enable it?

https://lore.kernel.org/linuxppc-dev/159913592797.5893.5829441560236719450.b4...@ellerman.id.au/T/#m6d890e2fe84cf180cb875fae5f791e9c83db8d30


Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()

2020-09-04 Thread Leonardo Bras
On Thu, 2020-09-03 at 14:41 +1000, Alexey Kardashevskiy wrote:
> I am new to this, so I am trying to understand how a memory page mapped
> > as DMA, and used for something else could be a problem.
> 
>  From the device prospective, there is PCI space and everything from 0 
> till 1<<64 is accessible and what is that mapped to - the device does 
> not know. PHB's IOMMU is the thing to notice invalid access and raise 
> EEH but PHB only knows about PCI->physical memory mapping (with IOMMU 
> pages) but nothing about the host kernel pages. Does this help? Thanks,

According to our conversation on Slack:
1- There is a problem if a hypervisor gives to its VMs contiguous
memory blocks that are not aligned to IOMMU pages, because then an 
iommu_map_page() could map some memory in this VM and some memory in
another VM / process.
2- To guarantee this, we should have system pagesize >= iommu_pagesize 

One way to get (2) is by doing this in enable_ddw():
if ((query.page_size & 4) && PAGE_SHIFT >= 24) {
page_shift = 24; /* 16MB */
} else if ((query.page_size & 2) &&  PAGE_SHIFT >= 16 ) {
page_shift = 16; /* 64kB */
} else if (query.page_size & 1 &&  PAGE_SHIFT >= 12) {
page_shift = 12; /* 4kB */
[...]

Another way of solving this, would be adding in LoPAR documentation
that the blocksize of contiguous memory the hypervisor gives a VM
should always be aligned to IOMMU pagesize offered.

I think the best approach would be first sending the above patch, which
is faster, and then get working into adding that to documentation, so
hypervisors guarantee this.

If this gets into the docs, we can revert the patch.

What do you think?

Best regards!



Re: [PATCH v4 11/13] mm/debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries

2020-09-04 Thread Anshuman Khandual



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> pmd_clear() should not be used to clear pmd level pte entries.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 26023d990bd0..b53903fdee85 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -196,6 +196,8 @@ static void __init pmd_advanced_tests(struct mm_struct 
> *mm,
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(pmd_young(pmd));
>  
> + /*  Clear the pte entries  */
> + pmdp_huge_get_and_clear(mm, vaddr, pmdp);
>   pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
>  }
>  
> @@ -319,6 +321,8 @@ static void __init pud_advanced_tests(struct mm_struct 
> *mm,
>   pudp_test_and_clear_young(vma, vaddr, pudp);
>   pud = READ_ONCE(*pudp);
>   WARN_ON(pud_young(pud));
> +
> + pudp_huge_get_and_clear(mm, vaddr, pudp);
>  }
>  
>  static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
> @@ -442,8 +446,6 @@ static void __init pud_populate_tests(struct mm_struct 
> *mm, pud_t *pudp,
>* This entry points to next level page table page.
>* Hence this must not qualify as pud_bad().
>*/
> - pmd_clear(pmdp);
> - pud_clear(pudp);
>   pud_populate(mm, pudp, pmdp);
>   pud = READ_ONCE(*pudp);
>   WARN_ON(pud_bad(pud));
> @@ -575,7 +577,6 @@ static void __init pmd_populate_tests(struct mm_struct 
> *mm, pmd_t *pmdp,
>* This entry points to next level page table page.
>* Hence this must not qualify as pmd_bad().
>*/
> - pmd_clear(pmdp);
>   pmd_populate(mm, pmdp, pgtable);
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(pmd_bad(pmd));
> 

Why pxxp_huge_get_and_clear() cannot be called inside pxx_populate_tests()
functions itself ? Nonetheless, this does not seem to cause any problem.


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-04 Thread Ingo Molnar


* Christoph Hellwig  wrote:

> Hi all,
> 
> this series removes the last set_fs() used to force a kernel address
> space for the uaccess code in the kernel read/write/splice code, and then
> stops implementing the address space overrides entirely for x86 and
> powerpc.

Cool! For the x86 bits:

  Acked-by: Ingo Molnar 

Thanks,

Ingo