RE: [PATCH v2 0/3] hv_netvsc: Prevent packet loss during VF add/remove

2021-01-12 Thread Long Li
> Subject: Re: [PATCH v2 0/3] hv_netvsc: Prevent packet loss during VF
> add/remove
> 
> On Fri,  8 Jan 2021 16:53:40 -0800 Long Li wrote:
> > From: Long Li 
> >
> > This patch set fixes issues with packet loss on VF add/remove.
> 
> These patches are for net-next? They just optimize the amount of packet
> loss on switch, not fix bugs, right?

Yes, those patches are for net-next.

They eliminate the packet loss introduced on the Linux side during VF changes. 
They can be seen as optimizations.


[PATCH v2 2/3] hv_netvsc: Wait for completion on request SWITCH_DATA_PATH

2021-01-08 Thread Long Li
From: Long Li 

The completion indicates that NVSP_MSG4_TYPE_SWITCH_DATA_PATH has been
processed by the VSP. Traffic is steered to the VF or to the synthetic path
only after we receive this completion.

Signed-off-by: Long Li 
Reported-by: kernel test robot 
---
Change from v1:
Fixed warnings from kernel test robot.

 drivers/net/hyperv/netvsc.c | 37 ++---
 drivers/net/hyperv/netvsc_drv.c |  1 -
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 2350342b961f..3a3db2f0134d 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -37,6 +37,10 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf)
struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev);
struct nvsp_message *init_pkt = &nv_dev->channel_init_pkt;
 
+   /* Block sending traffic to VF if it's about to be gone */
+   if (!vf)
+   net_device_ctx->data_path_is_vf = vf;
+
memset(init_pkt, 0, sizeof(struct nvsp_message));
init_pkt->hdr.msg_type = NVSP_MSG4_TYPE_SWITCH_DATA_PATH;
if (vf)
@@ -50,8 +54,11 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf)
 
vmbus_sendpacket(dev->channel, init_pkt,
   sizeof(struct nvsp_message),
-  VMBUS_RQST_ID_NO_RESPONSE,
-  VM_PKT_DATA_INBAND, 0);
+  (unsigned long)init_pkt,
+  VM_PKT_DATA_INBAND,
+  VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+   wait_for_completion(&nv_dev->channel_init_wait);
+   net_device_ctx->data_path_is_vf = vf;
 }
 
 /* Worker to setup sub channels on initial setup
@@ -754,8 +761,31 @@ static void netvsc_send_completion(struct net_device *ndev,
   const struct vmpacket_descriptor *desc,
   int budget)
 {
-   const struct nvsp_message *nvsp_packet = hv_pkt_data(desc);
+   const struct nvsp_message *nvsp_packet;
u32 msglen = hv_pkt_datalen(desc);
+   struct nvsp_message *pkt_rqst;
+   u64 cmd_rqst;
+
+   /* First check if this is a VMBUS completion without data payload */
+   if (!msglen) {
+   cmd_rqst = vmbus_request_addr(&incoming_channel->requestor,
+ (u64)desc->trans_id);
+   if (cmd_rqst == VMBUS_RQST_ERROR) {
+   netdev_err(ndev, "Invalid transaction id\n");
+   return;
+   }
+
+   pkt_rqst = (struct nvsp_message *)(uintptr_t)cmd_rqst;
+   switch (pkt_rqst->hdr.msg_type) {
+   case NVSP_MSG4_TYPE_SWITCH_DATA_PATH:
+   complete(&net_device->channel_init_wait);
+   break;
+
+   default:
+   netdev_err(ndev, "Unexpected VMBUS completion!!\n");
+   }
+   return;
+   }
 
/* Ensure packet is big enough to read header fields */
if (msglen < sizeof(struct nvsp_message_header)) {
@@ -763,6 +793,7 @@ static void netvsc_send_completion(struct net_device *ndev,
return;
}
 
+   nvsp_packet = hv_pkt_data(desc);
switch (nvsp_packet->hdr.msg_type) {
case NVSP_MSG_TYPE_INIT_COMPLETE:
if (msglen < sizeof(struct nvsp_message_header) +
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 5dd4f37afa3d..64ae5f4e974e 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2400,7 +2400,6 @@ static int netvsc_vf_changed(struct net_device *vf_netdev)
 
if (net_device_ctx->data_path_is_vf == vf_is_up)
return NOTIFY_OK;
-   net_device_ctx->data_path_is_vf = vf_is_up;
 
netvsc_switch_datapath(ndev, vf_is_up);
netdev_info(ndev, "Data path switched %s VF: %s\n",
-- 
2.27.0
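
A minimal userspace model (hypothetical, pthread-based; not the kernel API or the
driver code) of the handshake this patch introduces: the sender posts the
SWITCH_DATA_PATH request, parks on a completion object, and the completion
handler wakes it once the host has acknowledged. All names below are illustrative.

/* Userspace sketch of the wait-for-completion handshake; not kernel code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static struct completion switch_done = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
};

/* Stand-in for the VSP: acknowledges the SWITCH_DATA_PATH request. */
static void *host_side(void *arg)
{
	(void)arg;
	sleep(1);               /* pretend the host is processing */
	complete(&switch_done); /* what the completion handler does */
	return NULL;
}

int main(void)
{
	pthread_t host;

	pthread_create(&host, NULL, host_side, NULL);
	printf("request sent, waiting for completion...\n");
	wait_for_completion(&switch_done); /* the sender blocks here */
	printf("data path switch acknowledged; safe to update data_path_is_vf\n");
	pthread_join(host, NULL);
	return 0;
}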



[PATCH v2 3/3] hv_netvsc: Process NETDEV_GOING_DOWN on VF hot remove

2021-01-08 Thread Long Li
From: Long Li 

On VF hot remove, NETDEV_GOING_DOWN is sent to notify that the VF is about to
go down. At this time, the VF is still sending/receiving traffic, so we
request the VSP to switch the datapath.

On completion, the datapath is switched to synthetic and we can proceed
with VF hot remove.

Signed-off-by: Long Li 
Reviewed-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 64ae5f4e974e..75b4d6703cf1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2382,12 +2382,15 @@ static int netvsc_register_vf(struct net_device 
*vf_netdev)
  * During hibernation, if a VF NIC driver (e.g. mlx5) preserves the network
  * interface, there is only the CHANGE event and no UP or DOWN event.
  */
-static int netvsc_vf_changed(struct net_device *vf_netdev)
+static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event)
 {
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
struct net_device *ndev;
-   bool vf_is_up = netif_running(vf_netdev);
+   bool vf_is_up = false;
+
+   if (event != NETDEV_GOING_DOWN)
+   vf_is_up = netif_running(vf_netdev);
 
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
@@ -2716,7 +2719,8 @@ static int netvsc_netdev_event(struct notifier_block 
*this,
case NETDEV_UP:
case NETDEV_DOWN:
case NETDEV_CHANGE:
-   return netvsc_vf_changed(event_dev);
+   case NETDEV_GOING_DOWN:
+   return netvsc_vf_changed(event_dev, event);
default:
return NOTIFY_DONE;
}
-- 
2.27.0



[PATCH v2 1/3] hv_netvsc: Check VF datapath when sending traffic to VF

2021-01-08 Thread Long Li
From: Long Li 

The driver needs to check if the datapath has been switched to VF before
sending traffic to VF.

Signed-off-by: Long Li 
Reviewed-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f32f28311d57..5dd4f37afa3d 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -539,7 +539,8 @@ static int netvsc_xmit(struct sk_buff *skb, struct 
net_device *net, bool xdp_tx)
 */
vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
if (vf_netdev && netif_running(vf_netdev) &&
-   netif_carrier_ok(vf_netdev) && !netpoll_tx_running(net))
+   netif_carrier_ok(vf_netdev) && !netpoll_tx_running(net) &&
+   net_device_ctx->data_path_is_vf)
return netvsc_vf_xmit(net, vf_netdev, skb);
 
/* We will atmost need two pages to describe the rndis
-- 
2.27.0



[PATCH v2 0/3] hv_netvsc: Prevent packet loss during VF add/remove

2021-01-08 Thread Long Li
From: Long Li 

This patch set fixes issues with packet loss on VF add/remove.

Long Li (3):
  hv_netvsc: Check VF datapath when sending traffic to VF
  hv_netvsc: Wait for completion on request SWITCH_DATA_PATH
  hv_netvsc: Process NETDEV_GOING_DOWN on VF hot remove

 drivers/net/hyperv/netvsc.c | 37 ++---
 drivers/net/hyperv/netvsc_drv.c | 14 -
 2 files changed, 43 insertions(+), 8 deletions(-)

-- 
2.27.0



RE: [PATCH 2/3] hv_netvsc: Wait for completion on request NVSP_MSG4_TYPE_SWITCH_DATA_PATH

2021-01-07 Thread Long Li
> Subject: RE: [PATCH 2/3] hv_netvsc: Wait for completion on request
> NVSP_MSG4_TYPE_SWITCH_DATA_PATH
> 
> > Subject: Re: [PATCH 2/3] hv_netvsc: Wait for completion on request
> > NVSP_MSG4_TYPE_SWITCH_DATA_PATH
> >
> > Hi Long,
> >
> > Thank you for the patch! Perhaps something to improve:
> >
> > [auto build test WARNING on linus/master] [also build test WARNING on
> > v5.11-rc2 next-20210104] [If your patch is applied to the wrong git
> > tree, kindly drop us a note.
> > And when submitting patch, we suggest to use '--base' as documented in
> > https://git-scm.com/docs/git-format-patch]
> >
> > url: https://github.com/0day-ci/linux/commits/Long-Li/hv_netvsc-Check-VF-datapath-when-sending-traffic-to-VF/20210106-092237
> > base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git e71ba9452f0b5b2e8dc8aa5445198cd9214a6a62
> > config: i386-allyesconfig (attached as .config)
> > compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
> > reproduce (this is a W=1 build):
> > # https://github.com/0day-ci/linux/commit/8c92b5574da1b0c2aee3eab7da2c4dad8d92572c
> > git remote add linux-review https://github.com/0day-ci/linux
> > git fetch --no-tags linux-review Long-Li/hv_netvsc-Check-VF-datapath-when-sending-traffic-to-VF/20210106-092237
> > git checkout 8c92b5574da1b0c2aee3eab7da2c4dad8d92572c
> > # save the attached .config to linux build tree
> > make W=1 ARCH=i386
> >
> > If you fix the issue, kindly add following tag as appropriate
> > Reported-by: kernel test robot 
> >
> > All warnings (new ones prefixed by >>):
> >
> >drivers/net/hyperv/netvsc.c: In function 'netvsc_send_completion':
> > >> drivers/net/hyperv/netvsc.c:778:14: warning: cast to pointer from
> > >> integer of different size [-Wint-to-pointer-cast]
> >  778 |   pkt_rqst = (struct nvsp_message *)cmd_rqst;
> >  |  ^
> 
> I think this warning can be safely ignored.
> 
> When sending packets over vmbus, the address is passed as u64 and stored
> internally as u64 in vmbus_next_request_id(). Passing a 32 bit address will
> not lose any data. Later the address is retrieved from vmbus_request_addr()
> as a u64. Again, it will not lose data when casting to a 32 bit address.
> 
> This method of storing and retriev

RE: [PATCH 2/3] hv_netvsc: Wait for completion on request NVSP_MSG4_TYPE_SWITCH_DATA_PATH

2021-01-06 Thread Long Li
> Subject: Re: [PATCH 2/3] hv_netvsc: Wait for completion on request
> NVSP_MSG4_TYPE_SWITCH_DATA_PATH
> 
> Hi Long,
> 
> Thank you for the patch! Perhaps something to improve:
> 
> [auto build test WARNING on linus/master] [also build test WARNING on
> v5.11-rc2 next-20210104] [If your patch is applied to the wrong git tree, 
> kindly
> drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
> 
> url: https://github.com/0day-ci/linux/commits/Long-Li/hv_netvsc-Check-VF-datapath-when-sending-traffic-to-VF/20210106-092237
> base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git e71ba9452f0b5b2e8dc8aa5445198cd9214a6a62
> config: i386-allyesconfig (attached as .config)
> compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
> reproduce (this is a W=1 build):
> # https://github.com/0day-ci/linux/commit/8c92b5574da1b0c2aee3eab7da2c4dad8d92572c
> git remote add linux-review https://github.com/0day-ci/linux
> git fetch --no-tags linux-review Long-Li/hv_netvsc-Check-VF-datapath-when-sending-traffic-to-VF/20210106-092237
> git checkout 8c92b5574da1b0c2aee3eab7da2c4dad8d92572c
> # save the attached .config to linux build tree
> make W=1 ARCH=i386
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot 
> 
> All warnings (new ones prefixed by >>):
> 
>drivers/net/hyperv/netvsc.c: In function 'netvsc_send_completion':
> >> drivers/net/hyperv/netvsc.c:778:14: warning: cast to pointer from
> >> integer of different size [-Wint-to-pointer-cast]
>  778 |   pkt_rqst = (struct nvsp_message *)cmd_rqst;
>  |  ^

I think this warning can be safely ignored.

When sending packets over vmbus, the address is passed as a u64 and stored 
internally as a u64 in vmbus_next_request_id(). Passing a 32-bit address will not 
lose any data. Later the address is retrieved from vmbus_request_addr() as a 
u64. Again, no data is lost when casting it back to a 32-bit pointer.

This method of storing and retrieving addresses is used throughout the other 
Hyper-V drivers. If we want to avoid triggering this warning, I suggest making a 
patch to convert all those usages in all Hyper-V drivers.

Thanks,
Long
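
For reference, a standalone userspace illustration (not driver code) of the
pointer-to-u64 round trip described above; the explicit (uintptr_t) cast added in
v2 is what keeps -Wint-to-pointer-cast quiet on 32-bit builds. The struct name is
a placeholder.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct nvsp_message_stub { int msg_type; }; /* placeholder struct */

int main(void)
{
	struct nvsp_message_stub msg = { 4 };

	/* Sender: the pointer is stored as a 64-bit request id. */
	uint64_t rqst_id = (uint64_t)(uintptr_t)&msg;

	/* Completion path: the request id is converted back to a pointer. */
	struct nvsp_message_stub *back =
		(struct nvsp_message_stub *)(uintptr_t)rqst_id;

	assert(back == &msg);
	printf("msg_type after round trip: %d\n", back->msg_type);
	return 0;
}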

> 
> 
> vim +778 drivers/net/hyperv/netvsc.c
> 
>757
>758static void netvsc_send_completion(struct net_device *ndev,
>759   struct netvsc_device 
> *net_device,
>760   struct vmbus_channel
> *incoming_channel,
>761   const struct 
> vmpacket_descriptor
> *desc,
>762

[PATCH 2/3] hv_netvsc: Wait for completion on request NVSP_MSG4_TYPE_SWITCH_DATA_PATH

2021-01-05 Thread Long Li
From: Long Li 

The completion indicates that NVSP_MSG4_TYPE_SWITCH_DATA_PATH has been
processed by the VSP. Traffic is steered to the VF or to the synthetic path
only after we receive this completion.

Signed-off-by: Long Li 
---
 drivers/net/hyperv/netvsc.c | 34 +++--
 drivers/net/hyperv/netvsc_drv.c |  1 -
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 2350342b961f..237e998d21d1 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -37,6 +37,10 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf)
struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev);
struct nvsp_message *init_pkt = &nv_dev->channel_init_pkt;
 
+   /* Block sending traffic to VF if it's about to be gone */
+   if (!vf)
+   net_device_ctx->data_path_is_vf = vf;
+
memset(init_pkt, 0, sizeof(struct nvsp_message));
init_pkt->hdr.msg_type = NVSP_MSG4_TYPE_SWITCH_DATA_PATH;
if (vf)
@@ -50,8 +54,11 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf)
 
vmbus_sendpacket(dev->channel, init_pkt,
   sizeof(struct nvsp_message),
-  VMBUS_RQST_ID_NO_RESPONSE,
-  VM_PKT_DATA_INBAND, 0);
+  (unsigned long)init_pkt,
+  VM_PKT_DATA_INBAND,
+  VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+   wait_for_completion(&nv_dev->channel_init_wait);
+   net_device_ctx->data_path_is_vf = vf;
 }
 
 /* Worker to setup sub channels on initial setup
@@ -756,6 +763,29 @@ static void netvsc_send_completion(struct net_device *ndev,
 {
const struct nvsp_message *nvsp_packet = hv_pkt_data(desc);
u32 msglen = hv_pkt_datalen(desc);
+   struct nvsp_message *pkt_rqst;
+   u64 cmd_rqst;
+
+   /* First check if this is a VMBUS completion without data payload */
+   if (!msglen) {
+   cmd_rqst = vmbus_request_addr(&incoming_channel->requestor,
+ (u64)desc->trans_id);
+   if (cmd_rqst == VMBUS_RQST_ERROR) {
+   netdev_err(ndev, "Invalid transaction id\n");
+   return;
+   }
+
+   pkt_rqst = (struct nvsp_message *)cmd_rqst;
+   switch (pkt_rqst->hdr.msg_type) {
+   case NVSP_MSG4_TYPE_SWITCH_DATA_PATH:
+   complete(&net_device->channel_init_wait);
+   break;
+
+   default:
+   netdev_err(ndev, "Unexpected VMBUS completion!!\n");
+   }
+   return;
+   }
 
/* Ensure packet is big enough to read header fields */
if (msglen < sizeof(struct nvsp_message_header)) {
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 5dd4f37afa3d..64ae5f4e974e 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2400,7 +2400,6 @@ static int netvsc_vf_changed(struct net_device *vf_netdev)
 
if (net_device_ctx->data_path_is_vf == vf_is_up)
return NOTIFY_OK;
-   net_device_ctx->data_path_is_vf = vf_is_up;
 
netvsc_switch_datapath(ndev, vf_is_up);
netdev_info(ndev, "Data path switched %s VF: %s\n",
-- 
2.27.0



[PATCH 3/3] hv_netvsc: Process NETDEV_GOING_DOWN on VF hot remove

2021-01-05 Thread Long Li
From: Long Li 

On VF hot remove, NETDEV_GOING_DOWN is sent to notify that the VF is about to
go down. At this time, the VF is still sending/receiving traffic, so we
request the VSP to switch the datapath.

On completion, the datapath is switched to synthetic and we can proceed
with VF hot remove.

Signed-off-by: Long Li 
---
 drivers/net/hyperv/netvsc_drv.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 64ae5f4e974e..75b4d6703cf1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2382,12 +2382,15 @@ static int netvsc_register_vf(struct net_device 
*vf_netdev)
  * During hibernation, if a VF NIC driver (e.g. mlx5) preserves the network
  * interface, there is only the CHANGE event and no UP or DOWN event.
  */
-static int netvsc_vf_changed(struct net_device *vf_netdev)
+static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event)
 {
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
struct net_device *ndev;
-   bool vf_is_up = netif_running(vf_netdev);
+   bool vf_is_up = false;
+
+   if (event != NETDEV_GOING_DOWN)
+   vf_is_up = netif_running(vf_netdev);
 
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
@@ -2716,7 +2719,8 @@ static int netvsc_netdev_event(struct notifier_block 
*this,
case NETDEV_UP:
case NETDEV_DOWN:
case NETDEV_CHANGE:
-   return netvsc_vf_changed(event_dev);
+   case NETDEV_GOING_DOWN:
+   return netvsc_vf_changed(event_dev, event);
default:
return NOTIFY_DONE;
}
-- 
2.27.0



[PATCH 1/3] hv_netvsc: Check VF datapath when sending traffic to VF

2021-01-05 Thread Long Li
From: Long Li 

The driver needs to check if the datapath has been switched to VF before
sending traffic to VF.

Signed-off-by: Long Li 
---
 drivers/net/hyperv/netvsc_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f32f28311d57..5dd4f37afa3d 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -539,7 +539,8 @@ static int netvsc_xmit(struct sk_buff *skb, struct 
net_device *net, bool xdp_tx)
 */
vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
if (vf_netdev && netif_running(vf_netdev) &&
-   netif_carrier_ok(vf_netdev) && !netpoll_tx_running(net))
+   netif_carrier_ok(vf_netdev) && !netpoll_tx_running(net) &&
+   net_device_ctx->data_path_is_vf)
return netvsc_vf_xmit(net, vf_netdev, skb);
 
/* We will atmost need two pages to describe the rndis
-- 
2.27.0



[PATCH v1] mm/migrate: fix comment spelling

2020-10-24 Thread Long Li
The word in the comment is misspelled, it should be "include".

Signed-off-by: Long Li 
---
 mm/migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 5ca5842df5db..d79640ab8aa1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1694,7 +1694,7 @@ static int move_pages_and_store_status(struct mm_struct 
*mm, int node,
 * Positive err means the number of failed
 * pages to migrate.  Since we are going to
 * abort and return the number of non-migrated
-* pages, so need to incude the rest of the
+* pages, so need to include the rest of the
 * nr_pages that have not been attempted as
 * well.
 */
-- 
2.17.1



[PATCH v4] mm, slab: Check GFP_SLAB_BUG_MASK before alloc_pages in kmalloc_order

2020-07-02 Thread Long Li
kmalloc cannot allocate memory from HIGHMEM.  Allocating large amounts
of memory currently bypasses the check and will simply leak the memory
when page_address() returns NULL.  To fix this, factor the
GFP_SLAB_BUG_MASK check out of slab & slub, and call it from
kmalloc_order() as well. In order to make the code clear, the warning
message is put in one place.

Signed-off-by: Long Li 
---
changes in V4:
-Change the check function name to kmalloc_check_flags()
-Put the flags check into the kmalloc_check_flags() 

changes in V3:
-Put the warning message in one place
-update the change log to be clear

 mm/slab.c|  8 +---
 mm/slab.h|  1 +
 mm/slab_common.c | 18 +-
 mm/slub.c|  8 +---
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index ac7a223d9ac3..755f33f96f04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2573,13 +2573,7 @@ static struct page *cache_grow_begin(struct kmem_cache 
*cachep,
 * Be lazy and only check for valid flags here,  keeping it out of the
 * critical path in kmem_cache_alloc().
 */
-   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
-   flags &= ~GFP_SLAB_BUG_MASK;
-   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x 
(%pGg). Fix your code!\n",
-   invalid_mask, _mask, flags, );
-   dump_stack();
-   }
+   flags = kmalloc_check_flags(flags);
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
diff --git a/mm/slab.h b/mm/slab.h
index a06f3313e4a0..48df5660764c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -90,6 +90,7 @@ void create_kmalloc_caches(slab_flags_t);
 struct kmem_cache *kmalloc_slab(size_t, gfp_t);
 #endif
 
+gfp_t kmalloc_check_flags(gfp_t flags);
 
 /* Functions provided by the slab allocators */
 int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a143a8c8f874..9184e4575d6d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,6 +26,8 @@
 #define CREATE_TRACE_POINTS
 #include 
 
+#include "internal.h"
+
 #include "slab.h"
 
 enum slab_state slab_state;
@@ -805,6 +807,20 @@ void __init create_kmalloc_caches(slab_flags_t flags)
 }
 #endif /* !CONFIG_SLOB */
 
+gfp_t kmalloc_check_flags(gfp_t flags)
+{
+   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+   flags &= ~GFP_SLAB_BUG_MASK;
+   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x 
(%pGg). Fix your code!\n",
+   invalid_mask, _mask, flags, );
+   dump_stack();
+   }
+
+   return flags;
+}
+
 /*
  * To avoid unnecessary overhead, we pass through large allocation requests
  * directly to the page allocator. We use __GFP_COMP, because we will need to
@@ -815,7 +831,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int 
order)
void *ret = NULL;
struct page *page;
 
-   flags |= __GFP_COMP;
+   flags = kmalloc_check_flags(flags) | __GFP_COMP;
page = alloc_pages(flags, order);
if (likely(page)) {
ret = page_address(page);
diff --git a/mm/slub.c b/mm/slub.c
index 62d2de56549e..8e787767850f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1817,13 +1817,7 @@ static struct page *allocate_slab(struct kmem_cache *s, 
gfp_t flags, int node)
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
-   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
-   flags &= ~GFP_SLAB_BUG_MASK;
-   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x 
(%pGg). Fix your code!\n",
-   invalid_mask, _mask, flags, );
-   dump_stack();
-   }
+   flags = kmalloc_check_flags(flags);
 
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
-- 
2.17.1



[PATCH v3] mm, slab: Check GFP_SLAB_BUG_MASK before alloc_pages in kmalloc_order

2020-07-01 Thread Long Li
kmalloc cannot allocate memory from HIGHMEM.  Allocating large amounts
of memory currently bypasses the check and will simply leak the memory
when page_address() returns NULL.  To fix this, factor the
GFP_SLAB_BUG_MASK check out of slab & slub, and call it from
kmalloc_order() as well. In order to make the code clear, the warning
message is put in one place.

Signed-off-by: Long Li 
---

changes in V3:
-Put the warning message in one place
-update the change log to be clear

 mm/slab.c| 10 +++---
 mm/slab.h|  1 +
 mm/slab_common.c | 17 +
 mm/slub.c|  9 ++---
 4 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index ac7a223d9ac3..2850fe3c5fb8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2573,13 +2573,9 @@ static struct page *cache_grow_begin(struct kmem_cache 
*cachep,
 * Be lazy and only check for valid flags here,  keeping it out of the
 * critical path in kmem_cache_alloc().
 */
-   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
-   flags &= ~GFP_SLAB_BUG_MASK;
-   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x 
(%pGg). Fix your code!\n",
-   invalid_mask, _mask, flags, );
-   dump_stack();
-   }
+   if (unlikely(flags & GFP_SLAB_BUG_MASK))
+   flags = kmalloc_invalid_flags(flags);
+
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
diff --git a/mm/slab.h b/mm/slab.h
index a06f3313e4a0..ab172dca8ce2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -90,6 +90,7 @@ void create_kmalloc_caches(slab_flags_t);
 struct kmem_cache *kmalloc_slab(size_t, gfp_t);
 #endif
 
+gfp_t kmalloc_invalid_flags(gfp_t flags);
 
 /* Functions provided by the slab allocators */
 int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a143a8c8f874..85a16e323906 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,6 +26,8 @@
 #define CREATE_TRACE_POINTS
 #include 
 
+#include "internal.h"
+
 #include "slab.h"
 
 enum slab_state slab_state;
@@ -805,6 +807,18 @@ void __init create_kmalloc_caches(slab_flags_t flags)
 }
 #endif /* !CONFIG_SLOB */
 
+gfp_t kmalloc_invalid_flags(gfp_t flags)
+{
+   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+   flags &= ~GFP_SLAB_BUG_MASK;
+   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix 
your code!\n",
+   invalid_mask, _mask, flags, );
+   dump_stack();
+
+   return flags;
+}
+
 /*
  * To avoid unnecessary overhead, we pass through large allocation requests
  * directly to the page allocator. We use __GFP_COMP, because we will need to
@@ -815,6 +829,9 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int 
order)
void *ret = NULL;
struct page *page;
 
+   if (unlikely(flags & GFP_SLAB_BUG_MASK))
+   flags = kmalloc_invalid_flags(flags);
+
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
if (likely(page)) {
diff --git a/mm/slub.c b/mm/slub.c
index 62d2de56549e..039045211df9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1817,13 +1817,8 @@ static struct page *allocate_slab(struct kmem_cache *s, 
gfp_t flags, int node)
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
-   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
-   flags &= ~GFP_SLAB_BUG_MASK;
-   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x 
(%pGg). Fix your code!\n",
-   invalid_mask, _mask, flags, );
-   dump_stack();
-   }
+   if (unlikely(flags & GFP_SLAB_BUG_MASK))
+   flags = kmalloc_invalid_flags(flags);
 
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
-- 
2.17.1



[PATCH v2] mm, slab: Check GFP_SLAB_BUG_MASK before alloc_pages in kmalloc_order

2020-06-30 Thread Long Li
In an ARM32 environment with highmem enabled, using kmalloc() with the
__GFP_HIGHMEM flag to allocate a large amount of memory goes through the
kmalloc_order() path and returns NULL. The __GFP_HIGHMEM flag causes
alloc_pages() to allocate highmem pages, which cannot be directly converted
to a virtual address, so kmalloc_order() returns NULL even though the pages
have been allocated.

After this change, GFP_SLAB_BUG_MASK is checked before allocating pages,
as is done in new_slab().

Signed-off-by: Long Li 
---

Changes in v2:
- patch is rebased againest "[PATCH] mm: Free unused pages in
kmalloc_order()" [1]
- check GFP_SLAB_BUG_MASK and generate warnings before alloc_pages
in kmalloc_order()

[1] https://lkml.org/lkml/2020/6/27/16

 mm/slab_common.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index a143a8c8f874..3548f4f8374b 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -27,6 +27,7 @@
 #include 
 
 #include "slab.h"
+#include "internal.h"
 
 enum slab_state slab_state;
 LIST_HEAD(slab_caches);
@@ -815,6 +816,15 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int 
order)
void *ret = NULL;
struct page *page;
 
+   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+   flags &= ~GFP_SLAB_BUG_MASK;
+   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x 
(%pGg). Fix your code!\n",
+   invalid_mask, _mask, flags, );
+   dump_stack();
+   }
+
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
if (likely(page)) {
-- 
2.17.1



[PATCH v1] mm:free unused pages in kmalloc_order

2020-06-26 Thread Long Li
In an environment using the slub allocator with 1G of memory on my ARM32
system, kmalloc(1024, GFP_HIGHUSER) allocates memory normally, but
kmalloc(64*1024, GFP_HIGHUSER) causes a memory leak: alloc_pages() returns
highmem physical pages that cannot be directly converted into a virtual
address, so NULL is returned while the pages are never released. Driver
developers will usually not pass GFP_HIGHUSER to kmalloc(), but this memory
leak is still worth fixing. This is the first time I have posted a patch,
so there may be something wrong.

Signed-off-by: Long Li 
---
 mm/slab_common.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index a143a8c8f874..d2c53b980ab3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -819,8 +819,12 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int 
order)
page = alloc_pages(flags, order);
if (likely(page)) {
ret = page_address(page);
-   mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
-   PAGE_SIZE << order);
+   if (ret)
+   mod_node_page_state(page_pgdat(page),
+   NR_SLAB_UNRECLAIMABLE_B,
+   PAGE_SIZE << order);
+   else
+   __free_pages(page, order);
}
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
-- 
2.17.1



RE: [Patch v4] storvsc: setup 1:1 mapping between hardware queue and CPU queue

2019-09-23 Thread Long Li
>Subject: RE: [Patch v4] storvsc: setup 1:1 mapping between hardware queue
>and CPU queue
>
>>Subject: Re: [Patch v4] storvsc: setup 1:1 mapping between hardware
>>queue and CPU queue
>>
>>On Fri, Sep 06, 2019 at 10:24:20AM -0700, lon...@linuxonhyperv.com wrote:
>>>From: Long Li 
>>>
>>>storvsc doesn't use a dedicated hardware queue for a given CPU queue.
>>>When issuing I/O, it selects returning CPU (hardware queue)
>>>dynamically based on vmbus channel usage across all channels.
>>>
>>>This patch advertises num_present_cpus() as number of hardware queues.
>>>This will have upper layer setup 1:1 mapping between hardware queue
>>>and CPU queue and avoid unnecessary locking when issuing I/O.
>>>
>>>Signed-off-by: Long Li 

Hi Martin,

I have addressed all comments on this patch. Can you merge it to SCSI?

If there is anything else I need to change, please let me know.

Thanks

Long


>>>---
>>>
>>>Changes:
>>>v2: rely on default upper layer function to map queues. (suggested by
>>>Ming Lei
>>>)
>>>v3: use num_present_cpus() instead of num_online_cpus(). Hyper-v
>>>doesn't support hot-add CPUs. (suggested by Michael Kelley
>>>)
>>>v4: move change logs to after Signed-of-by
>>>
>>> drivers/scsi/storvsc_drv.c | 3 +--
>>> 1 file changed, 1 insertion(+), 2 deletions(-)
>>>
>>>diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
>>>index b89269120a2d..cf987712041a 100644
>>>--- a/drivers/scsi/storvsc_drv.c
>>>+++ b/drivers/scsi/storvsc_drv.c
>>>@@ -1836,8 +1836,7 @@ static int storvsc_probe(struct hv_device *device,
>>> /*
>>>  * Set the number of HW queues we are supporting.
>>>  */
>>>-if (stor_device->num_sc != 0)
>>>-host->nr_hw_queues = stor_device->num_sc + 1;
>>>+host->nr_hw_queues = num_present_cpus();
>>
>>Just looking at the change notes for v3: why isn't this
>>num_active_cpus() then? One can still isolate CPUs on hyper-v, no?
>
>The isolated CPU can be made online at run time. For example, even
>maxcpus=x is put on the boot line, individual CPUs can still be made
>online/offline.
>
>>
>>--
>>Thanks,
>>Sasha


RE: [PATCH 1/4] softirq: implement IRQ flood detection mechanism

2019-09-23 Thread Long Li
>Thanks for the clarification.
>
>The problem with what Ming is proposing in my mind (and its an existing
>problem that exists today), is that nvme is taking precedence over anything
>else until it absolutely cannot hog the cpu in hardirq.
>
>In the thread Ming referenced a case where today if the cpu core has a net
>softirq activity it cannot make forward progress. So with Ming's suggestion,
>net softirq will eventually make progress, but it creates an inherent fairness
>issue. Who said that nvme completions should come faster then the net rx/tx
>or another I/O device (or hrtimers or sched events...)?
>
>As much as I'd like nvme to complete as soon as possible, I might have other
>activities in the system that are as important if not more. So I don't think we
>can solve this with something that is not cooperative or fair with the rest of
>the system.
>
>>> If we are context switching too much, it means the soft-irq operation
>>> is not efficient, not necessarily the fact that the completion path
>>> is running in soft- irq..
>>>
>>> Is your kernel compiled with full preemption or voluntary preemption?
>>
>> The tests are based on Ubuntu 18.04 kernel configuration. Here are the
>parameters:
>>
>> # CONFIG_PREEMPT_NONE is not set
>> CONFIG_PREEMPT_VOLUNTARY=y
>> # CONFIG_PREEMPT is not set
>
>I see, so it still seems that irq_poll_softirq is still not efficient in 
>reaping
>completions. reaping the completions on its own is pretty much the same in
>hard and soft irq, so its really the scheduling part that is creating the 
>overhead
>(which does not exist in hard irq).
>
>Question:
>when you test with without the patch (completions are coming in hard-irq),
>do the fio threads that run on the cpu cores that are assigned to the cores 
>that
>are handling interrupts get substantially lower throughput than the rest of the
>fio threads? I would expect that the fio threads that are running on the first 
>32
>cores to get very low iops (overpowered by the nvme interrupts) and the rest
>doing much more given that nvme has almost no limits to how much time it
>can spend on processing completions.
>
>If need_resched() is causing us to context switch too aggressively, does
>changing that to local_softirq_pending() make things better?
>--
>diff --git a/lib/irq_poll.c b/lib/irq_poll.c index d8eab563fa77..05d524fcaf04
>100644
>--- a/lib/irq_poll.c
>+++ b/lib/irq_poll.c
>@@ -116,7 +116,7 @@ static void __latent_entropy irq_poll_softirq(struct
>softirq_action *h)
> /*
>  * If softirq window is exhausted then punt.
>  */
>-   if (need_resched())
>+   if (local_softirq_pending())
> break;
> }
>--
>
>Although, this can potentially cause other threads from making forward
>progress.. If it is better, perhaps we also need a time limit as well.

Thanks for this patch. The IOPS was about the same (it tends to fluctuate more, 
but stays within a 3% variation).

I captured the following from one of the CPUs. All CPUs tend to have similar 
numbers. The following numbers are captured during 5 seconds and averaged:

Context switches/s:
Without any patch: 5
With the previous patch: 640
With this patch: 522

Process migrated/s:
Without any patch: 0.6
With the previous patch: 104
With this patch: 121

>
>Perhaps we should add statistics/tracing on how many completions we are
>reaping per invocation...

I'll look a bit more into the completions. From the numbers, I think the increased 
number of context switches/migrations is what hurts performance the most.

Thanks

Long
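
As an aside, a minimal sampler of the system-wide context-switch rate can be built
on the "ctxt" line of /proc/stat. This is not how the per-CPU numbers above were
collected (those came from tracing), and it reports a system-wide total rather
than a per-CPU figure; it is only a quick way to reproduce the same kind of
measurement.

#include <stdio.h>
#include <unistd.h>

/* Read the cumulative context-switch count from /proc/stat. */
static unsigned long long read_ctxt(void)
{
	char line[256];
	unsigned long long v = 0;
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 0;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "ctxt %llu", &v) == 1)
			break;
	fclose(f);
	return v;
}

int main(void)
{
	unsigned long long before = read_ctxt();

	sleep(5);	/* sample over 5 seconds, as in the numbers above */
	printf("context switches/s (system-wide): %llu\n",
	       (read_ctxt() - before) / 5);
	return 0;
}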


RE: [PATCH 1/4] softirq: implement IRQ flood detection mechanism

2019-09-20 Thread Long Li
> >> Long, does this patch make any difference?
> >
> > Sagi,
> >
> > Sorry it took a while to bring my system back online.
> >
> > With the patch, the IOPS is about the same drop with the 1st patch. I think
> the excessive context switches are causing the drop in IOPS.
> >
> > The following are captured by "perf sched record" for 30 seconds during
> tests.
> >
> > "perf sched latency"
> > With patch:
> >fio:(82)  | 937632.706 ms |  1782255 | avg:0.209 ms | 
> > max:   63.123
> ms | max at:768.274023 s
> >
> > without patch:
> >fio:(82)  |2348323.432 ms |18848 | avg:0.295 ms | 
> > max:   28.446
> ms | max at:   6447.310255 s
> 
> Without patch means the proposed hard-irq patch?

It means the current upstream code without any patch, but it's prone to soft 
lockups.

Ming's proposed hard-irq patch gets similar results to "without patch", however 
it fixes the soft lockup.

> 
> If we are context switching too much, it means the soft-irq operation is not
> efficient, not necessarily the fact that the completion path is running in 
> soft-
> irq..
> 
> Is your kernel compiled with full preemption or voluntary preemption?

The tests are based on Ubuntu 18.04 kernel configuration. Here are the 
parameters:

# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set

> 
> > Look closer at each CPU, we can see ksoftirqd is competing CPU with
> > fio (and effectively throttle other fio processes) (captured in
> > /sys/kernel/debug/tracing, echo sched:* >set_event)
> >
> > On CPU1 with patch: (note that the prev_state for fio is "R", it's
> preemptively scheduled)
> > <...>-4077  [001] d... 66456.805062: sched_switch: prev_comm=fio
> prev_pid=4077 prev_prio=120 prev_state=R ==> next_comm=ksoftirqd/1
> next_pid=17 next_prio=120
> > <...>-17[001] d... 66456.805859: sched_switch:
> prev_comm=ksoftirqd/1 prev_pid=17 prev_prio=120 prev_state=S ==>
> next_comm=fio next_pid=4077 next_prio=120
> > <...>-4077  [001] d... 66456.844049: sched_switch: prev_comm=fio
> prev_pid=4077 prev_prio=120 prev_state=R ==> next_comm=ksoftirqd/1
> next_pid=17 next_prio=120
> > <...>-17[001] d... 66456.844607: sched_switch:
> prev_comm=ksoftirqd/1 prev_pid=17 prev_prio=120 prev_state=S ==>
> next_comm=fio next_pid=4077 next_prio=120
> >
> > On CPU1 without patch: (the prev_state for fio is "S", it's voluntarily
> scheduled)
> >-0 [001] d...  6725.392308: sched_switch:
> prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==>
> next_comm=fio next_pid=14342 next_prio=120
> >   fio-14342 [001] d...  6725.392332: sched_switch: prev_comm=fio
> prev_pid=14342 prev_prio=120 prev_state=S ==> next_comm=swapper/1
> next_pid=0 next_prio=120
> >-0 [001] d...  6725.392356: sched_switch:
> prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==>
> next_comm=fio next_pid=14342 next_prio=120
> >   fio-14342 [001] d...  6725.392425: sched_switch:
> > prev_comm=fio prev_pid=14342 prev_prio=120 prev_state=S ==>
> > next_comm=swapper/1 next_pid=0 next_prio=12


RE: [PATCH 1/4] softirq: implement IRQ flood detection mechanism

2019-09-17 Thread Long Li
>Subject: Re: [PATCH 1/4] softirq: implement IRQ flood detection mechanism
>
>Hey Ming,
>
 Ok, so the real problem is per-cpu bounded tasks.

 I share Thomas opinion about a NAPI like approach.
>>>
>>> We already have that, its irq_poll, but it seems that for this
>>> use-case, we get lower performance for some reason. I'm not entirely
>>> sure why that is, maybe its because we need to mask interrupts
>>> because we don't have an "arm" register in nvme like network devices
>>> have?
>>
>> Long observed that IOPS drops much too by switching to threaded irq.
>> If softirqd is waken up for handing softirq, the performance shouldn't
>> be better than threaded irq.
>
>Its true that it shouldn't be any faster, but what irqpoll already has and we
>don't need to reinvent is a proper budgeting mechanism that needs to occur
>when multiple devices map irq vectors to the same cpu core.
>
>irqpoll already maintains a percpu list and dispatch the ->poll with a budget
>that the backend enforces and irqpoll multiplexes between them.
>Having this mechanism in irq (hard or threaded) context sounds unnecessary a
>bit.
>
>It seems like we're attempting to stay in irq context for as long as we can
>instead of scheduling to softirq/thread context if we have more than a
>minimal amount of work to do. Without at least understanding why
>softirq/thread degrades us so much this code seems like the wrong approach
>to me. Interrupt context will always be faster, but it is not a sufficient 
>reason
>to spend as much time as possible there, is it?
>
>We should also keep in mind, that the networking stack has been doing this
>for years, I would try to understand why this cannot work for nvme before
>dismissing.
>
>> Especially, Long found that context
>> switch is increased a lot after applying your irq poll patch.
>>
>> http://lists.infradead.org/pipermail/linux-nvme/2019-August/026788.html
>
>Oh, I didn't see that one, wonder why... thanks!
>
>5% improvement, I guess we can buy that for other users as is :)
>
>If we suffer from lots of context switches while the CPU is flooded with
>interrupts, then I would argue that we're re-raising softirq too much.
>In this use-case, my assumption is that the cpu cannot keep up with the
>interrupts and not that it doesn't reap enough (we also reap the first batch in
>interrupt context...)
>
>Perhaps making irqpoll continue until it must resched would improve things
>further? Although this is a latency vs. efficiency tradeoff, looks like
>MAX_SOFTIRQ_TIME is set to 2ms:
>
>"
>  * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
>  * certain cases, such as stop_machine(), jiffies may cease to
>  * increment and so we need the MAX_SOFTIRQ_RESTART limit as
>  * well to make sure we eventually return from this method.
>  *
>  * These limits have been established via experimentation.
>  * The two things to balance is latency against fairness -
>  * we want to handle softirqs as soon as possible, but they
>  * should not be able to lock up the box.
>"
>
>Long, does this patch make any difference?

Sagi,

Sorry it took a while to bring my system back online.

With the patch, the IOPS drop is about the same as with the 1st patch. I think the 
excessive context switches are causing the drop in IOPS.

The following are captured by "perf sched record" for 30 seconds during tests.

"perf sched latency"
With patch:
  fio:(82)  | 937632.706 ms |  1782255 | avg:0.209 ms | max:   
63.123 ms | max at:768.274023 s

without patch:
  fio:(82)  |2348323.432 ms |18848 | avg:0.295 ms | max:   
28.446 ms | max at:   6447.310255 s

Look closer at each CPU, we can see ksoftirqd is competing CPU with fio (and 
effectively throttle other fio processes)
(captured in /sys/kernel/debug/tracing, echo sched:* >set_event)

On CPU1 with patch: (note that the prev_state for fio is "R", it's preemptively 
scheduled)
   <...>-4077  [001] d... 66456.805062: sched_switch: prev_comm=fio 
prev_pid=4077 prev_prio=120 prev_state=R ==> next_comm=ksoftirqd/1 next_pid=17 
next_prio=120
   <...>-17[001] d... 66456.805859: sched_switch: 
prev_comm=ksoftirqd/1 prev_pid=17 prev_prio=120 prev_state=S ==> next_comm=fio 
next_pid=4077 next_prio=120
   <...>-4077  [001] d... 66456.844049: sched_switch: prev_comm=fio 
prev_pid=4077 prev_prio=120 prev_state=R ==> next_comm=ksoftirqd/1 next_pid=17 
next_prio=120
   <...>-17[001] d... 66456.844607: sched_switch: 
prev_comm=ksoftirqd/1 prev_pid=17 prev_prio=120 prev_state=S ==> next_comm=fio 
next_pid=4077 next_prio=120

On CPU1 without patch: (the prev_state for fio is "S", it's voluntarily 

RE: [Patch v4] storvsc: setup 1:1 mapping between hardware queue and CPU queue

2019-09-06 Thread Long Li
>Subject: Re: [Patch v4] storvsc: setup 1:1 mapping between hardware queue
>and CPU queue
>
>On Fri, Sep 06, 2019 at 10:24:20AM -0700, lon...@linuxonhyperv.com wrote:
>>From: Long Li 
>>
>>storvsc doesn't use a dedicated hardware queue for a given CPU queue.
>>When issuing I/O, it selects returning CPU (hardware queue) dynamically
>>based on vmbus channel usage across all channels.
>>
>>This patch advertises num_present_cpus() as number of hardware queues.
>>This will have upper layer setup 1:1 mapping between hardware queue and
>>CPU queue and avoid unnecessary locking when issuing I/O.
>>
>>Signed-off-by: Long Li 
>>---
>>
>>Changes:
>>v2: rely on default upper layer function to map queues. (suggested by
>>Ming Lei
>>)
>>v3: use num_present_cpus() instead of num_online_cpus(). Hyper-v
>>doesn't support hot-add CPUs. (suggested by Michael Kelley
>>)
>>v4: move change logs to after Signed-of-by
>>
>> drivers/scsi/storvsc_drv.c | 3 +--
>> 1 file changed, 1 insertion(+), 2 deletions(-)
>>
>>diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
>>index b89269120a2d..cf987712041a 100644
>>--- a/drivers/scsi/storvsc_drv.c
>>+++ b/drivers/scsi/storvsc_drv.c
>>@@ -1836,8 +1836,7 @@ static int storvsc_probe(struct hv_device *device,
>>  /*
>>   * Set the number of HW queues we are supporting.
>>   */
>>- if (stor_device->num_sc != 0)
>>- host->nr_hw_queues = stor_device->num_sc + 1;
>>+ host->nr_hw_queues = num_present_cpus();
>
>Just looking at the change notes for v3: why isn't this
>num_active_cpus() then? One can still isolate CPUs on hyper-v, no?

An isolated CPU can be brought online at run time. For example, even if maxcpus=x 
is put on the boot line, individual CPUs can still be brought online/offline.

>
>--
>Thanks,
>Sasha
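
To make the 1:1 mapping concrete, here is a trivial standalone sketch (not the
block layer's actual map_queues code) of how a default CPU-to-hardware-queue
assignment degenerates to an identity map once nr_hw_queues equals the number of
CPUs, so each CPU submits on its own queue without cross-CPU contention. The
function name and CPU count are illustrative.

#include <stdio.h>

/* Simplified stand-in for a default cpu -> hw queue mapping. */
static unsigned int map_cpu_to_hw_queue(unsigned int cpu,
					unsigned int nr_hw_queues)
{
	return cpu % nr_hw_queues;
}

int main(void)
{
	unsigned int nr_cpus = 8;

	/* With nr_hw_queues == nr_cpus, every CPU owns its own queue. */
	for (unsigned int cpu = 0; cpu < nr_cpus; cpu++)
		printf("cpu %u -> hw queue %u\n", cpu,
		       map_cpu_to_hw_queue(cpu, nr_cpus));
	return 0;
}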


RE: [PATCH 1/4] softirq: implement IRQ flood detection mechanism

2019-09-06 Thread Long Li
>Subject: Re: [PATCH 1/4] softirq: implement IRQ flood detection mechanism
>
>On Fri, Sep 06, 2019 at 09:48:21AM +0800, Ming Lei wrote:
>> When one IRQ flood happens on one CPU:
>>
>> 1) softirq handling on this CPU can't make progress
>>
>> 2) kernel thread bound to this CPU can't make progress
>>
>> For example, network may require softirq to xmit packets, or another
>> irq thread for handling keyboards/mice or whatever, or rcu_sched may
>> depend on that CPU for making progress, then the irq flood stalls the
>> whole system.
>>
>> >
>> > AFAIU, there are fast medium where the responses to requests are
>> > faster than the time to process them, right?
>>
>> Usually medium may not be faster than CPU, now we are talking about
>> interrupts, which can be originated from lots of devices concurrently,
>> for example, in Long Li'test, there are 8 NVMe drives involved.
>
>Why are all 8 nvmes sharing the same CPU for interrupt handling?
>Shouldn't matrix_find_best_cpu_managed() handle selecting the least used
>CPU from the cpumask for the effective interrupt handling?

The tests run on 10 NVMe disks on a system of 80 CPUs. Each NVMe disk has 32 
hardware queues.
It seems matrix_find_best_cpu_managed() has done its job, but we may still have 
CPUs that service several hardware queues mapped from other issuing CPUs.
Another thing to consider is that there may be other managed interrupts on the 
system, so NVMe interrupts may not end up evenly distributed on such a system.


RE: [PATCH 1/4] softirq: implement IRQ flood detection mechanism

2019-09-05 Thread Long Li
>Subject: Re: [PATCH 1/4] softirq: implement IRQ flood detection mechanism
>
>
>On 06/09/2019 03:22, Long Li wrote:
>[ ... ]
>>
>
>> Tracing shows that the CPU was in either hardirq or softirq all the
>> time before warnings. During tests, the system was unresponsive at
>> times.
>>
>> Ming's patch fixed this problem. The system was responsive throughout
>> tests.
>>
>> As for performance hit, both resulted in a small drop in peak IOPS.
>> With IRQ_TIME_ACCOUNTING I see a 3% drop. With Ming's patch it is 1%
>> drop.
>
>Do you mean IRQ_TIME_ACCOUNTING + irq threaded ?

It's just IRQ_TIME_ACCOUNTING.

>
>
>> For the tests, I used the following fio command on 10 NVMe disks:
>> fio --bs=4k --ioengine=libaio --iodepth=128
>> --filename=/dev/nvme0n1:/dev/nvme1n1:/dev/nvme2n1:/dev/nvme3n1:/dev/nvme4n1:/dev/nvme5n1:/dev/nvme6n1:/dev/nvme7n1:/dev/nvme8n1:/dev/nvme9n1
>> --direct=1 --runtime=12000 --numjobs=80 --rw=randread --name=test
>> --group_reporting --gtod_reduce=1
>
>--
>
><http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>
>Follow Linaro:
><http://www.facebook.com/pages/Linaro> Facebook |
><http://twitter.com/#!/linaroorg> Twitter |
><http://www.linaro.org/linaro-blog/> Blog



RE: [Patch v3] storvsc: setup 1:1 mapping between hardware queue and CPU queue

2019-09-05 Thread Long Li
>Subject: RE: [Patch v3] storvsc: setup 1:1 mapping between hardware queue
>and CPU queue
>
>From: Long Li  Sent: Thursday, September 5, 2019 3:55
>PM
>>
>> storvsc doesn't use a dedicated hardware queue for a given CPU queue.
>> When issuing I/O, it selects returning CPU (hardware queue)
>> dynamically based on vmbus channel usage across all channels.
>>
>> This patch advertises num_present_cpus() as number of hardware queues.
>> This will have upper layer setup 1:1 mapping between hardware queue
>> and CPU queue and avoid unnecessary locking when issuing I/O.
>>
>> Changes:
>> v2: rely on default upper layer function to map queues. (suggested by
>> Ming Lei
>> )
>> v3: use num_present_cpus() instead of num_online_cpus(). Hyper-v
>> doesn't support hot-add CPUs. (suggested by Michael Kelley
>> )
>
>I've mostly seen the "Changes:" section placed below the "---" so that it
>doesn't clutter up the commit log.  But maybe there's not a strong
>requirement one way or the other as I didn't find anything called out in the
>"Documentation/process"
>directory.

Should I resubmit the patch (but keep it v3)?

>
>Michael
>
>>
>> Signed-off-by: Long Li 
>> ---
>>  drivers/scsi/storvsc_drv.c | 3 +--
>>  1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
>> index b89269120a2d..cf987712041a 100644
>> --- a/drivers/scsi/storvsc_drv.c
>> +++ b/drivers/scsi/storvsc_drv.c
>> @@ -1836,8 +1836,7 @@ static int storvsc_probe(struct hv_device *device,
>>  /*
>>   * Set the number of HW queues we are supporting.
>>   */
>> -if (stor_device->num_sc != 0)
>> -host->nr_hw_queues = stor_device->num_sc + 1;
>> +host->nr_hw_queues = num_present_cpus();
>>
>>  /*
>>   * Set the error handler work queue.
>> --
>> 2.17.1



RE: [PATCH 1/4] softirq: implement IRQ flood detection mechanism

2019-09-05 Thread Long Li
>Subject: Re: [PATCH 1/4] softirq: implement IRQ flood detection mechanism
>
>
>Hi Ming,
>
>On 05/09/2019 11:06, Ming Lei wrote:
>> On Wed, Sep 04, 2019 at 07:31:48PM +0200, Daniel Lezcano wrote:
>>> Hi,
>>>
>>> On 04/09/2019 19:07, Bart Van Assche wrote:
 On 9/3/19 12:50 AM, Daniel Lezcano wrote:
> On 03/09/2019 09:28, Ming Lei wrote:
>> On Tue, Sep 03, 2019 at 08:40:35AM +0200, Daniel Lezcano wrote:
>>> It is a scheduler problem then ?
>>
>> Scheduler can do nothing if the CPU is taken completely by
>> handling interrupt & softirq, so seems not a scheduler problem, IMO.
>
> Why? If there is a irq pressure on one CPU reducing its capacity,
> the scheduler will balance the tasks on another CPU, no?

 Only if CONFIG_IRQ_TIME_ACCOUNTING has been enabled. However, I
 don't know any Linux distro that enables that option. That's
 probably because that option introduces two rdtsc() calls in each
 interrupt. Given the overhead introduced by this option, I don't
 think this is the solution Ming is looking for.
>>>
>>> Was this overhead reported somewhere ?
>>
>> The syscall of gettimeofday() calls ktime_get_real_ts64() which
>> finally calls tk_clock_read() which calls rdtsc too.
>>
>> But gettimeofday() is often used in fast path, and block IO_STAT needs
>> to read it too.
>>
>>>
 See also irqtime_account_irq() in kernel/sched/cputime.c.
>>>
>>> From my POV, this framework could be interesting to detect this situation.
>>
>> Now we are talking about IRQ_TIME_ACCOUNTING instead of
>IRQ_TIMINGS,
>> and the former one could be used to implement the detection. And the
>> only sharing should be the read of timestamp.
>
>You did not share yet the analysis of the problem (the kernel warnings give
>the symptoms) and gave the reasoning for the solution. It is hard to
>understand what you are looking for exactly and how to connect the dots.
>
>AFAIU, there are fast medium where the responses to requests are faster
>than the time to process them, right?
>
>I don't see how detecting IRQ flooding and use a threaded irq is the solution,
>can you explain?
>
>If the responses are coming at a very high rate, whatever the solution
>(interrupts, threaded interrupts, polling), we are still in the same situation.
>
>My suggestion was initially to see if the interrupt load will be taken into
>accounts in the cpu load and favorize task migration with the scheduler load
>balance to a less loaded CPU, thus the CPU processing interrupts will end up
>doing only that while other CPUs will handle the "threaded" side.
>
>Beside that, I'm wondering if the block scheduler should be somehow
>involved in that [1]
>
>  -- Daniel

Hi Daniel

I want to share some test results with IRQ_TIME_ACCOUNTING. During the tests, the 
kernel produced warnings about RCU stalls and soft lockups:

An example of an RCU stall:
[ 3016.148250] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[ 3016.148299] rcu: 66-: (1 GPs behind) idle=cc2/0/0x3 
softirq=10037/10037 fqs=4717
[ 3016.148299]  (detected by 27, t=15011 jiffies, g=45173, q=17194)
[ 3016.148299] Sending NMI from CPU 27 to CPUs 66:
[ 3016.148299] NMI backtrace for cpu 66
[ 3016.148299] CPU: 66 PID: 0 Comm: swapper/66 Tainted: G L
5.3.0-rc6+ #68
[ 3016.148299] Hardware name: Microsoft Corporation Virtual Machine/Virtual 
Machine, BIOS 090007  05/18/2018
[ 3016.148299] RIP: 0010:0x9c4740013003
[ 3016.148299] Code: Bad RIP value.
[ 3016.148299] RSP: 0018:9c4759acc8d0 EFLAGS: 0046
[ 3016.148299] RAX:  RBX: 0080 RCX: 0001000b
[ 3016.148299] RDX: 00fb RSI: 9c4740013000 RDI: 9c4759acc920
[ 3016.148299] RBP: 9c4759acc920 R08: 0008 R09: 01484a845c6de350
[ 3016.148299] R10: 9c4759accd30 R11: 0001 R12: 00fb
[ 3016.148299] R13: 0042 R14: 8a7d9b771f80 R15: 01e1
[ 3016.148299] FS:  () GS:8afd9f88() 
knlGS:
[ 3016.148299] CS:  0010 DS:  ES:  CR0: 80050033
[ 3016.148299] CR2: 9c4740012fd9 CR3: 00208b9bc000 CR4: 003406e0
[ 3016.148299] Call Trace:
[ 3016.148299]  
[ 3016.148299]  ? __send_ipi_mask+0x145/0x2e0
[ 3016.148299]  ? __send_ipi_one+0x3a/0x60
[ 3016.148299]  ? hv_send_ipi+0x10/0x30
[ 3016.148299]  ? generic_exec_single+0x63/0xe0
[ 3016.148299]  ? smp_call_function_single_async+0x1f/0x40
[ 3016.148299]  ? blk_mq_complete_request+0xdf/0xf0
[ 3016.148299]  ? nvme_irq+0x144/0x240 [nvme]
[ 3016.148299]  ? tick_sched_do_timer+0x80/0x80
[ 3016.148299]  ? __handle_irq_event_percpu+0x40/0x190
[ 3016.148299]  ? handle_irq_event_percpu+0x30/0x70
[ 3016.148299]  ? handle_irq_event+0x36/0x60
[ 3016.148299]  ? handle_edge_irq+0x7e/0x190
[ 3016.148299]  ? handle_irq+0x1c/0x30
[ 3016.148299]  ? do_IRQ+0x49/0xd0
[ 3016.148299]  ? common_interrupt+0xf/0xf
[ 3016.148299]  ? common_interrupt+0xa/0xf
[ 3016.148299]  ? __do_softirq+0x76/0x2e3

RE: [PATCH 1/4] softirq: implement IRQ flood detection mechanism

2019-08-29 Thread Long Li
>>>For some high performance IO devices, interrupt may come very frequently,
>>>meantime IO request completion may take a bit time. Especially on some
>>>devices(SCSI or NVMe), IO requests can be submitted concurrently from
>>>multiple CPU cores, however IO completion is only done on one of these
>>>submission CPU cores.
>>>
>>>Then IRQ flood can be easily triggered, and CPU lockup.
>>>
>>>Implement one simple generic CPU IRQ flood detection mechanism. This
>>>mechanism uses the CPU average interrupt interval to evaluate if IRQ flood
>>>is triggered. The Exponential Weighted Moving Average(EWMA) is used to
>>>compute CPU average interrupt interval.
>>>
>>>Cc: Long Li 
>>>Cc: Ingo Molnar ,
>>>Cc: Peter Zijlstra 
>>>Cc: Keith Busch 
>>>Cc: Jens Axboe 
>>>Cc: Christoph Hellwig 
>>>Cc: Sagi Grimberg 
>>>Cc: John Garry 
>>>Cc: Thomas Gleixner 
>>>Cc: Hannes Reinecke 
>>>Cc: linux-n...@lists.infradead.org
>>>Cc: linux-s...@vger.kernel.org
>>>Signed-off-by: Ming Lei 
>>>---
>>> drivers/base/cpu.c  | 25 ++
>>> include/linux/hardirq.h |  2 ++
>>> kernel/softirq.c| 46
>>>+
>>> 3 files changed, 73 insertions(+)
>>>
>>>diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index
>>>cc37511de866..7277d1aa0906 100644
>>>--- a/drivers/base/cpu.c
>>>+++ b/drivers/base/cpu.c
>>>@@ -20,6 +20,7 @@
>>> #include 
>>> #include 
>>> #include 
>>>+#include 
>>>
>>> #include "base.h"
>>>
>>>@@ -183,10 +184,33 @@ static struct attribute_group
>>>crash_note_cpu_attr_group = {  };  #endif
>>>
>>>+static ssize_t show_irq_interval(struct device *dev,
>>>+ struct device_attribute *attr, char *buf) {
>>>+struct cpu *cpu = container_of(dev, struct cpu, dev);
>>>+ssize_t rc;
>>>+int cpunum;
>>>+
>>>+cpunum = cpu->dev.id;
>>>+
>>>+rc = sprintf(buf, "%llu\n", irq_get_avg_interval(cpunum));
>>>+return rc;
>>>+}
>>>+
>>>+static DEVICE_ATTR(irq_interval, 0400, show_irq_interval, NULL); static
>>>+struct attribute *irq_interval_cpu_attrs[] = {
>>>+_attr_irq_interval.attr,
>>>+NULL
>>>+};
>>>+static struct attribute_group irq_interval_cpu_attr_group = {
>>>+.attrs = irq_interval_cpu_attrs,
>>>+};
>>>+
>>> static const struct attribute_group *common_cpu_attr_groups[] = {  #ifdef
>>>CONFIG_KEXEC
>>> _note_cpu_attr_group,
>>> #endif
>>>+_interval_cpu_attr_group,
>>> NULL
>>> };
>>>
>>>@@ -194,6 +218,7 @@ static const struct attribute_group
>>>*hotplugable_cpu_attr_groups[] = {  #ifdef CONFIG_KEXEC
>>> _note_cpu_attr_group,
>>> #endif
>>>+_interval_cpu_attr_group,
>>> NULL
>>> };
>>>
>>>diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index
>>>da0af631ded5..fd394060ddb3 100644
>>>--- a/include/linux/hardirq.h
>>>+++ b/include/linux/hardirq.h
>>>@@ -8,6 +8,8 @@
>>> #include 
>>> #include 
>>>
>>>+extern u64 irq_get_avg_interval(int cpu); extern bool
>>>+irq_flood_detected(void);
>>>
>>> extern void synchronize_irq(unsigned int irq);  extern bool
>>>synchronize_hardirq(unsigned int irq); diff --git a/kernel/softirq.c
>>>b/kernel/softirq.c index 0427a86743a4..96e01669a2e0 100644
>>>--- a/kernel/softirq.c
>>>+++ b/kernel/softirq.c
>>>@@ -25,6 +25,7 @@
>>> #include 
>>> #include 
>>> #include 
>>>+#include 
>>>
>>> #define CREATE_TRACE_POINTS
>>> #include 
>>>@@ -52,6 +53,12 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
>>>EXPORT_PER_CPU_SYMBOL(irq_stat);  #endif
>>>
>>>+struct irq_interval {
>>>+u64 last_irq_end;
>>>+u64 avg;
>>>+};
>>>+DEFINE_PER_CPU(struct irq_interval, avg_irq_interval);
>>>+
>>> static struct softirq_action softirq_vec[NR_SOFTIRQS]
>>>__cacheline_aligned_in_smp;
>>>
>>> DEFINE_PER_CPU(struct task_struct *, ksoftirqd); @@ -339,6 +346,41 @@
>>>asmlinkage __visible void do_softirq(void)

RE: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

2019-08-23 Thread Long Li
>>>Subject: Re: [PATCH 3/3] nvme: complete request in work queue on CPU
>>>with flooded interrupts
>>>
>>>
 Sagi,

 Here are the test results.

 Benchmark command:
 fio --bs=4k --ioengine=libaio --iodepth=64
 --
>>>filename=/dev/nvme0n1:/dev/nvme1n1:/dev/nvme2n1:/dev/nvme3n1:/d
>>>ev/nv

>>>me4n1:/dev/nvme5n1:/dev/nvme6n1:/dev/nvme7n1:/dev/nvme8n1:/dev
>>>/nvme9n1
 --direct=1 --runtime=90 --numjobs=80 --rw=randread --name=test
 --group_reporting --gtod_reduce=1

 With your patch: 1720k IOPS
 With threaded interrupts: 1320k IOPS
 With just interrupts: 3720k IOPS

 Interrupts are the fastest but we need to find a way to throttle it.
>>>
>>>This is the workload that generates the flood?
>>>If so I did not expect that this would be the perf difference..
>>>
>>>If completions keep pounding on the cpu, I would expect irqpoll to simply
>>>keep running forever and poll the cqs. There is no fundamental reason why
>>>polling would be faster in an interrupt, what matters could be:
>>>1. we reschedule more than we need to
>>>2. we don't reap enough completions in every poll round, which will trigger
>>>rearming the interrupt and then when it fires reschedule another softirq...
>>>

Yes, I think it's the rescheduling that adds the cost. With the patch there is a 
lot of ksoftirqd activity (compared to nearly none without the patch).
A 90-second FIO run shows a big difference in context switches across all CPUs:
With patch: 5755849
Without patch: 1462931

>>>Maybe we need to take care of some irq_poll optimizations?
>>>
>>>Does this (untested) patch make any difference?
>>>--
>>>diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 2f17b488d58e..0e94183eba15
>>>100644
>>>--- a/lib/irq_poll.c
>>>+++ b/lib/irq_poll.c
>>>@@ -12,7 +12,8 @@
>>>  #include 
>>>  #include 
>>>
>>>-static unsigned int irq_poll_budget __read_mostly = 256;
>>>+static unsigned int irq_poll_budget __read_mostly = 3000; unsigned int
>>>+__read_mostly irqpoll_budget_usecs = 2000;
>>>
>>>  static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
>>>
>>>@@ -77,32 +78,26 @@ EXPORT_SYMBOL(irq_poll_complete);
>>>
>>>  static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
>>>  {
>>>-   struct list_head *list = this_cpu_ptr(_cpu_iopoll);
>>>-   int rearm = 0, budget = irq_poll_budget;
>>>-   unsigned long start_time = jiffies;
>>>+   struct list_head *irqpoll_list = this_cpu_ptr(_cpu_iopoll);
>>>+   unsigned int budget = irq_poll_budget;
>>>+   unsigned long time_limit =
>>>+   jiffies + usecs_to_jiffies(irqpoll_budget_usecs);
>>>+   LIST_HEAD(list);
>>>
>>> local_irq_disable();
>>>+   list_splice_init(irqpoll_list, );
>>>+   local_irq_enable();
>>>
>>>-   while (!list_empty(list)) {
>>>+   while (!list_empty()) {
>>> struct irq_poll *iop;
>>> int work, weight;
>>>
>>>-   /*
>>>-* If softirq window is exhausted then punt.
>>>-*/
>>>-   if (budget <= 0 || time_after(jiffies, start_time)) {
>>>-   rearm = 1;
>>>-   break;
>>>-   }
>>>-
>>>-   local_irq_enable();
>>>-
>>> /* Even though interrupts have been re-enabled, this
>>>  * access is safe because interrupts can only add new
>>>  * entries to the tail of this list, and only ->poll()
>>>  * calls can remove this head entry from the list.
>>>  */
>>>-   iop = list_entry(list->next, struct irq_poll, list);
>>>+   iop = list_first_entry(, struct irq_poll, list);
>>>
>>> weight = iop->weight;
>>> work = 0;
>>>@@ -111,8 +106,6 @@ static void __latent_entropy irq_poll_softirq(struct
>>>softirq_action *h)
>>>
>>> budget -= work;
>>>
>>>-   local_irq_disable();
>>>-
>>> /*
>>>  * Drivers must not modify the iopoll state, if they
>>>  * consume their assigned weight (or more, some drivers 
>>> can't @@
>>>-125,11 +118,21 @@ static void __latent_entropy irq_poll_softirq(struct
>>>softirq_action *h)
>>> if (test_bit(IRQ_POLL_F_DISABLE, >state))
>>> __irq_poll_complete(iop);
>>> else
>>>-   list_move_tail(>list, list);
>>>+   list_move_tail(>list, );
>>> }
>>>+
>>>+   /*
>>>+* If softirq window is exhausted then punt.
>>>+*/
>>>+   if (budget <= 0 || time_after_eq(jiffies, time_limit))
>>>+   break;
>>> }
>>>
>>>-   if (rearm)
>>>+   local_irq_disable();
>>>+
>>>+   list_splice_tail_init(irqpoll_list, );
>>>+   list_splice(, irqpoll_list);
>>>+   if 

RE: [Patch v2] storvsc: setup 1:1 mapping between hardware queue and CPU queue

2019-08-22 Thread Long Li
>>>Subject: RE: [Patch v2] storvsc: setup 1:1 mapping between hardware
>>>queue and CPU queue
>>>
>>>>>>Subject: RE: [Patch v2] storvsc: setup 1:1 mapping between hardware
>>>>>>queue and CPU queue
>>>>>>
>>>>>>From: Long Li  Sent: Thursday, August 22,
>>>>>>2019
>>>>>>1:42 PM
>>>>>>>
>>>>>>> storvsc doesn't use a dedicated hardware queue for a given CPU
>>>queue.
>>>>>>> When issuing I/O, it selects returning CPU (hardware queue)
>>>>>>> dynamically based on vmbus channel usage across all channels.
>>>>>>>
>>>>>>> This patch advertises num_possible_cpus() as number of hardware
>>>>>>> queues. This will have upper layer setup 1:1 mapping between
>>>>>>> hardware queue and CPU queue and avoid unnecessary locking when
>>>issuing I/O.
>>>>>>>
>>>>>>> Changes:
>>>>>>> v2: rely on default upper layer function to map queues. (suggested
>>>>>>> by Ming Lei
>>>>>>> )
>>>>>>>
>>>>>>> Signed-off-by: Long Li 
>>>>>>> ---
>>>>>>>  drivers/scsi/storvsc_drv.c | 3 +--
>>>>>>>  1 file changed, 1 insertion(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/scsi/storvsc_drv.c
>>>>>>> b/drivers/scsi/storvsc_drv.c index b89269120a2d..dfd3b76a4f89
>>>>>>> 100644
>>>>>>> --- a/drivers/scsi/storvsc_drv.c
>>>>>>> +++ b/drivers/scsi/storvsc_drv.c
>>>>>>> @@ -1836,8 +1836,7 @@ static int storvsc_probe(struct hv_device
>>>>>>*device,
>>>>>>> /*
>>>>>>>  * Set the number of HW queues we are supporting.
>>>>>>>  */
>>>>>>> -   if (stor_device->num_sc != 0)
>>>>>>> -   host->nr_hw_queues = stor_device->num_sc + 1;
>>>>>>> +   host->nr_hw_queues = num_possible_cpus();
>>>>>>
>>>>>>For a lot of the VM sizes in Azure, num_possible_cpus() is 128, even
>>>>>>if the VM has only 4 or 8 or some other smaller number of vCPUs.
>>>>>>So I'm wondering if you really want num_present_cpus() here instead,
>>>>>>which would include only the vCPUs that actually exist in the VM.
>>>
>>>I think reporting num_possible_cpus() doesn't do more harm or take more
>>>resources. Because block layer allocates map for all the possible CPUs.
>>>
>>>The actual mapping is done in blk_mq_map_queues(), and it iterates all the
>>>possible CPUs. If we report num_present_cpus(), the rest of the CPUs also
>>>need to be mapped.

Actually I get your point: reporting num_present_cpus() results in fewer 
struct blk_mq_hw_ctx being created, so it saves memory.

If we don't plan to support adding/onlining CPUs, we should use 
num_present_cpus().
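
For reference, here is a rough sketch of what the default mapping ends up doing 
(a simplification for illustration only, not the actual blk_mq_map_queues() code; 
the field names follow struct blk_mq_queue_map):

/*
 * Simplified illustration: spread all possible CPUs over the advertised
 * hardware queues. With nr_hw_queues == num_present_cpus() the present
 * CPUs land roughly 1:1 on queues, while not-present CPUs still get an
 * entry in the map.
 */
static void sketch_map_queues(struct blk_mq_queue_map *qmap)
{
	unsigned int cpu, q = 0;

	for_each_possible_cpu(cpu) {
		qmap->mq_map[cpu] = qmap->queue_offset + (q % qmap->nr_queues);
		q++;
	}
}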

>>>
>>>>>>
>>>>>>Michael
>>>>>>
>>>>>>>
>>>>>>> /*
>>>>>>>  * Set the error handler work queue.
>>>>>>> --
>>>>>>> 2.17.1



RE: [Patch v2] storvsc: setup 1:1 mapping between hardware queue and CPU queue

2019-08-22 Thread Long Li
>>>Subject: RE: [Patch v2] storvsc: setup 1:1 mapping between hardware
>>>queue and CPU queue
>>>
>>>From: Long Li  Sent: Thursday, August 22, 2019
>>>1:42 PM
>>>>
>>>> storvsc doesn't use a dedicated hardware queue for a given CPU queue.
>>>> When issuing I/O, it selects returning CPU (hardware queue)
>>>> dynamically based on vmbus channel usage across all channels.
>>>>
>>>> This patch advertises num_possible_cpus() as number of hardware
>>>> queues. This will have upper layer setup 1:1 mapping between hardware
>>>> queue and CPU queue and avoid unnecessary locking when issuing I/O.
>>>>
>>>> Changes:
>>>> v2: rely on default upper layer function to map queues. (suggested by
>>>> Ming Lei
>>>> )
>>>>
>>>> Signed-off-by: Long Li 
>>>> ---
>>>>  drivers/scsi/storvsc_drv.c | 3 +--
>>>>  1 file changed, 1 insertion(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
>>>> index b89269120a2d..dfd3b76a4f89 100644
>>>> --- a/drivers/scsi/storvsc_drv.c
>>>> +++ b/drivers/scsi/storvsc_drv.c
>>>> @@ -1836,8 +1836,7 @@ static int storvsc_probe(struct hv_device
>>>*device,
>>>>/*
>>>> * Set the number of HW queues we are supporting.
>>>> */
>>>> -  if (stor_device->num_sc != 0)
>>>> -  host->nr_hw_queues = stor_device->num_sc + 1;
>>>> +  host->nr_hw_queues = num_possible_cpus();
>>>
>>>For a lot of the VM sizes in Azure, num_possible_cpus() is 128, even if the
>>>VM has only 4 or 8 or some other smaller number of vCPUs.
>>>So I'm wondering if you really want num_present_cpus() here instead,
>>>which would include only the vCPUs that actually exist in the VM.

I think reporting num_possible_cpus() doesn't do more harm or take more 
resources, because the block layer allocates the map for all possible CPUs anyway.

The actual mapping is done in blk_mq_map_queues(), which iterates over all the 
possible CPUs. Even if we report num_present_cpus(), the rest of the CPUs still 
need to be mapped.

>>>
>>>Michael
>>>
>>>>
>>>>/*
>>>> * Set the error handler work queue.
>>>> --
>>>> 2.17.1



RE: [PATCH] storvsc: setup 1:1 mapping between hardware queue and CPU queue

2019-08-22 Thread Long Li
>>>Subject: Re: [PATCH] storvsc: setup 1:1 mapping between hardware queue
>>>and CPU queue
>>>
>>>On Tue, Aug 20, 2019 at 3:36 AM  wrote:
>>>>
>>>> From: Long Li 
>>>>
>>>> storvsc doesn't use a dedicated hardware queue for a given CPU queue.
>>>> When issuing I/O, it selects returning CPU (hardware queue)
>>>> dynamically based on vmbus channel usage across all channels.
>>>>
>>>> This patch sets up a 1:1 mapping between hardware queue and CPU
>>>queue,
>>>> thus avoiding unnecessary locking at upper layer when issuing I/O.
>>>>
>>>> Signed-off-by: Long Li 
>>>> ---
>>>>  drivers/scsi/storvsc_drv.c | 16 ++--
>>>>  1 file changed, 14 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
>>>> index b89269120a2d..26c16d40ec46 100644
>>>> --- a/drivers/scsi/storvsc_drv.c
>>>> +++ b/drivers/scsi/storvsc_drv.c
>>>> @@ -1682,6 +1682,18 @@ static int storvsc_queuecommand(struct
>>>Scsi_Host *host, struct scsi_cmnd *scmnd)
>>>> return 0;
>>>>  }
>>>>
>>>> +static int storvsc_map_queues(struct Scsi_Host *shost) {
>>>> +   unsigned int cpu;
>>>> +   struct blk_mq_queue_map *qmap =
>>>> +>tag_set.map[HCTX_TYPE_DEFAULT];
>>>> +
>>>> +   for_each_possible_cpu(cpu) {
>>>> +   qmap->mq_map[cpu] = cpu;
>>>> +   }
>>>
>>>Block layer provides the helper of blk_mq_map_queues(), so suggest you
>>>to use the default cpu mapping, instead of inventing a new one.

Thanks for the pointer. I'm sending a v2.

Long

>>>
>>>thanks,
>>>Ming Lei


RE: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

2019-08-21 Thread Long Li
>>>Subject: RE: [PATCH 3/3] nvme: complete request in work queue on CPU
>>>with flooded interrupts
>>>
>>>>>>Subject: Re: [PATCH 3/3] nvme: complete request in work queue on
>>>CPU
>>>>>>with flooded interrupts
>>>>>>
>>>>>>
>>>>>>> From: Long Li 
>>>>>>>
>>>>>>> When a NVMe hardware queue is mapped to several CPU queues, it is
>>>>>>> possible that the CPU this hardware queue is bound to is flooded by
>>>>>>> returning I/O for other CPUs.
>>>>>>>
>>>>>>> For example, consider the following scenario:
>>>>>>> 1. CPU 0, 1, 2 and 3 share the same hardware queue 2. the hardware
>>>>>>> queue interrupts CPU 0 for I/O response 3. processes from CPU 1, 2
>>>>>>> and
>>>>>>> 3 keep sending I/Os
>>>>>>>
>>>>>>> CPU 0 may be flooded with interrupts from NVMe device that are I/O
>>>>>>> responses for CPU 1, 2 and 3. Under heavy I/O load, it is possible
>>>>>>> that CPU 0 spends all the time serving NVMe and other system
>>>>>>> interrupts, but doesn't have a chance to run in process context.
>>>>>>>
>>>>>>> To fix this, CPU 0 can schedule a work to complete the I/O request
>>>>>>> when it detects the scheduler is not making progress. This serves
>>>>>>> multiple
>>>>>>purposes:
>>>>>>>
>>>>>>> 1. This CPU has to be scheduled to complete the request. The other
>>>>>>> CPUs can't issue more I/Os until some previous I/Os are completed.
>>>>>>> This helps this CPU get out of NVMe interrupts.
>>>>>>>
>>>>>>> 2. This acts a throttling mechanisum for NVMe devices, in that it
>>>>>>> can not starve a CPU while servicing I/Os from other CPUs.
>>>>>>>
>>>>>>> 3. This CPU can make progress on RCU and other work items on its
>>>queue.
>>>>>>
>>>>>>The problem is indeed real, but this is the wrong approach in my mind.
>>>>>>
>>>>>>We already have irqpoll which takes care proper budgeting polling
>>>>>>cycles and not hogging the cpu.
>>>>>>
>>>>>>I've sent rfc for this particular problem before [1]. At the time
>>>>>>IIRC, Christoph suggested that we will poll the first batch directly
>>>>>>from the irq context and reap the rest in irqpoll handler.
>>>
>>>Thanks for the pointer. I will test and report back.

Sagi,

Here are the test results.

Benchmark command:
fio --bs=4k --ioengine=libaio --iodepth=64 
--filename=/dev/nvme0n1:/dev/nvme1n1:/dev/nvme2n1:/dev/nvme3n1:/dev/nvme4n1:/dev/nvme5n1:/dev/nvme6n1:/dev/nvme7n1:/dev/nvme8n1:/dev/nvme9n1
 --direct=1 --runtime=90 --numjobs=80 --rw=randread --name=test 
--group_reporting --gtod_reduce=1

With your patch: 1720k IOPS
With threaded interrupts: 1320k IOPS
With just interrupts: 3720k IOPS

Interrupts are the fastest but we need to find a way to throttle it.

Thanks

Long


>>>
>>>>>>
>>>>>>[1]:
>>>>>>https://nam06.safelinks.protection.outlook.com/?url=http%3A%2F%2Fl
>>>ists.
>>>>>>infradead.org%2Fpipermail%2Flinux-nvme%2F2016-
>>>>>>October%2F006497.htmldata=02%7C01%7Clongli%40microsoft.co
>>>m%
>>>>>>7C0ebf36eff15c4182116608d725948b93%7C72f988bf86f141af91ab2d7cd0
>>>11d
>>>>>>b47%7C1%7C0%7C637019192254250361sdata=fJ%2Fkc8HLSmfzaY
>>>3BY
>>>>>>E66zlZKD6FjcXgMJZzVGCVqI%2FU%3Dreserved=0
>>>>>>
>>>>>>How about something like this instead:
>>>>>>--
>>>>>>diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index
>>>>>>71127a366d3c..84bf16d75109 100644
>>>>>>--- a/drivers/nvme/host/pci.c
>>>>>>+++ b/drivers/nvme/host/pci.c
>>>>>>@@ -24,6 +24,7 @@
>>>>>>  #include 
>>>>>>  #include 
>>>>>>  #include 
>>>>>>+#include 
>>>>>>
>>>>>>  #include "trace.h"
>>>>>>  #include "nvme.h"
>>>>>>@@ -32,6 +33,7 @@
>>>>>>  #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvm

RE: [PATCH 0/3] fix interrupt swamp in NVMe

2019-08-21 Thread Long Li
>>>Subject: Re: [PATCH 0/3] fix interrupt swamp in NVMe
>>>
>>>On Wed, Aug 21, 2019 at 07:47:44AM +, Long Li wrote:
>>>> >>>Subject: Re: [PATCH 0/3] fix interrupt swamp in NVMe
>>>> >>>
>>>> >>>On 20/08/2019 09:25, Ming Lei wrote:
>>>> >>>> On Tue, Aug 20, 2019 at 2:14 PM  wrote:
>>>> >>>>>
>>>> >>>>> From: Long Li 
>>>> >>>>>
>>>> >>>>> This patch set tries to fix interrupt swamp in NVMe devices.
>>>> >>>>>
>>>> >>>>> On large systems with many CPUs, a number of CPUs may share
>>>one
>>>> >>>NVMe
>>>> >>>>> hardware queue. It may have this situation where several CPUs
>>>> >>>>> are issuing I/Os, and all the I/Os are returned on the CPU where
>>>> >>>>> the
>>>> >>>hardware queue is bound to.
>>>> >>>>> This may result in that CPU swamped by interrupts and stay in
>>>> >>>>> interrupt mode for extended time while other CPUs continue to
>>>> >>>>> issue I/O. This can trigger Watchdog and RCU timeout, and make
>>>> >>>>> the system
>>>> >>>unresponsive.
>>>> >>>>>
>>>> >>>>> This patch set addresses this by enforcing scheduling and
>>>> >>>>> throttling I/O when CPU is starved in this situation.
>>>> >>>>>
>>>> >>>>> Long Li (3):
>>>> >>>>>   sched: define a function to report the number of context switches
>>>on a
>>>> >>>>> CPU
>>>> >>>>>   sched: export idle_cpu()
>>>> >>>>>   nvme: complete request in work queue on CPU with flooded
>>>> >>>>> interrupts
>>>> >>>>>
>>>> >>>>>  drivers/nvme/host/core.c | 57
>>>> >>>>> +++-
>>>> >>>>>  drivers/nvme/host/nvme.h |  1 +
>>>> >>>>>  include/linux/sched.h|  2 ++
>>>> >>>>>  kernel/sched/core.c  |  7 +
>>>> >>>>>  4 files changed, 66 insertions(+), 1 deletion(-)
>>>> >>>>
>>>> >>>> Another simpler solution may be to complete request in threaded
>>>> >>>> interrupt handler for this case. Meantime allow scheduler to run
>>>> >>>> the interrupt thread handler on CPUs specified by the irq
>>>> >>>> affinity mask, which was discussed by the following link:
>>>> >>>>
>>>> >>>>
>>>> >>>https://lor
>>>> >>>e
>>>> >>>> .kernel.org%2Flkml%2Fe0e9478e-62a5-ca24-3b12-
>>>> >>>58f7d056383e%40huawei.com
>>>> >>>> %2Fdata=02%7C01%7Clongli%40microsoft.com%7Cc7f46d3e2
>>>73f45
>>>> >>>176d1c08
>>>> >>>>
>>>> >>>d7254cc69e%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C63
>>>70188
>>>> >>>8401588
>>>> >>>>
>>>> >>>9866sdata=h5k6HoGoyDxuhmDfuKLZUwgmw17PU%2BT%2FCb
>>>awfxV
>>>> >>>Er3U%3D
>>>> >>>> reserved=0
>>>> >>>>
>>>> >>>> Could you try the above solution and see if the lockup can be
>>>avoided?
>>>> >>>> John Garry
>>>> >>>> should have workable patch.
>>>> >>>
>>>> >>>Yeah, so we experimented with changing the interrupt handling in
>>>> >>>the SCSI driver I maintain to use a threaded handler IRQ handler
>>>> >>>plus patch below, and saw a significant throughput boost:
>>>> >>>
>>>> >>>--->8
>>>> >>>
>>>> >>>Subject: [PATCH] genirq: Add support to allow thread to use hard
>>>> >>>irq affinity
>>>> >>>
>>>> >>>Currently the cpu allowed mask for the threaded part of a threaded
>>>> >>>irq handler will be set to the effective affinity of the hard irq.
>>>> >>>

RE: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

2019-08-21 Thread Long Li
>>>Subject: Re: [PATCH 3/3] nvme: complete request in work queue on CPU
>>>with flooded interrupts
>>>
>>>
>>>> From: Long Li 
>>>>
>>>> When a NVMe hardware queue is mapped to several CPU queues, it is
>>>> possible that the CPU this hardware queue is bound to is flooded by
>>>> returning I/O for other CPUs.
>>>>
>>>> For example, consider the following scenario:
>>>> 1. CPU 0, 1, 2 and 3 share the same hardware queue 2. the hardware
>>>> queue interrupts CPU 0 for I/O response 3. processes from CPU 1, 2 and
>>>> 3 keep sending I/Os
>>>>
>>>> CPU 0 may be flooded with interrupts from NVMe device that are I/O
>>>> responses for CPU 1, 2 and 3. Under heavy I/O load, it is possible
>>>> that CPU 0 spends all the time serving NVMe and other system
>>>> interrupts, but doesn't have a chance to run in process context.
>>>>
>>>> To fix this, CPU 0 can schedule a work to complete the I/O request
>>>> when it detects the scheduler is not making progress. This serves multiple
>>>purposes:
>>>>
>>>> 1. This CPU has to be scheduled to complete the request. The other
>>>> CPUs can't issue more I/Os until some previous I/Os are completed.
>>>> This helps this CPU get out of NVMe interrupts.
>>>>
>>>> 2. This acts a throttling mechanisum for NVMe devices, in that it can
>>>> not starve a CPU while servicing I/Os from other CPUs.
>>>>
>>>> 3. This CPU can make progress on RCU and other work items on its queue.
>>>
>>>The problem is indeed real, but this is the wrong approach in my mind.
>>>
>>>We already have irqpoll which takes care proper budgeting polling cycles
>>>and not hogging the cpu.
>>>
>>>I've sent rfc for this particular problem before [1]. At the time IIRC,
>>>Christoph suggested that we will poll the first batch directly from the irq
>>>context and reap the rest in irqpoll handler.

Thanks for the pointer. I will test and report back.

>>>
>>>[1]:
>>>https://nam06.safelinks.protection.outlook.com/?url=http%3A%2F%2Flists.
>>>infradead.org%2Fpipermail%2Flinux-nvme%2F2016-
>>>October%2F006497.htmldata=02%7C01%7Clongli%40microsoft.com%
>>>7C0ebf36eff15c4182116608d725948b93%7C72f988bf86f141af91ab2d7cd011d
>>>b47%7C1%7C0%7C637019192254250361sdata=fJ%2Fkc8HLSmfzaY3BY
>>>E66zlZKD6FjcXgMJZzVGCVqI%2FU%3Dreserved=0
>>>
>>>How about something like this instead:
>>>--
>>>diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index
>>>71127a366d3c..84bf16d75109 100644
>>>--- a/drivers/nvme/host/pci.c
>>>+++ b/drivers/nvme/host/pci.c
>>>@@ -24,6 +24,7 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>>+#include 
>>>
>>>  #include "trace.h"
>>>  #include "nvme.h"
>>>@@ -32,6 +33,7 @@
>>>  #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
>>>
>>>  #define SGES_PER_PAGE  (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
>>>+#define NVME_POLL_BUDGET_IRQ   256
>>>
>>>  /*
>>>   * These can be higher, but we need to ensure that any command doesn't
>>>@@ -189,6 +191,7 @@ struct nvme_queue {
>>> u32 *dbbuf_cq_db;
>>> u32 *dbbuf_sq_ei;
>>> u32 *dbbuf_cq_ei;
>>>+   struct irq_poll iop;
>>> struct completion delete_done;
>>>  };
>>>
>>>@@ -1015,6 +1018,23 @@ static inline int nvme_process_cq(struct
>>>nvme_queue *nvmeq, u16 *start,
>>> return found;
>>>  }
>>>
>>>+static int nvme_irqpoll_handler(struct irq_poll *iop, int budget) {
>>>+   struct nvme_queue *nvmeq = container_of(iop, struct nvme_queue,
>>>iop);
>>>+   struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
>>>+   u16 start, end;
>>>+   int completed;
>>>+
>>>+   completed = nvme_process_cq(nvmeq, , , budget);
>>>+   nvme_complete_cqes(nvmeq, start, end);
>>>+   if (completed < budget) {
>>>+   irq_poll_complete(>iop);
>>>+   enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
>>>+   }
>>>+
>>>+   return completed;
>>>+}
>>>+
>>>  static irqreturn_t nvme_irq(int irq, void *data)
>>>  {
>>>

RE: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

2019-08-21 Thread Long Li
>>>Subject: Re: [PATCH 3/3] nvme: complete request in work queue on CPU
>>>with flooded interrupts
>>>
>>>On Mon, Aug 19, 2019 at 11:14:29PM -0700, lon...@linuxonhyperv.com
>>>wrote:
>>>> From: Long Li 
>>>>
>>>> When a NVMe hardware queue is mapped to several CPU queues, it is
>>>> possible that the CPU this hardware queue is bound to is flooded by
>>>> returning I/O for other CPUs.
>>>>
>>>> For example, consider the following scenario:
>>>> 1. CPU 0, 1, 2 and 3 share the same hardware queue 2. the hardware
>>>> queue interrupts CPU 0 for I/O response 3. processes from CPU 1, 2 and
>>>> 3 keep sending I/Os
>>>>
>>>> CPU 0 may be flooded with interrupts from NVMe device that are I/O
>>>> responses for CPU 1, 2 and 3. Under heavy I/O load, it is possible
>>>> that CPU 0 spends all the time serving NVMe and other system
>>>> interrupts, but doesn't have a chance to run in process context.
>>>
>>>Ideally -- and there is some code to affect this, the load-balancer will move
>>>tasks away from this CPU.
>>>
>>>> To fix this, CPU 0 can schedule a work to complete the I/O request
>>>> when it detects the scheduler is not making progress. This serves multiple
>>>purposes:
>>>
>>>Suppose the task waiting for the IO completion is a RT task, and you've just
>>>queued it to a regular work. This is an instant priority inversion.

This is a trade-off: we can either avoid "locking up" the CPU, or finish the I/O 
on time from the IRQ handler. Throttling only happens in extreme conditions, 
which are rare. The purpose is to make the whole system responsive and happy.

>>>
>>>> 1. This CPU has to be scheduled to complete the request. The other
>>>> CPUs can't issue more I/Os until some previous I/Os are completed.
>>>> This helps this CPU get out of NVMe interrupts.
>>>>
>>>> 2. This acts a throttling mechanisum for NVMe devices, in that it can
>>>> not starve a CPU while servicing I/Os from other CPUs.
>>>>
>>>> 3. This CPU can make progress on RCU and other work items on its queue.
>>>>
>>>> Signed-off-by: Long Li 
>>>> ---
>>>>  drivers/nvme/host/core.c | 57
>>>> +++-
>>>>  drivers/nvme/host/nvme.h |  1 +
>>>>  2 files changed, 57 insertions(+), 1 deletion(-)
>>>
>>>WTH does this live in the NVME driver? Surely something like this should be
>>>in the block layer. I'm thinking there's fiber channel connected storage that
>>>should be able to trigger much the same issues.

Yes, this can be done in the block layer. I'm not sure of the best way to 
accomplish it, so I implemented an NVMe patch to help test. The test results are 
promising in that we get 99.5% of the performance while avoiding CPU lockup. The 
challenge is to find a way to throttle a fast storage device.

>>>
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index
>>>> 6a9dd68c0f4f..576bb6fce293 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>
>>>> @@ -260,9 +270,54 @@ static void nvme_retry_req(struct request *req)
>>>>blk_mq_delay_kick_requeue_list(req->q, delay);  }
>>>>
>>>> +static void nvme_complete_rq_work(struct work_struct *work) {
>>>> +  struct nvme_request *nvme_rq =
>>>> +  container_of(work, struct nvme_request, work);
>>>> +  struct request *req = blk_mq_rq_from_pdu(nvme_rq);
>>>> +
>>>> +  nvme_complete_rq(req);
>>>> +}
>>>> +
>>>> +
>>>>  void nvme_complete_rq(struct request *req)  {
>>>> -  blk_status_t status = nvme_error_status(req);
>>>> +  blk_status_t status;
>>>> +  int cpu;
>>>> +  u64 switches;
>>>> +  struct nvme_request *nvme_rq;
>>>> +
>>>> +  if (!in_interrupt())
>>>> +  goto skip_check;
>>>> +
>>>> +  nvme_rq = nvme_req(req);
>>>> +  cpu = smp_processor_id();
>>>> +  if (idle_cpu(cpu))
>>>> +  goto skip_check;
>>>> +
>>>> +  /* Check if this CPU is flooded with interrupts */
>>>> +  switches = get_cpu_rq_switches(cpu);
>>>> +  if (this_cpu_read(last_switch) == switches) {
>>>> +  /*
>>>> +   * If this CPU hasn't 

RE: [PATCH 1/3] sched: define a function to report the number of context switches on a CPU

2019-08-21 Thread Long Li
>>>Subject: Re: [PATCH 1/3] sched: define a function to report the number of
>>>context switches on a CPU
>>>
>>>On Mon, Aug 19, 2019 at 11:14:27PM -0700, lon...@linuxonhyperv.com
>>>wrote:
>>>> From: Long Li 
>>>>
>>>> The number of context switches on a CPU is useful to determine how
>>>> busy this CPU is on processing IRQs. Export this information so it can
>>>> be used by device drivers.
>>>
>>>Please do explain that; because I'm not seeing how number of switches
>>>relates to processing IRQs _at_all_!

Some kernel components rely on context switches to make progress, for example the 
watchdog and RCU. A CPU under a reasonable interrupt load continues to make 
context switches, normally a number of switches per second.

While observing a CPU under heavy interrupt load, I see that it spends all its 
time in IRQ and softirq and doesn't get a chance to do a switch (calling 
__schedule()) for a long time. This makes the system unresponsive at times. The 
purpose is to detect when a CPU is in this state, and implement some throttling 
mechanism to help reduce the number of interrupts. The number of switches may not 
be the most precise way to detect this condition, but maybe it's good enough.
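
For reference, the check in patch 3/3 of this series boils down to roughly the 
sketch below (simplified for illustration; it reuses the helpers added in this 
series, get_cpu_rq_switches() and the per-CPU last_switch counter):

/*
 * Simplified sketch of the flood check from patch 3/3: if the per-CPU
 * context switch count has not moved since the last completion handled in
 * interrupt context, assume the CPU is flooded and defer the completion
 * to a work queue instead of finishing it in the IRQ handler.
 */
static bool cpu_looks_flooded(void)
{
	int cpu = smp_processor_id();
	u64 switches;

	if (!in_interrupt() || idle_cpu(cpu))
		return false;

	switches = get_cpu_rq_switches(cpu);
	if (this_cpu_read(last_switch) == switches)
		return true;		/* no __schedule() since the last check */

	this_cpu_write(last_switch, switches);
	return false;
}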

I agree this may not be the best way. If you have another idea for detecting that 
a CPU is swamped by interrupts, please point me to where to look.

Thanks

Long




RE: [PATCH 0/3] fix interrupt swamp in NVMe

2019-08-21 Thread Long Li
>>>Subject: Re: [PATCH 0/3] fix interrupt swamp in NVMe
>>>
>>>On 20/08/2019 09:25, Ming Lei wrote:
>>>> On Tue, Aug 20, 2019 at 2:14 PM  wrote:
>>>>>
>>>>> From: Long Li 
>>>>>
>>>>> This patch set tries to fix interrupt swamp in NVMe devices.
>>>>>
>>>>> On large systems with many CPUs, a number of CPUs may share one
>>>NVMe
>>>>> hardware queue. It may have this situation where several CPUs are
>>>>> issuing I/Os, and all the I/Os are returned on the CPU where the
>>>hardware queue is bound to.
>>>>> This may result in that CPU swamped by interrupts and stay in
>>>>> interrupt mode for extended time while other CPUs continue to issue
>>>>> I/O. This can trigger Watchdog and RCU timeout, and make the system
>>>unresponsive.
>>>>>
>>>>> This patch set addresses this by enforcing scheduling and throttling
>>>>> I/O when CPU is starved in this situation.
>>>>>
>>>>> Long Li (3):
>>>>>   sched: define a function to report the number of context switches on a
>>>>> CPU
>>>>>   sched: export idle_cpu()
>>>>>   nvme: complete request in work queue on CPU with flooded interrupts
>>>>>
>>>>>  drivers/nvme/host/core.c | 57
>>>>> +++-
>>>>>  drivers/nvme/host/nvme.h |  1 +
>>>>>  include/linux/sched.h|  2 ++
>>>>>  kernel/sched/core.c  |  7 +
>>>>>  4 files changed, 66 insertions(+), 1 deletion(-)
>>>>
>>>> Another simpler solution may be to complete request in threaded
>>>> interrupt handler for this case. Meantime allow scheduler to run the
>>>> interrupt thread handler on CPUs specified by the irq affinity mask,
>>>> which was discussed by the following link:
>>>>
>>>>
>>>https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Flor
>>>e
>>>> .kernel.org%2Flkml%2Fe0e9478e-62a5-ca24-3b12-
>>>58f7d056383e%40huawei.com
>>>> %2Fdata=02%7C01%7Clongli%40microsoft.com%7Cc7f46d3e273f45
>>>176d1c08
>>>>
>>>d7254cc69e%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C6370188
>>>8401588
>>>>
>>>9866sdata=h5k6HoGoyDxuhmDfuKLZUwgmw17PU%2BT%2FCbawfxV
>>>Er3U%3D
>>>> reserved=0
>>>>
>>>> Could you try the above solution and see if the lockup can be avoided?
>>>> John Garry
>>>> should have workable patch.
>>>
>>>Yeah, so we experimented with changing the interrupt handling in the SCSI
>>>driver I maintain to use a threaded handler IRQ handler plus patch below,
>>>and saw a significant throughput boost:
>>>
>>>--->8
>>>
>>>Subject: [PATCH] genirq: Add support to allow thread to use hard irq affinity
>>>
>>>Currently the cpu allowed mask for the threaded part of a threaded irq
>>>handler will be set to the effective affinity of the hard irq.
>>>
>>>Typically the effective affinity of the hard irq will be for a single cpu. 
>>>As such,
>>>the threaded handler would always run on the same cpu as the hard irq.
>>>
>>>We have seen scenarios in high data-rate throughput testing that the cpu
>>>handling the interrupt can be totally saturated handling both the hard
>>>interrupt and threaded handler parts, limiting throughput.
>>>
>>>Add IRQF_IRQ_AFFINITY flag to allow the driver requesting the threaded
>>>interrupt to decide on the policy of which cpu the threaded handler may run.
>>>
>>>Signed-off-by: John Garry 

Thanks for pointing me to this patch. It fixed the interrupt swamp and made 
the system stable.

However, I'm seeing reduced performance when using threaded interrupts.

Here are the test results on a system with 80 CPUs and 10 NVMe disks (32 
hardware queues for each disk). The benchmark tool is FIO; the I/O pattern is 
4k random reads on all NVMe disks, with queue depth = 64, number of jobs = 80, 
direct=1.

With threaded interrupts: 1320k IOPS
With just interrupts: 3720k IOPS
With just interrupts and my patch: 3700k IOPS

At peak IOPS, the overall CPU usage is around 98-99%. I think the cost of the 
wake-up and context switch for the NVMe threaded IRQ handler takes some 
CPU away.

In this test, I made the following change to make use of IRQF_IRQ_AFFINITY for 
NVMe:

diff --git a/drivers/pci/irq.c b/drivers/pci/irq.c
index a1de501a2729..3fb30

RE: [Patch (resend) 5/5] cifs: Call MID callback before destroying transport

2019-05-13 Thread Long Li
>>>-Original Message-
>>>From: Pavel Shilovsky 
>>>Sent: Thursday, May 9, 2019 11:01 AM
>>>To: Long Li 
>>>Cc: Steve French ; linux-cifs >>c...@vger.kernel.org>; samba-technical ;
>>>Kernel Mailing List 
>>>Subject: Re: [Patch (resend) 5/5] cifs: Call MID callback before destroying
>>>transport
>>>
>>>пт, 5 апр. 2019 г. в 14:39, Long Li :
>>>>
>>>> From: Long Li 
>>>>
>>>> When transport is being destroyed, it's possible that some processes
>>>> may hold memory registrations that need to be deregistred.
>>>>
>>>> Call them first so nobody is using transport resources, and it can be
>>>> destroyed.
>>>>
>>>> Signed-off-by: Long Li 
>>>> ---
>>>>  fs/cifs/connect.c | 36 +++-
>>>>  1 file changed, 19 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index
>>>> 33e4d98..084756cf 100644
>>>> --- a/fs/cifs/connect.c
>>>> +++ b/fs/cifs/connect.c
>>>> @@ -528,22 +528,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
>>>> /* do not want to be sending data on a socket we are freeing */
>>>> cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
>>>> mutex_lock(>srv_mutex);
>>>> -   if (server->ssocket) {
>>>> -   cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
>>>> -server->ssocket->state, server->ssocket->flags);
>>>> -   kernel_sock_shutdown(server->ssocket, SHUT_WR);
>>>> -   cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
>>>> -server->ssocket->state, server->ssocket->flags);
>>>> -   sock_release(server->ssocket);
>>>> -   server->ssocket = NULL;
>>>> -   } else if (cifs_rdma_enabled(server))
>>>> -   smbd_destroy(server);
>>>> -   server->sequence_number = 0;
>>>> -   server->session_estab = false;
>>>> -   kfree(server->session_key.response);
>>>> -   server->session_key.response = NULL;
>>>> -   server->session_key.len = 0;
>>>> -   server->lstrp = jiffies;
>>>>
>>>> /* mark submitted MIDs for retry and issue callback */
>>>> INIT_LIST_HEAD(_list);
>>>> @@ -556,7 +540,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
>>>> list_move(_entry->qhead, _list);
>>>> }
>>>> spin_unlock(_Lock);
>>>> -   mutex_unlock(>srv_mutex);
>>>>
>>>> cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
>>>> list_for_each_safe(tmp, tmp2, _list) { @@ -565,6 +548,25
>>>> @@ cifs_reconnect(struct TCP_Server_Info *server)
>>>> mid_entry->callback(mid_entry);
>>>> }
>>>
>>>The original call was issuing callbacks without holding srv_mutex - callbacks
>>>may take this mutex for its internal needs. With the proposed patch the
>>>code will deadlock.
>>>
>>>Also the idea of destroying the socket first is to allow possible retries 
>>>(from
>>>callbacks) to return a proper error instead of trying to send anything 
>>>through
>>>the reconnecting socket.

I will send a patch to revert this and follow your suggestion of moving 
smbd_destroy() to after all the MID callbacks have been issued. Your suggestion tested well.

Thanks

Long

>>>
>>>>
>>>> +   if (server->ssocket) {
>>>> +   cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
>>>> +server->ssocket->state, server->ssocket->flags);
>>>> +   kernel_sock_shutdown(server->ssocket, SHUT_WR);
>>>> +   cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
>>>> +server->ssocket->state, server->ssocket->flags);
>>>> +   sock_release(server->ssocket);
>>>> +   server->ssocket = NULL;
>>>> +   } else if (cifs_rdma_enabled(server))
>>>> +   smbd_destroy(server);
>>>
>>>If we need to call smbd_destroy() *after* callbacks, let's just move it alone
>>>without the rest of the code.
>>>
>>>
>>>> +   server->sequence_number = 0;
>>>> +   server->session_estab = false;
>>>> +   kfree(server->session_key.response);
>>>> +   server->session_key.response = NULL;
>>>> +   server->session_key.len = 0;
>>>> +   server->lstrp = jiffies;
>>>> +
>>>> +   mutex_unlock(>srv_mutex);
>>>> +
>>>> do {
>>>> try_to_freeze();
>>>>
>>>> --
>>>> 2.7.4
>>>>
>>>
>>>
>>>--
>>>Best regards,
>>>Pavel Shilovsky


RE: [PATCH] x86/hyper-v: implement EOI assist

2019-04-15 Thread Long Li
>>>Subject: Re: [PATCH] x86/hyper-v: implement EOI assist
>>>
>>>Vitaly Kuznetsov  writes:
>>>
>>>> Hyper-V TLFS suggests an optimization to avoid imminent VMExit on EOI:
>>>> "The OS performs an EOI by atomically writing zero to the EOI Assist
>>>> field of the virtual VP assist page and checking whether the "No EOI
>>>required"
>>>> field was previously zero. If it was, the OS must write to the
>>>> HV_X64_APIC_EOI MSR thereby triggering an intercept into the
>>>hypervisor."
>>>>
>>>> Implement the optimization in Linux.
>>>>
>>>
>>>Simon, Long,
>>>
>>>did you get a chance to run some tests with this?

I have run some tests on Azure L80s_v2.

With 10 NVMe disks in RAID 0, formatted with EXT4, I'm getting 2.6m max IOPS 
with the patch, compared to 2.55m IOPS before.

The VM has been running stably. Thank you!
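
For reference, the optimization described in the quoted commit message boils down 
to roughly the following (an illustrative sketch based only on the TLFS wording 
quoted above; the hv_apic_assist() accessor and the bit layout are assumptions, 
not the actual patch code):

/*
 * Sketch of the EOI assist flow: atomically clear the EOI Assist field in
 * the VP assist page; if "No EOI required" was already set, skip the MSR
 * write, otherwise write the EOI MSR and take the intercept.
 */
static void hv_eoi_sketch(u32 val)
{
	u32 *assist = hv_apic_assist();	/* assumed accessor for the VP assist page */

	if (assist && (xchg(assist, 0) & 0x1))
		return;			/* "No EOI required" was set */

	wrmsr(HV_X64_APIC_EOI, val, 0);	/* trigger the intercept */
}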

Tested-by: Long Li 

>>>
>>>--
>>>Vitaly


[PATCH] cifs: smbd: take an array of requests when sending upper layer data

2019-04-15 Thread Long Li
From: Long Li 

To support compounding, __smb_send_rqst() now sends an array of requests to
the transport layer.
Change smbd_send() to take an array of requests, and send them in as few
packets as possible.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 55 +++--
 fs/cifs/smbdirect.h |  3 ++-
 fs/cifs/transport.c |  2 +-
 3 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index b95354c..272bdf8 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -2075,7 +2075,8 @@ int smbd_recv(struct smbd_connection *info, struct msghdr 
*msg)
  * rqst: the data to write
  * return value: 0 if successfully write, otherwise error code
  */
-int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+int smbd_send(struct TCP_Server_Info *server,
+   int num_rqst, struct smb_rqst *rqst_array)
 {
struct smbd_connection *info = server->smbd_conn;
struct kvec vec;
@@ -2087,6 +2088,8 @@ int smbd_send(struct TCP_Server_Info *server, struct 
smb_rqst *rqst)
info->max_send_size - sizeof(struct smbd_data_transfer);
struct kvec *iov;
int rc;
+   struct smb_rqst *rqst;
+   int rqst_idx;
 
if (info->transport_status != SMBD_CONNECTED) {
rc = -EAGAIN;
@@ -2094,46 +2097,40 @@ int smbd_send(struct TCP_Server_Info *server, struct 
smb_rqst *rqst)
}
 
/*
-* Skip the RFC1002 length defined in MS-SMB2 section 2.1
-* It is used only for TCP transport in the iov[0]
-* In future we may want to add a transport layer under protocol
-* layer so this will only be issued to TCP transport
-*/
-
-   if (rqst->rq_iov[0].iov_len != 4) {
-   log_write(ERR, "expected the pdu length in 1st iov, but got 
%zu\n", rqst->rq_iov[0].iov_len);
-   return -EINVAL;
-   }
-
-   /*
 * Add in the page array if there is one. The caller needs to set
 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
 * ends at page boundary
 */
-   buflen = smb_rqst_len(server, rqst);
+   remaining_data_length = 0;
+   for (i = 0; i < num_rqst; i++)
+   remaining_data_length += smb_rqst_len(server, _array[i]);
 
-   if (buflen + sizeof(struct smbd_data_transfer) >
+   if (remaining_data_length + sizeof(struct smbd_data_transfer) >
info->max_fragmented_send_size) {
log_write(ERR, "payload size %d > max size %d\n",
-   buflen, info->max_fragmented_send_size);
+   remaining_data_length, info->max_fragmented_send_size);
rc = -EINVAL;
goto done;
}
 
-   iov = >rq_iov[1];
+   rqst_idx = 0;
+
+next_rqst:
+   rqst = _array[rqst_idx];
+   iov = rqst->rq_iov;
 
-   cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen);
-   for (i = 0; i < rqst->rq_nvec-1; i++)
+   cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
+   rqst_idx, smb_rqst_len(server, rqst));
+   for (i = 0; i < rqst->rq_nvec; i++)
dump_smb(iov[i].iov_base, iov[i].iov_len);
 
-   remaining_data_length = buflen;
 
-   log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
-   "rq_tailsz=%d buflen=%d\n",
-   rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
-   rqst->rq_tailsz, buflen);
+   log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
+   "rq_tailsz=%d buflen=%lu\n",
+   rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
+   rqst->rq_tailsz, smb_rqst_len(server, rqst));
 
-   start = i = iov[0].iov_len ? 0 : 1;
+   start = i = 0;
buflen = 0;
while (true) {
buflen += iov[i].iov_len;
@@ -2181,14 +2178,14 @@ int smbd_send(struct TCP_Server_Info *server, struct 
smb_rqst *rqst)
goto done;
}
i++;
-   if (i == rqst->rq_nvec-1)
+   if (i == rqst->rq_nvec)
break;
}
start = i;
buflen = 0;
} else {
i++;
-   if (i == rqst->rq_nvec-1) {
+   if (i == rqst->rq_nvec) {
/* send out all remaining vecs */
remaining_data_length -= buflen;
log_write(INFO,
@@ -2232,6 +2229,10 @@ int smbd_send

[Patch (resend) 2/5] cifs: smbd: Return EINTR when interrupted

2019-04-05 Thread Long Li
From: Long Li 

When packets are waiting for outbound I/O and are interrupted, return the
proper error code to the user process.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 7259427..df95c75 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1972,7 +1972,7 @@ static int smbd_recv_buf(struct smbd_connection *info, 
char *buf,
info->transport_status != SMBD_CONNECTED);
/* Don't return any data if interrupted */
if (rc)
-   return -ENODEV;
+   return rc;
 
if (info->transport_status != SMBD_CONNECTED) {
log_read(ERR, "disconnected\n");
-- 
2.7.4



[Patch (resend) 1/5] cifs: smbd: Don't destroy transport on RDMA disconnect

2019-04-05 Thread Long Li
From: Long Li 

Now that the upper layer is handling transport shutdown and reconnect, remove
the code that handled transport shutdown on RDMA disconnect.

Signed-off-by: Long Li 
---
 fs/cifs/cifs_debug.c |   8 ++--
 fs/cifs/smbdirect.c  | 120 +++
 fs/cifs/smbdirect.h  |   9 
 3 files changed, 10 insertions(+), 127 deletions(-)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 19ed9ab..5ff0b3d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -312,12 +312,10 @@ static int cifs_debug_data_proc_show(struct seq_file *m, 
void *v)
atomic_read(>smbd_conn->send_credits),
atomic_read(>smbd_conn->receive_credits),
server->smbd_conn->receive_credit_target);
-   seq_printf(m, "\nPending send_pending: %x send_payload_pending:"
-   " %x smbd_send_pending: %x smbd_recv_pending: %x",
+   seq_printf(m, "\nPending send_pending: %x "
+   "send_payload_pending: %x",
atomic_read(>smbd_conn->send_pending),
-   atomic_read(>smbd_conn->send_payload_pending),
-   server->smbd_conn->smbd_send_pending,
-   server->smbd_conn->smbd_recv_pending);
+   atomic_read(>smbd_conn->send_payload_pending));
seq_printf(m, "\nReceive buffers count_receive_queue: %x "
"count_empty_packet_queue: %x",
server->smbd_conn->count_receive_queue,
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 06449cf..7259427 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -164,95 +164,6 @@ do {   
\
 #define log_rdma_mr(level, fmt, args...) \
log_rdma(level, LOG_RDMA_MR, fmt, ##args)
 
-/*
- * Destroy the transport and related RDMA and memory resources
- * Need to go through all the pending counters and make sure on one is using
- * the transport while it is destroyed
- */
-static void smbd_destroy_rdma_work(struct work_struct *work)
-{
-   struct smbd_response *response;
-   struct smbd_connection *info =
-   container_of(work, struct smbd_connection, destroy_work);
-   unsigned long flags;
-
-   log_rdma_event(INFO, "destroying qp\n");
-   ib_drain_qp(info->id->qp);
-   rdma_destroy_qp(info->id);
-
-   /* Unblock all I/O waiting on the send queue */
-   wake_up_interruptible_all(>wait_send_queue);
-
-   log_rdma_event(INFO, "cancelling idle timer\n");
-   cancel_delayed_work_sync(>idle_timer_work);
-   log_rdma_event(INFO, "cancelling send immediate work\n");
-   cancel_delayed_work_sync(>send_immediate_work);
-
-   log_rdma_event(INFO, "wait for all send to finish\n");
-   wait_event(info->wait_smbd_send_pending,
-   info->smbd_send_pending == 0);
-
-   log_rdma_event(INFO, "wait for all recv to finish\n");
-   wake_up_interruptible(>wait_reassembly_queue);
-   wait_event(info->wait_smbd_recv_pending,
-   info->smbd_recv_pending == 0);
-
-   log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
-   wait_event(info->wait_send_pending,
-   atomic_read(>send_pending) == 0);
-   wait_event(info->wait_send_payload_pending,
-   atomic_read(>send_payload_pending) == 0);
-
-   log_rdma_event(INFO, "freeing mr list\n");
-   wake_up_interruptible_all(>wait_mr);
-   wait_event(info->wait_for_mr_cleanup,
-   atomic_read(>mr_used_count) == 0);
-   destroy_mr_list(info);
-
-   /* It's not posssible for upper layer to get to reassembly */
-   log_rdma_event(INFO, "drain the reassembly queue\n");
-   do {
-   spin_lock_irqsave(>reassembly_queue_lock, flags);
-   response = _get_first_reassembly(info);
-   if (response) {
-   list_del(>list);
-   spin_unlock_irqrestore(
-   >reassembly_queue_lock, flags);
-   put_receive_buffer(info, response);
-   } else
-   spin_unlock_irqrestore(>reassembly_queue_lock, 
flags);
-   } while (response);
-
-   info->reassembly_data_length = 0;
-
-   log_rdma_event(INFO, "free receive buffers\n");
-   wait_event(info->wait_receive_queues,
-   info->count_receive_queue + info->count_empty_packet_queue
-   == info->receive_credit_max);
-   destroy_receive_buffers(info);
-
-   ib

[Patch (resend) 3/5] cifs: smbd: Indicate to retry on transport sending failure

2019-04-05 Thread Long Li
From: Long Li 

Failure to send a packet doesn't mean it's a permanent failure, so it can't be
returned to the user process. This I/O should be retried or failed based on the
server's packet response and transport health. This logic is handled by the
upper layer.

Give this decision to the upper layer.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index df95c75..05b05e7 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -853,7 +853,7 @@ static int smbd_create_header(struct smbd_connection *info,
 
if (info->transport_status != SMBD_CONNECTED) {
log_outgoing(ERR, "disconnected not sending\n");
-   return -ENOENT;
+   return -EAGAIN;
}
atomic_dec(>send_credits);
 
@@ -979,6 +979,7 @@ static int smbd_post_send(struct smbd_connection *info,
wake_up(>wait_send_pending);
}
smbd_disconnect_rdma_connection(info);
+   rc = -EAGAIN;
} else
/* Reset timer for idle connection after packet is sent */
mod_delayed_work(info->workqueue, >idle_timer_work,
@@ -2085,7 +2086,7 @@ int smbd_send(struct TCP_Server_Info *server, struct 
smb_rqst *rqst)
int rc;
 
if (info->transport_status != SMBD_CONNECTED) {
-   rc = -ENODEV;
+   rc = -EAGAIN;
goto done;
}
 
-- 
2.7.4



[Patch (resend) 4/5] cifs: smbd: Retry on memory registration failure

2019-04-05 Thread Long Li
From: Long Li 

Memory registration failure doesn't mean this I/O has failed; it means the
transport is hitting an I/O error or needs to reconnect. This error is not from
the server.

Indicate this error to the upper layer, and let the upper layer decide how to
reconnect and proceed with this I/O.

Signed-off-by: Long Li 
---
 fs/cifs/smb2pdu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 21ad01d..ff3b730 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3220,7 +3220,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
rdata->nr_pages, rdata->page_offset,
rdata->tailsz, true, need_invalidate);
if (!rdata->mr)
-   return -ENOBUFS;
+   return -EAGAIN;
 
req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
if (need_invalidate)
@@ -3624,7 +3624,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
wdata->nr_pages, wdata->page_offset,
wdata->tailsz, false, need_invalidate);
if (!wdata->mr) {
-   rc = -ENOBUFS;
+   rc = -EAGAIN;
goto async_writev_out;
}
req->Length = 0;
-- 
2.7.4



[Patch (resend) 5/5] cifs: Call MID callback before destroying transport

2019-04-05 Thread Long Li
From: Long Li 

When the transport is being destroyed, it's possible that some processes may
hold memory registrations that need to be deregistered.

Call the MID callbacks first so nobody is using transport resources, and the
transport can then be destroyed.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 33e4d98..084756cf 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -528,22 +528,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
/* do not want to be sending data on a socket we are freeing */
cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
mutex_lock(>srv_mutex);
-   if (server->ssocket) {
-   cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
-server->ssocket->state, server->ssocket->flags);
-   kernel_sock_shutdown(server->ssocket, SHUT_WR);
-   cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
-server->ssocket->state, server->ssocket->flags);
-   sock_release(server->ssocket);
-   server->ssocket = NULL;
-   } else if (cifs_rdma_enabled(server))
-   smbd_destroy(server);
-   server->sequence_number = 0;
-   server->session_estab = false;
-   kfree(server->session_key.response);
-   server->session_key.response = NULL;
-   server->session_key.len = 0;
-   server->lstrp = jiffies;
 
/* mark submitted MIDs for retry and issue callback */
INIT_LIST_HEAD(_list);
@@ -556,7 +540,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
list_move(_entry->qhead, _list);
}
spin_unlock(_Lock);
-   mutex_unlock(>srv_mutex);
 
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_safe(tmp, tmp2, _list) {
@@ -565,6 +548,25 @@ cifs_reconnect(struct TCP_Server_Info *server)
mid_entry->callback(mid_entry);
}
 
+   if (server->ssocket) {
+   cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
+server->ssocket->state, server->ssocket->flags);
+   kernel_sock_shutdown(server->ssocket, SHUT_WR);
+   cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
+server->ssocket->state, server->ssocket->flags);
+   sock_release(server->ssocket);
+   server->ssocket = NULL;
+   } else if (cifs_rdma_enabled(server))
+   smbd_destroy(server);
+   server->sequence_number = 0;
+   server->session_estab = false;
+   kfree(server->session_key.response);
+   server->session_key.response = NULL;
+   server->session_key.len = 0;
+   server->lstrp = jiffies;
+
+   mutex_unlock(>srv_mutex);
+
do {
try_to_freeze();
 
-- 
2.7.4



[Patch v2 2/2] CIFS: Fix an issue with re-sending rdata when transport returning -EAGAIN

2019-03-15 Thread Long Li
From: Long Li 

When sending an rdata, the transport may return -EAGAIN. In this case
we should re-obtain credits because the session may have been
reconnected.

Change in v2: adjust_credits before re-sending

Signed-off-by: Long Li 
---
 fs/cifs/file.c | 71 +-
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 321df1d27422..9d90cc07e38b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3347,44 +3347,55 @@ static int cifs_resend_rdata(struct cifs_readdata 
*rdata,
struct TCP_Server_Info *server =
tlink_tcon(rdata->cfile->tlink)->ses->server;
 
-   /*
-* Wait for credits to resend this rdata.
-* Note: we are attempting to resend the whole rdata not in segments
-*/
do {
-   rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+   if (rdata->cfile->invalidHandle) {
+   rc = cifs_reopen_file(rdata->cfile, true);
+   if (rc == -EAGAIN)
+   continue;
+   else if (rc)
+   break;
+   }
+
+   /*
+* Wait for credits to resend this rdata.
+* Note: we are attempting to resend the whole rdata not in
+* segments
+*/
+   do {
+   rc = server->ops->wait_mtu_credits(server, rdata->bytes,
, );
 
-   if (rc)
-   goto out;
+   if (rc)
+   goto fail;
 
-   if (rsize < rdata->bytes) {
-   add_credits_and_wake_if(server, , 0);
-   msleep(1000);
-   }
-   } while (rsize < rdata->bytes);
+   if (rsize < rdata->bytes) {
+   add_credits_and_wake_if(server, , 0);
+   msleep(1000);
+   }
+   } while (rsize < rdata->bytes);
+   rdata->credits = credits;
 
-   rdata->credits = credits;
-   rc = -EAGAIN;
-   while (rc == -EAGAIN) {
-   rc = 0;
-   if (rdata->cfile->invalidHandle)
-   rc = cifs_reopen_file(rdata->cfile, true);
-   if (!rc)
-   rc = server->ops->async_readv(rdata);
-   }
+   rc = adjust_credits(server, >credits, rdata->bytes);
+   if (!rc) {
+   if (rdata->cfile->invalidHandle)
+   rc = -EAGAIN;
+   else
+   rc = server->ops->async_readv(rdata);
+   }
 
-   if (!rc) {
-   /* Add to aio pending list */
-   list_add_tail(>list, rdata_list);
-   return 0;
-   }
+   /* If the read was successfully sent, we are done */
+   if (!rc) {
+   /* Add to aio pending list */
+   list_add_tail(>list, rdata_list);
+   return 0;
+   }
 
-   add_credits_and_wake_if(server, >credits, 0);
-out:
-   kref_put(>refcount,
-   cifs_uncached_readdata_release);
+   /* Roll back credits and retry if needed */
+   add_credits_and_wake_if(server, >credits, 0);
+   } while (rc == -EAGAIN);
 
+fail:
+   kref_put(>refcount, cifs_uncached_readdata_release);
return rc;
 }
 
-- 
2.14.1



[Patch v2 1/2] CIFS: Fix an issue with re-sending wdata when transport returning -EAGAIN

2019-03-15 Thread Long Li
From: Long Li 

When sending a wdata, the transport may return -EAGAIN. In this case
we should re-obtain credits, because the session may have been
reconnected.

Change in v2: adjust_credits before re-sending

Signed-off-by: Long Li 
---
 fs/cifs/file.c | 77 ++
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9b53f33137b3..321df1d27422 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2620,43 +2620,56 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct 
list_head *wdata_list,
struct TCP_Server_Info *server =
tlink_tcon(wdata->cfile->tlink)->ses->server;
 
-   /*
-* Wait for credits to resend this wdata.
-* Note: we are attempting to resend the whole wdata not in segments
-*/
do {
-   rc = server->ops->wait_mtu_credits(server, wdata->bytes, ,
-  );
+   if (wdata->cfile->invalidHandle) {
+   rc = cifs_reopen_file(wdata->cfile, false);
+   if (rc == -EAGAIN)
+   continue;
+   else if (rc)
+   break;
+   }
 
-   if (rc)
-   goto out;
 
-   if (wsize < wdata->bytes) {
-   add_credits_and_wake_if(server, , 0);
-   msleep(1000);
-   }
-   } while (wsize < wdata->bytes);
+   /*
+* Wait for credits to resend this wdata.
+* Note: we are attempting to resend the whole wdata not in
+* segments
+*/
+   do {
+   rc = server->ops->wait_mtu_credits(server, wdata->bytes,
+   , );
+   if (rc)
+   goto fail;
+
+   if (wsize < wdata->bytes) {
+   add_credits_and_wake_if(server, , 0);
+   msleep(1000);
+   }
+   } while (wsize < wdata->bytes);
+   wdata->credits = credits;
 
-   wdata->credits = credits;
-   rc = -EAGAIN;
-   while (rc == -EAGAIN) {
-   rc = 0;
-   if (wdata->cfile->invalidHandle)
-   rc = cifs_reopen_file(wdata->cfile, false);
-   if (!rc)
-   rc = server->ops->async_writev(wdata,
+   rc = adjust_credits(server, >credits, wdata->bytes);
+
+   if (!rc) {
+   if (wdata->cfile->invalidHandle)
+   rc = -EAGAIN;
+   else
+   rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
-   }
+   }
 
-   if (!rc) {
-   list_add_tail(>list, wdata_list);
-   return 0;
-   }
+   /* If the write was successfully sent, we are done */
+   if (!rc) {
+   list_add_tail(>list, wdata_list);
+   return 0;
+   }
 
-   add_credits_and_wake_if(server, >credits, 0);
-out:
-   kref_put(>refcount, cifs_uncached_writedata_release);
+   /* Roll back credits and retry if needed */
+   add_credits_and_wake_if(server, >credits, 0);
+   } while (rc == -EAGAIN);
 
+fail:
+   kref_put(>refcount, cifs_uncached_writedata_release);
return rc;
 }
 
@@ -2884,12 +2897,12 @@ static void collect_uncached_write_data(struct 
cifs_aio_ctx *ctx)
wdata->bytes, _from,
ctx->cfile, cifs_sb, _list,
ctx);
+
+   kref_put(>refcount,
+   
cifs_uncached_writedata_release);
}
 
list_splice(_list, >list);
-
-   kref_put(>refcount,
-cifs_uncached_writedata_release);
goto restart_loop;
}
}
-- 
2.14.1



[PATCH 1/2] CIFS: Fix a bug with re-sending wdata when transport returning -EAGAIN

2019-03-01 Thread Long Li
From: Long Li 

When sending a wdata, the transport may return -EAGAIN. In this case
we should re-obtain credits, because the session may have been
reconnected.

Signed-off-by: Long Li 
---
 fs/cifs/file.c | 61 +-
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9b53f33137b3..08e73759d6ec 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2620,43 +2620,44 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct 
list_head *wdata_list,
struct TCP_Server_Info *server =
tlink_tcon(wdata->cfile->tlink)->ses->server;
 
-   /*
-* Wait for credits to resend this wdata.
-* Note: we are attempting to resend the whole wdata not in segments
-*/
do {
-   rc = server->ops->wait_mtu_credits(server, wdata->bytes, ,
-  );
-
-   if (rc)
-   goto out;
-
-   if (wsize < wdata->bytes) {
-   add_credits_and_wake_if(server, , 0);
-   msleep(1000);
-   }
-   } while (wsize < wdata->bytes);
+   /*
+* Wait for credits to resend this wdata.
+* Note: we are attempting to resend the whole wdata not in
+* segments
+*/
+   do {
+   rc = server->ops->wait_mtu_credits(server, wdata->bytes,
+   , );
+   if (rc)
+   goto fail;
+
+   if (wsize < wdata->bytes) {
+   add_credits_and_wake_if(server, , 0);
+   msleep(1000);
+   }
+   } while (wsize < wdata->bytes);
 
-   wdata->credits = credits;
-   rc = -EAGAIN;
-   while (rc == -EAGAIN) {
+   wdata->credits = credits;
rc = 0;
if (wdata->cfile->invalidHandle)
rc = cifs_reopen_file(wdata->cfile, false);
if (!rc)
rc = server->ops->async_writev(wdata,
-   cifs_uncached_writedata_release);
-   }
+   cifs_uncached_writedata_release);
 
-   if (!rc) {
-   list_add_tail(>list, wdata_list);
-   return 0;
-   }
+   /* If the write was successfully sent, we are done */
+   if (!rc) {
+   list_add_tail(>list, wdata_list);
+   return 0;
+   }
 
-   add_credits_and_wake_if(server, >credits, 0);
-out:
-   kref_put(>refcount, cifs_uncached_writedata_release);
+   /* Roll back credits and retry if needed */
+   add_credits_and_wake_if(server, >credits, 0);
+   } while (rc == -EAGAIN);
 
+fail:
+   kref_put(>refcount, cifs_uncached_writedata_release);
return rc;
 }
 
@@ -2884,12 +2885,12 @@ static void collect_uncached_write_data(struct 
cifs_aio_ctx *ctx)
wdata->bytes, _from,
ctx->cfile, cifs_sb, _list,
ctx);
+
+   kref_put(>refcount,
+   
cifs_uncached_writedata_release);
}
 
list_splice(_list, >list);
-
-   kref_put(>refcount,
-cifs_uncached_writedata_release);
goto restart_loop;
}
}
-- 
2.17.1



[PATCH 2/2] CIFS: Fix a bug with re-sending rdata when transport returning -EAGAIN

2019-03-01 Thread Long Li
From: Long Li 

When sending an rdata, the transport may return -EAGAIN. In this case
we should re-obtain credits, because the session may have been
reconnected.

Signed-off-by: Long Li 
---
 fs/cifs/file.c | 51 +-
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 08e73759d6ec..c83ca96f883b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3335,44 +3335,45 @@ static int cifs_resend_rdata(struct cifs_readdata 
*rdata,
struct TCP_Server_Info *server =
tlink_tcon(rdata->cfile->tlink)->ses->server;
 
-   /*
-* Wait for credits to resend this rdata.
-* Note: we are attempting to resend the whole rdata not in segments
-*/
do {
-   rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+   /*
+* Wait for credits to resend this rdata.
+* Note: we are attempting to resend the whole rdata not in
+* segments
+*/
+   do {
+   rc = server->ops->wait_mtu_credits(server, rdata->bytes,
, );
 
-   if (rc)
-   goto out;
+   if (rc)
+   goto fail;
 
-   if (rsize < rdata->bytes) {
-   add_credits_and_wake_if(server, , 0);
-   msleep(1000);
-   }
-   } while (rsize < rdata->bytes);
+   if (rsize < rdata->bytes) {
+   add_credits_and_wake_if(server, , 0);
+   msleep(1000);
+   }
+   } while (rsize < rdata->bytes);
 
-   rdata->credits = credits;
-   rc = -EAGAIN;
-   while (rc == -EAGAIN) {
+   rdata->credits = credits;
rc = 0;
if (rdata->cfile->invalidHandle)
rc = cifs_reopen_file(rdata->cfile, true);
if (!rc)
rc = server->ops->async_readv(rdata);
-   }
 
-   if (!rc) {
-   /* Add to aio pending list */
-   list_add_tail(>list, rdata_list);
-   return 0;
-   }
+   /* If the read was successfully sent, we are done */
+   if (!rc) {
+   /* Add to aio pending list */
+   list_add_tail(>list, rdata_list);
+   return 0;
+   }
 
-   add_credits_and_wake_if(server, >credits, 0);
-out:
-   kref_put(>refcount,
-   cifs_uncached_readdata_release);
+   /* Roll back credits and retry if needed */
+   add_credits_and_wake_if(server, >credits, 0);
+   } while (rc == -EAGAIN);
 
+fail:
+   kref_put(>refcount, cifs_uncached_readdata_release);
return rc;
 }
 
-- 
2.17.1



[PATCH] CIFS: use the correct length when pinning memory for direct I/O for write

2018-12-16 Thread Long Li
From: Long Li 

The current code attempts to pin memory using the largest possible wsize
based on the current SMB credits. This doesn't cause a kernel oops, but it is
not optimal as we may pin more pages than actually needed.

Fix this by only pinning what is needed for this write I/O.
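
A tiny stand-alone sketch of the idea, with illustrative sizes (this is not
CIFS code): compute the length this request will actually send first, and pin
only that much, rather than a full wsize worth of pages.

#include <stdio.h>

static size_t min_size(size_t a, size_t b) { return a < b ? a : b; }

int main(void)
{
        size_t len = 64 * 1024;       /* bytes this write still has to send (made up) */
        size_t wsize = 1024 * 1024;   /* largest size the current credits allow (made up) */
        size_t cur_len = min_size(len, wsize);

        /* pin only cur_len bytes of user memory for this request, not wsize */
        printf("pinning %zu bytes instead of up to %zu\n", cur_len, wsize);
        return 0;
}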

Signed-off-by: Long Li 
Cc: sta...@vger.kernel.org
---
 fs/cifs/file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3467351..c23bf9d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2617,11 +2617,13 @@ cifs_write_from_iter(loff_t offset, size_t len, struct 
iov_iter *from,
if (rc)
break;
 
+   cur_len = min_t(const size_t, len, wsize);
+
if (ctx->direct_io) {
ssize_t result;
 
result = iov_iter_get_pages_alloc(
-   from, , wsize, );
+   from, , cur_len, );
if (result < 0) {
cifs_dbg(VFS,
"direct_writev couldn't get user pages "
-- 
2.7.4



[PATCH] CIFS: return correct errors when pinning memory failed for direct I/O

2018-12-16 Thread Long Li
From: Long Li 

When pinning memory fails, we should return the correct error code and
rewind the SMB credits.

Reported-by: Murphy Zhou 
Signed-off-by: Long Li 
Cc: sta...@vger.kernel.org
Cc: Murphy Zhou 
---
 fs/cifs/file.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c9bc56b..3467351 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2630,6 +2630,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct 
iov_iter *from,
result, from->type,
from->iov_offset, from->count);
dump_stack();
+
+   rc = result;
+   add_credits_and_wake_if(server, credits, 0);
break;
}
cur_len = (size_t)result;
@@ -3313,13 +3316,16 @@ cifs_send_async_read(loff_t offset, size_t len, struct 
cifsFileInfo *open_file,
cur_len, );
if (result < 0) {
cifs_dbg(VFS,
-   "couldn't get user pages (cur_len=%zd)"
+   "couldn't get user pages (rc=%zd)"
" iter type %d"
" iov_offset %zd count %zd\n",
result, direct_iov.type,
direct_iov.iov_offset,
direct_iov.count);
dump_stack();
+
+   rc = result;
+   add_credits_and_wake_if(server, credits, 0);
break;
}
cur_len = (size_t)result;
-- 
2.7.4



[PATCH] CIFS: Avoid returning EBUSY to upper layer VFS

2018-12-05 Thread Long Li
From: Long Li 

EBUSY is not handled by the VFS and will be passed to user mode. This is not
correct; we need to keep waiting for more credits instead.

This patch also fixes a bug where rsize or wsize is used uninitialized when
the call to server->ops->wait_mtu_credits() fails.
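
A toy, stand-alone model of the new behaviour (wait_credits_stub() is a
hypothetical stand-in for wait_mtu_credits(), and the credit amounts are made
up): keep waiting until enough credits are granted instead of giving up after
a few tries and returning EBUSY, and never look at wsize when the wait itself
failed.

#include <stdio.h>

static int calls;

/* Stand-in for wait_mtu_credits(): grants a larger wsize on every call. */
static int wait_credits_stub(unsigned int needed, unsigned int *wsize)
{
        (void)needed;
        *wsize = (unsigned int)(++calls) * 64 * 1024;
        return 0;
}

static int wait_until_enough(unsigned int bytes, unsigned int *wsize)
{
        int rc;

        do {
                rc = wait_credits_stub(bytes, wsize);
                if (rc)
                        return rc;  /* on failure, never inspect *wsize (it may be uninitialized) */

                if (*wsize < bytes)
                        printf("only %u of %u bytes worth of credits, waiting...\n",
                               *wsize, bytes);
        } while (*wsize < bytes);           /* no retry cap, so no EBUSY to user space */

        return 0;
}

int main(void)
{
        unsigned int wsize;
        int rc = wait_until_enough(256 * 1024, &wsize);

        printf("rc=%d, wsize=%u\n", rc, wsize);
        return 0;
}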

Reported-by: Dan Carpenter 
Signed-off-by: Long Li 
---
 fs/cifs/file.c | 31 ++-
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 74c33d5..c9bc56b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2541,14 +2541,13 @@ static int
 cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
struct cifs_aio_ctx *ctx)
 {
-   int wait_retry = 0;
unsigned int wsize, credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(wdata->cfile->tlink)->ses->server;
 
/*
-* Try to resend this wdata, waiting for credits up to 3 seconds.
+* Wait for credits to resend this wdata.
 * Note: we are attempting to resend the whole wdata not in segments
 */
do {
@@ -2556,19 +2555,13 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct 
list_head *wdata_list,
server, wdata->bytes, , );
 
if (rc)
-   break;
+   goto out;
 
if (wsize < wdata->bytes) {
add_credits_and_wake_if(server, credits, 0);
msleep(1000);
-   wait_retry++;
}
-   } while (wsize < wdata->bytes && wait_retry < 3);
-
-   if (wsize < wdata->bytes) {
-   rc = -EBUSY;
-   goto out;
-   }
+   } while (wsize < wdata->bytes);
 
rc = -EAGAIN;
while (rc == -EAGAIN) {
@@ -3234,14 +3227,13 @@ static int cifs_resend_rdata(struct cifs_readdata 
*rdata,
struct list_head *rdata_list,
struct cifs_aio_ctx *ctx)
 {
-   int wait_retry = 0;
unsigned int rsize, credits;
int rc;
struct TCP_Server_Info *server =
tlink_tcon(rdata->cfile->tlink)->ses->server;
 
/*
-* Try to resend this rdata, waiting for credits up to 3 seconds.
+* Wait for credits to resend this rdata.
 * Note: we are attempting to resend the whole rdata not in segments
 */
do {
@@ -3249,24 +3241,13 @@ static int cifs_resend_rdata(struct cifs_readdata 
*rdata,
, );
 
if (rc)
-   break;
+   goto out;
 
if (rsize < rdata->bytes) {
add_credits_and_wake_if(server, credits, 0);
msleep(1000);
-   wait_retry++;
}
-   } while (rsize < rdata->bytes && wait_retry < 3);
-
-   /*
-* If we can't find enough credits to send this rdata
-* release the rdata and return failure, this will pass
-* whatever I/O amount we have finished to VFS.
-*/
-   if (rsize < rdata->bytes) {
-   rc = -EBUSY;
-   goto out;
-   }
+   } while (rsize < rdata->bytes);
 
rc = -EAGAIN;
while (rc == -EAGAIN) {
-- 
2.7.4



RE: [Patch v4 1/3] CIFS: Add support for direct I/O read

2018-11-29 Thread Long Li
> Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> 
> Wed, 28 Nov 2018 at 15:43, Long Li :
> >
> > > Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> > >
> > > Hi Long,
> > >
> > > Please find my comments below.
> > >
> > >
> > > Wed, 31 Oct 2018 at 15:14, Long Li :
> > > >
> > > > From: Long Li 
> > > >
> > > > With direct I/O read, we transfer the data directly from transport
> > > > layer to the user data buffer.
> > > >
> > > > Change in v3: add support for kernel AIO
> > > >
> > > > Change in v4:
> > > > Refactor common read code to __cifs_readv for direct and non-direct
> I/O.
> > > > Retry on direct I/O failure.
> > > >
> > > > Signed-off-by: Long Li 
> > > > ---
> > > >  fs/cifs/cifsfs.h   |   1 +
> > > >  fs/cifs/cifsglob.h |   5 ++
> > > >  fs/cifs/file.c | 219
> +++--
> > > 
> > > >  3 files changed, 186 insertions(+), 39 deletions(-)
> > > >
> > > > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > > > 5f02318..7fba9aa 100644
> > > > --- a/fs/cifs/cifsfs.h
> > > > +++ b/fs/cifs/cifsfs.h
> > > > @@ -102,6 +102,7 @@ extern int cifs_open(struct inode *inode,
> > > > struct file *file);  extern int cifs_close(struct inode *inode,
> > > > struct file *file);  extern int cifs_closedir(struct inode *inode,
> > > > struct file *file);  extern ssize_t cifs_user_readv(struct kiocb
> > > > *iocb, struct iov_iter *to);
> > > > +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct
> > > > +iov_iter *to);
> > > >  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct
> > > > iov_iter *to);  extern ssize_t cifs_user_writev(struct kiocb
> > > > *iocb, struct iov_iter *from);  extern ssize_t
> > > > cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
> > > > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index
> > > > 7f62c98..52248dd 100644
> > > > --- a/fs/cifs/cifsglob.h
> > > > +++ b/fs/cifs/cifsglob.h
> > > > @@ -1146,6 +1146,11 @@ struct cifs_aio_ctx {
> > > > unsigned intlen;
> > > > unsigned inttotal_len;
> > > > boolshould_dirty;
> > > > +   /*
> > > > +* Indicates if this aio_ctx is for direct_io,
> > > > +* If yes, iter is a copy of the user passed iov_iter
> > > > +*/
> > > > +   booldirect_io;
> > > >  };
> > > >
> > > >  struct cifs_readdata;
> > > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c index
> > > > 87eece6..daab878
> > > > 100644
> > > > --- a/fs/cifs/file.c
> > > > +++ b/fs/cifs/file.c
> > > > @@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref
> > > *refcount)
> > > > kref_put(>ctx->refcount, cifs_aio_ctx_release);
> > > > for (i = 0; i < rdata->nr_pages; i++) {
> > > > put_page(rdata->pages[i]);
> > > > -   rdata->pages[i] = NULL;
> > > > }
> > > > cifs_readdata_release(refcount);  } @@ -3092,6 +3091,63 @@
> > > > cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
> > > > return uncached_fill_pages(server, rdata, iter,
> > > > iter->count); }
> > > >
> > > > +static int cifs_resend_rdata(struct cifs_readdata *rdata,
> > > > + struct list_head *rdata_list,
> > > > + struct cifs_aio_ctx *ctx) {
> > > > +   int wait_retry = 0;
> > > > +   unsigned int rsize, credits;
> > > > +   int rc;
> > > > +   struct TCP_Server_Info *server =
> > > > +tlink_tcon(rdata->cfile->tlink)->ses->server;
> > > > +
> > > > +   /*
> > > > +* Try to resend this rdata, waiting for credits up to 3 
> > > > seconds.
> > > > +* Note: we are attempting to resend the whole rdata not in
> segments
> > > > +*/
> > > > +   do {
> > > > +   rc = server->ops-&

RE: [Patch v4 2/3] CIFS: Add support for direct I/O write

2018-11-29 Thread Long Li
> Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> 
> Wed, 28 Nov 2018 at 18:20, Long Li :
> >
> > > Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> > >
> > > Wed, 31 Oct 2018 at 15:26, Long Li :
> > > >
> > > > From: Long Li 
> > > >
> > > > With direct I/O write, user supplied buffers are pinned to the
> > > > memory and data are transferred directly from user buffers to the
> transport layer.
> > > >
> > > > Change in v3: add support for kernel AIO
> > > >
> > > > Change in v4:
> > > > Refactor common write code to __cifs_writev for direct and non-direct
> I/O.
> > > > Retry on direct I/O failure.
> > > >
> > > > Signed-off-by: Long Li 
> > > > ---
> > > >  fs/cifs/cifsfs.h |   1 +
> > > >  fs/cifs/file.c   | 194
> +++
> > > 
> > > >  2 files changed, 154 insertions(+), 41 deletions(-)
> > > >
> > > > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > > > 7fba9aa..e9c5103 100644
> > > > --- a/fs/cifs/cifsfs.h
> > > > +++ b/fs/cifs/cifsfs.h
> > > > @@ -105,6 +105,7 @@ extern ssize_t cifs_user_readv(struct kiocb
> > > > *iocb, struct iov_iter *to);  extern ssize_t
> > > > cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
> > > > extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct
> > > > iov_iter *to);  extern ssize_t cifs_user_writev(struct kiocb
> > > > *iocb, struct iov_iter *from);
> > > > +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct
> > > > +iov_iter *from);
> > > >  extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct
> > > > iov_iter *from);  extern int cifs_lock(struct file *, int, struct
> > > > file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t,
> > > > int); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index
> > > > daab878..1a41c04 100644
> > > > --- a/fs/cifs/file.c
> > > > +++ b/fs/cifs/file.c
> > > > @@ -2524,6 +2524,55 @@ wdata_fill_from_iovec(struct cifs_writedata
> > > > *wdata, struct iov_iter *from,  }
> > > >
> > > >  static int
> > > > +cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head
> > > > +*wdata_list, struct cifs_aio_ctx *ctx) {
> > > > +   int wait_retry = 0;
> > > > +   unsigned int wsize, credits;
> > > > +   int rc;
> > > > +   struct TCP_Server_Info *server =
> > > > +tlink_tcon(wdata->cfile->tlink)->ses->server;
> > > > +
> > > > +   /*
> > > > +* Try to resend this wdata, waiting for credits up to 3 
> > > > seconds.
> > > > +* Note: we are attempting to resend the whole wdata not
> > > > + in
> > > segments
> > > > +*/
> > > > +   do {
> > > > +   rc = server->ops->wait_mtu_credits(server,
> > > > + wdata->bytes, , );
> > > > +
> > > > +   if (rc)
> > > > +   break;
> > > > +
> > > > +   if (wsize < wdata->bytes) {
> > > > +   add_credits_and_wake_if(server, credits, 0);
> > > > +   msleep(1000);
> > > > +   wait_retry++;
> > > > +   }
> > > > +   } while (wsize < wdata->bytes && wait_retry < 3);
> > > > +
> > > > +   if (wsize < wdata->bytes) {
> > > > +   rc = -EBUSY;
> > > > +   goto out;
> > > > +   }
> > > > +
> > > > +   rc = -EAGAIN;
> > > > +   while (rc == -EAGAIN)
> > > > +   if (!wdata->cfile->invalidHandle ||
> > > > +   !(rc = cifs_reopen_file(wdata->cfile, false)))
> > > > +   rc = server->ops->async_writev(wdata,
> > > > +
> > > > + cifs_uncached_writedata_release);
> > > > +
> > > > +   if (!rc) {
> > > > +   list_add_tail(>list, wdata_list);
> > > > +   return 0;
> > > > +   }
> > > > +
> > > > +   add_cr

RE: [Patch v4 2/3] CIFS: Add support for direct I/O write

2018-11-28 Thread Long Li
> Subject: Re: [Patch v4 2/3] CIFS: Add support for direct I/O write
> 
> Wed, 31 Oct 2018 at 15:26, Long Li :
> >
> > From: Long Li 
> >
> > With direct I/O write, user supplied buffers are pinned to the memory
> > and data are transferred directly from user buffers to the transport layer.
> >
> > Change in v3: add support for kernel AIO
> >
> > Change in v4:
> > Refactor common write code to __cifs_writev for direct and non-direct I/O.
> > Retry on direct I/O failure.
> >
> > Signed-off-by: Long Li 
> > ---
> >  fs/cifs/cifsfs.h |   1 +
> >  fs/cifs/file.c   | 194 +++
> 
> >  2 files changed, 154 insertions(+), 41 deletions(-)
> >
> > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > 7fba9aa..e9c5103 100644
> > --- a/fs/cifs/cifsfs.h
> > +++ b/fs/cifs/cifsfs.h
> > @@ -105,6 +105,7 @@ extern ssize_t cifs_user_readv(struct kiocb *iocb,
> > struct iov_iter *to);  extern ssize_t cifs_direct_readv(struct kiocb
> > *iocb, struct iov_iter *to);  extern ssize_t cifs_strict_readv(struct
> > kiocb *iocb, struct iov_iter *to);  extern ssize_t
> > cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
> > +extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter
> > +*from);
> >  extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter
> > *from);  extern int cifs_lock(struct file *, int, struct file_lock *);
> > extern int cifs_fsync(struct file *, loff_t, loff_t, int); diff --git
> > a/fs/cifs/file.c b/fs/cifs/file.c index daab878..1a41c04 100644
> > --- a/fs/cifs/file.c
> > +++ b/fs/cifs/file.c
> > @@ -2524,6 +2524,55 @@ wdata_fill_from_iovec(struct cifs_writedata
> > *wdata, struct iov_iter *from,  }
> >
> >  static int
> > +cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head
> > +*wdata_list, struct cifs_aio_ctx *ctx) {
> > +   int wait_retry = 0;
> > +   unsigned int wsize, credits;
> > +   int rc;
> > +   struct TCP_Server_Info *server =
> > +tlink_tcon(wdata->cfile->tlink)->ses->server;
> > +
> > +   /*
> > +* Try to resend this wdata, waiting for credits up to 3 seconds.
> > +* Note: we are attempting to resend the whole wdata not in
> segments
> > +*/
> > +   do {
> > +   rc = server->ops->wait_mtu_credits(server,
> > + wdata->bytes, , );
> > +
> > +   if (rc)
> > +   break;
> > +
> > +   if (wsize < wdata->bytes) {
> > +   add_credits_and_wake_if(server, credits, 0);
> > +   msleep(1000);
> > +   wait_retry++;
> > +   }
> > +   } while (wsize < wdata->bytes && wait_retry < 3);
> > +
> > +   if (wsize < wdata->bytes) {
> > +   rc = -EBUSY;
> > +   goto out;
> > +   }
> > +
> > +   rc = -EAGAIN;
> > +   while (rc == -EAGAIN)
> > +   if (!wdata->cfile->invalidHandle ||
> > +   !(rc = cifs_reopen_file(wdata->cfile, false)))
> > +   rc = server->ops->async_writev(wdata,
> > +
> > + cifs_uncached_writedata_release);
> > +
> > +   if (!rc) {
> > +   list_add_tail(>list, wdata_list);
> > +   return 0;
> > +   }
> > +
> > +   add_credits_and_wake_if(server, wdata->credits, 0);
> > +out:
> > +   kref_put(>refcount, cifs_uncached_writedata_release);
> > +
> > +   return rc;
> > +}
> > +
> > +static int
> >  cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
> >  struct cifsFileInfo *open_file,
> >  struct cifs_sb_info *cifs_sb, struct list_head
> > *wdata_list, @@ -2537,6 +2586,8 @@ cifs_write_from_iter(loff_t offset,
> size_t len, struct iov_iter *from,
> > loff_t saved_offset = offset;
> > pid_t pid;
> > struct TCP_Server_Info *server;
> > +   struct page **pagevec;
> > +   size_t start;
> >
> > if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
> > pid = open_file->pid;
> > @@ -2553,38 +2604,74 @@ cifs_write_from_iter(loff_t offset, size_t len,
> struct iov_iter *from,
> > if (rc)
> > break;
> >
> >

RE: [Patch v4 1/3] CIFS: Add support for direct I/O read

2018-11-28 Thread Long Li
> Subject: Re: [Patch v4 1/3] CIFS: Add support for direct I/O read
> 
> Hi Long,
> 
> Please find my comments below.
> 
> 
> Wed, 31 Oct 2018 at 15:14, Long Li :
> >
> > From: Long Li 
> >
> > With direct I/O read, we transfer the data directly from transport
> > layer to the user data buffer.
> >
> > Change in v3: add support for kernel AIO
> >
> > Change in v4:
> > Refactor common read code to __cifs_readv for direct and non-direct I/O.
> > Retry on direct I/O failure.
> >
> > Signed-off-by: Long Li 
> > ---
> >  fs/cifs/cifsfs.h   |   1 +
> >  fs/cifs/cifsglob.h |   5 ++
> >  fs/cifs/file.c | 219 +++--
> 
> >  3 files changed, 186 insertions(+), 39 deletions(-)
> >
> > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > 5f02318..7fba9aa 100644
> > --- a/fs/cifs/cifsfs.h
> > +++ b/fs/cifs/cifsfs.h
> > @@ -102,6 +102,7 @@ extern int cifs_open(struct inode *inode, struct
> > file *file);  extern int cifs_close(struct inode *inode, struct file
> > *file);  extern int cifs_closedir(struct inode *inode, struct file
> > *file);  extern ssize_t cifs_user_readv(struct kiocb *iocb, struct
> > iov_iter *to);
> > +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter
> > +*to);
> >  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter
> > *to);  extern ssize_t cifs_user_writev(struct kiocb *iocb, struct
> > iov_iter *from);  extern ssize_t cifs_strict_writev(struct kiocb
> > *iocb, struct iov_iter *from); diff --git a/fs/cifs/cifsglob.h
> > b/fs/cifs/cifsglob.h index 7f62c98..52248dd 100644
> > --- a/fs/cifs/cifsglob.h
> > +++ b/fs/cifs/cifsglob.h
> > @@ -1146,6 +1146,11 @@ struct cifs_aio_ctx {
> > unsigned intlen;
> > unsigned inttotal_len;
> > boolshould_dirty;
> > +   /*
> > +* Indicates if this aio_ctx is for direct_io,
> > +* If yes, iter is a copy of the user passed iov_iter
> > +*/
> > +   booldirect_io;
> >  };
> >
> >  struct cifs_readdata;
> > diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 87eece6..daab878
> > 100644
> > --- a/fs/cifs/file.c
> > +++ b/fs/cifs/file.c
> > @@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref
> *refcount)
> > kref_put(>ctx->refcount, cifs_aio_ctx_release);
> > for (i = 0; i < rdata->nr_pages; i++) {
> > put_page(rdata->pages[i]);
> > -   rdata->pages[i] = NULL;
> > }
> > cifs_readdata_release(refcount);  } @@ -3092,6 +3091,63 @@
> > cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
> > return uncached_fill_pages(server, rdata, iter, iter->count);
> > }
> >
> > +static int cifs_resend_rdata(struct cifs_readdata *rdata,
> > + struct list_head *rdata_list,
> > + struct cifs_aio_ctx *ctx) {
> > +   int wait_retry = 0;
> > +   unsigned int rsize, credits;
> > +   int rc;
> > +   struct TCP_Server_Info *server =
> > +tlink_tcon(rdata->cfile->tlink)->ses->server;
> > +
> > +   /*
> > +* Try to resend this rdata, waiting for credits up to 3 seconds.
> > +* Note: we are attempting to resend the whole rdata not in segments
> > +*/
> > +   do {
> > +   rc = server->ops->wait_mtu_credits(server, rdata->bytes,
> > +   , );
> > +
> > +   if (rc)
> > +   break;
> > +
> > +   if (rsize < rdata->bytes) {
> > +   add_credits_and_wake_if(server, credits, 0);
> > +   msleep(1000);
> > +   wait_retry++;
> > +   }
> > +   } while (rsize < rdata->bytes && wait_retry < 3);
> > +
> > +   /*
> > +* If we can't find enough credits to send this rdata
> > +* release the rdata and return failure, this will pass
> > +* whatever I/O amount we have finished to VFS.
> > +*/
> > +   if (rsize < rdata->bytes) {
> > +   rc = -EBUSY;
> 
> We don't have enough credits and return EBUSY here...
> 
> > +   goto out;
> > +   }
> > +
> > +   rc = -

RE: [Patch v4] genirq/matrix: Choose CPU for managed IRQs based on how many of them are allocated

2018-11-06 Thread Long Li
> Subject: Re: [Patch v4] genirq/matrix: Choose CPU for managed IRQs based
> on how many of them are allocated
> 
> Long,
> 
> On Tue, 6 Nov 2018, Long Li wrote:
> 
> > From: Long Li 
> >
> > On a large system with multiple devices of the same class (e.g. NVMe
> > disks, using managed IRQs), the kernel tends to concentrate their IRQs
> > on several CPUs.
> 
> Thanks for addressing the comments. Well done.
> 
> I've merged it, but took the liberty to rework the changelog so all the
> background information which we exchanged over the various iterations is
> preserved there.

Thank you!

Long

> 
> Thanks,
> 
>   tglx


[tip:irq/core] genirq/matrix: Improve target CPU selection for managed interrupts.

2018-11-06 Thread tip-bot for Long Li
Commit-ID:  e8da8794a7fd9eef1ec9a07f0d4897c68581c72b
Gitweb: https://git.kernel.org/tip/e8da8794a7fd9eef1ec9a07f0d4897c68581c72b
Author: Long Li 
AuthorDate: Tue, 6 Nov 2018 04:00:00 +
Committer:  Thomas Gleixner 
CommitDate: Tue, 6 Nov 2018 23:20:13 +0100

genirq/matrix: Improve target CPU selection for managed interrupts.

On large systems with multiple devices of the same class (e.g. NVMe disks,
using managed interrupts), the kernel can affinitize these interrupts to a
small subset of CPUs instead of spreading them out evenly.

irq_matrix_alloc_managed() tries to select the CPU in the supplied cpumask
of possible target CPUs which has the lowest number of interrupt vectors
allocated.

This is done by searching the CPU with the highest number of available
vectors. While this is correct for non-managed CPUs it can select the wrong
CPU for managed interrupts. Under certain constellations this results in
affinitizing the managed interrupts of several devices to a single CPU in
a set.

The bookkeeping of available vectors works the following way:

 1) Non-managed interrupts:

available is decremented when the interrupt is actually requested by
the device driver and a vector is assigned. It's incremented when the
interrupt and the vector are freed.

 2) Managed interrupts:

Managed interrupts guarantee vector reservation when the MSI/MSI-X
functionality of a device is enabled, which is achieved by reserving
vectors in the bitmaps of the possible target CPUs. This reservation
decrements the available count on each possible target CPU.

When the interrupt is requested by the device driver then a vector is
allocated from the reserved region. The operation is reversed when the
interrupt is freed by the device driver. Neither of these operations
affect the available count.

The reservation persists up to the point where the MSI/MSI-X
functionality is disabled and only this operation increments the
available count again.

For non-managed interrupts the available count is the correct selection
criterion because the guaranteed reservations need to be taken into
account. Using the allocated counter could lead to a failing allocation in
the following situation (total vector space of 10 assumed):

 CPU0   CPU1
 available: 2  0
 allocated: 5  3   <--- CPU1 is selected, but available space = 0
 managed reserved:  3  7

 while available yields the correct result.

For managed interrupts the available count is not the appropriate
selection criterion because as explained above the available count is not
affected by the actual vector allocation.

The following example illustrates that. Total vector space of 10
assumed. The starting point is:

 CPU0   CPU1
 available: 5  4
 allocated: 2  3
 managed reserved:  3  3

 Allocating vectors for three non-managed interrupts will result in
 affinitizing the first two to CPU0 and the third one to CPU1 because the
 available count is adjusted with each allocation:

  CPU0  CPU1
 available:  5 4<- Select CPU0 for 1st allocation
 --> allocated:  3 3

 available:  4 4<- Select CPU0 for 2nd allocation
 --> allocated:  4 3

 available:  3 4<- Select CPU1 for 3rd allocation
 --> allocated:  4 4

 But the allocation of three managed interrupts starting from the same
 point will affinitize all of them to CPU0 because the available count is
 not affected by the allocation (see above). So the end result is:

  CPU0  CPU1
 available:  5 4
 allocated:  5 3

Introduce a "managed_allocated" field in struct cpumap to track the vector
allocation for managed interrupts separately. Use this information to
select the target CPU when a vector is allocated for a managed interrupt,
which results in more evenly distributed vector assignments. The above
example results in the following allocations:

 CPU0   CPU1
 managed_allocated: 0  0<- Select CPU0 for 1st allocation
 --> allocated: 3  3

 managed_allocated: 1  0<- Select CPU1 for 2nd allocation
 --> allocated: 3  4

 managed_allocated: 1  1<- Select CPU0 for 3rd allocation
 --> allocated: 4  4

The allocation of non-managed interrupts is not affected by this change and
is still evaluating the available count.

The overall distribution of interrupt vectors for both types of interrupts
might still not be perfectly even depending on the number of non-managed
and managed interrupts in a system, but due to the reservation guarantee
for managed interrupts this cannot be avoided.

Expose the new field in debugfs as well.

[ tglx: Clarified the background of the problem in the changelog and
described it independent of NVME ]

Signed-off-by: Long Li 
Signed-off-by:

[tip:irq/core] genirq/matrix: Improve target CPU selection for managed interrupts.

2018-11-06 Thread tip-bot for Long Li
Commit-ID:  e8da8794a7fd9eef1ec9a07f0d4897c68581c72b
Gitweb: https://git.kernel.org/tip/e8da8794a7fd9eef1ec9a07f0d4897c68581c72b
Author: Long Li 
AuthorDate: Tue, 6 Nov 2018 04:00:00 +
Committer:  Thomas Gleixner 
CommitDate: Tue, 6 Nov 2018 23:20:13 +0100

genirq/matrix: Improve target CPU selection for managed interrupts.

On large systems with multiple devices of the same class (e.g. NVMe disks,
using managed interrupts), the kernel can affinitize these interrupts to a
small subset of CPUs instead of spreading them out evenly.

irq_matrix_alloc_managed() tries to select the CPU in the supplied cpumask
of possible target CPUs which has the lowest number of interrupt vectors
allocated.

This is done by searching the CPU with the highest number of available
vectors. While this is correct for non-managed CPUs it can select the wrong
CPU for managed interrupts. Under certain constellations this results in
affinitizing the managed interrupts of several devices to a single CPU in
a set.

The book keeping of available vectors works the following way:

 1) Non-managed interrupts:

available is decremented when the interrupt is actually requested by
the device driver and a vector is assigned. It's incremented when the
interrupt and the vector are freed.

 2) Managed interrupts:

Managed interrupts guarantee vector reservation when the MSI/MSI-X
functionality of a device is enabled, which is achieved by reserving
vectors in the bitmaps of the possible target CPUs. This reservation
decrements the available count on each possible target CPU.

When the interrupt is requested by the device driver then a vector is
allocated from the reserved region. The operation is reversed when the
interrupt is freed by the device driver. Neither of these operations
affect the available count.

The reservation persist up to the point where the MSI/MSI-X
functionality is disabled and only this operation increments the
available count again.

For non-managed interrupts the available count is the correct selection
criterion because the guaranteed reservations need to be taken into
account. Using the allocated counter could lead to a failing allocation in
the following situation (total vector space of 10 assumed):

 CPU0   CPU1
 available: 2  0
 allocated: 5  3   <--- CPU1 is selected, but available space = 0
 managed reserved:  3  7

 while available yields the correct result.

For managed interrupts the available count is not the appropriate
selection criterion because as explained above the available count is not
affected by the actual vector allocation.

The following example illustrates that. Total vector space of 10
assumed. The starting point is:

 CPU0   CPU1
 available: 5  4
 allocated: 2  3
 managed reserved:  3  3

 Allocating vectors for three non-managed interrupts will result in
 affinitizing the first two to CPU0 and the third one to CPU1 because the
 available count is adjusted with each allocation:

  CPU0  CPU1
 available:  5 4<- Select CPU0 for 1st allocation
 --> allocated:  3 3

 available:  4 4<- Select CPU0 for 2nd allocation
 --> allocated:  4 3

 available:  3 4<- Select CPU1 for 3rd allocation
 --> allocated:  4 4

 But the allocation of three managed interrupts starting from the same
 point will affinitize all of them to CPU0 because the available count is
 not affected by the allocation (see above). So the end result is:

  CPU0  CPU1
 available:  5 4
 allocated:  5 3

Introduce a "managed_allocated" field in struct cpumap to track the vector
allocation for managed interrupts separately. Use this information to
select the target CPU when a vector is allocated for a managed interrupt,
which results in more evenly distributed vector assignments. The above
example results in the following allocations:

                        CPU0  CPU1
 managed_allocated:        0     0    <- Select CPU0 for 1st allocation
 --> allocated:            3     3

 managed_allocated:        1     0    <- Select CPU1 for 2nd allocation
 --> allocated:            3     4

 managed_allocated:        1     1    <- Select CPU0 for 3rd allocation
 --> allocated:            4     4

The allocation of non-managed interrupts is not affected by this change and
is still evaluating the available count.

The overall distribution of interrupt vectors for both types of interrupts
might still not be perfectly even depending on the number of non-managed
and managed interrupts in a system, but due to the reservation guarantee
for managed interrupts this cannot be avoided.
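
To make the difference between the two policies concrete, the following is a
minimal userspace sketch, not the kernel code itself: the struct and helpers
are simplified stand-ins for struct cpumap, matrix_find_best_cpu() and
matrix_find_best_cpu_managed(), and ties are broken toward the lower CPU
number, which the real code does not guarantee. Replaying the example above,
the available-based policy keeps picking CPU0 because managed allocations
never touch "available", while the managed_allocated-based policy alternates:

/*
 * Minimal userspace sketch of the two selection policies; the struct and
 * helpers are simplified stand-ins for the kernel's struct cpumap,
 * matrix_find_best_cpu() and matrix_find_best_cpu_managed().
 */
#include <limits.h>
#include <stdio.h>

struct cpu_vectors {
	unsigned int available;		/* free vectors minus reservations */
	unsigned int allocated;		/* vectors actually assigned */
	unsigned int managed_allocated;	/* managed vectors actually assigned */
};

/* Non-managed policy: the CPU with the most available vectors wins. */
static unsigned int pick_by_available(const struct cpu_vectors *cpus, unsigned int n)
{
	unsigned int cpu, best = UINT_MAX, maxavl = 0;

	for (cpu = 0; cpu < n; cpu++) {
		if (cpus[cpu].available > maxavl) {
			best = cpu;
			maxavl = cpus[cpu].available;
		}
	}
	return best;
}

/* Managed policy: the CPU with the fewest managed vectors allocated wins. */
static unsigned int pick_by_managed_allocated(const struct cpu_vectors *cpus, unsigned int n)
{
	unsigned int cpu, best = UINT_MAX, least = UINT_MAX;

	for (cpu = 0; cpu < n; cpu++) {
		if (cpus[cpu].managed_allocated < least) {
			best = cpu;
			least = cpus[cpu].managed_allocated;
		}
	}
	return best;
}

int main(void)
{
	/* Starting point of the example: available 5/4, allocated 2/3. */
	struct cpu_vectors cpus[2] = {
		{ .available = 5, .allocated = 2, .managed_allocated = 0 },
		{ .available = 4, .allocated = 3, .managed_allocated = 0 },
	};
	unsigned int i;

	for (i = 1; i <= 3; i++) {
		unsigned int old = pick_by_available(cpus, 2);
		unsigned int cpu = pick_by_managed_allocated(cpus, 2);

		printf("managed irq %u: available-based policy picks CPU%u, "
		       "managed_allocated-based policy picks CPU%u\n", i, old, cpu);

		/* A managed allocation changes these counters, but not "available". */
		cpus[cpu].allocated++;
		cpus[cpu].managed_allocated++;
	}
	return 0;
}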

Expose the new field in debugfs as well.

[ tglx: Clarified the background of the problem in the changelog and
described it independent of NVME ]

Signed-off-by: Long Li 
Signed-off-by: Thomas Gleixner 

[Patch v4] genirq/matrix: Choose CPU for managed IRQs based on how many of them are allocated

2018-11-05 Thread Long Li
From: Long Li 

On a large system with multiple devices of the same class (e.g. NVMe disks,
using managed IRQs), the kernel tends to concentrate their IRQs on several
CPUs.

The issue is that when NVMe calls irq_matrix_alloc_managed(), the assigned
CPU tends to be the first several CPUs in the cpumask, because they check for
cpumap->available that will not change after managed IRQs are reserved.

For a managed IRQ, it tends to reserve more than one CPU, based on cpumask in
irq_matrix_reserve_managed. But later when actually allocating CPU for this
IRQ, only one CPU is allocated. Because "available" is calculated at the time
managed IRQ is reserved, it tends to indicate a CPU has more IRQs than the actual
number it's assigned.

To get a more even distribution for allocating managed IRQs, we need to keep track
of how many of them are allocated on a given CPU. Introduce "managed_allocated"
in struct cpumap to track those managed IRQs that are allocated on this CPU, and
change the code to use this information for deciding how to allocate CPU for
managed IRQs.

Signed-off-by: Long Li 
---
 kernel/irq/matrix.c | 34 ++
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 6e6d467f3dec..92337703ca9f 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -14,6 +14,7 @@ struct cpumap {
unsigned intavailable;
unsigned intallocated;
unsigned intmanaged;
+   unsigned intmanaged_allocated;
boolinitialized;
boolonline;
unsigned long   alloc_map[IRQ_MATRIX_SIZE];
@@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
return best_cpu;
 }
 
+/* Find the best CPU which has the lowest number of managed IRQs allocated */
+static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
+   const struct cpumask *msk)
+{
+   unsigned int cpu, best_cpu, allocated = UINT_MAX;
+   struct cpumap *cm;
+
+   best_cpu = UINT_MAX;
+
+   for_each_cpu(cpu, msk) {
+   cm = per_cpu_ptr(m->maps, cpu);
+
+   if (!cm->online || cm->managed_allocated > allocated)
+   continue;
+
+   best_cpu = cpu;
+   allocated = cm->managed_allocated;
+   }
+   return best_cpu;
+}
+
 /**
  * irq_matrix_assign_system - Assign system wide entry in the matrix
  * @m: Matrix pointer
@@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
if (cpumask_empty(msk))
return -EINVAL;
 
-   cpu = matrix_find_best_cpu(m, msk);
+   cpu = matrix_find_best_cpu_managed(m, msk);
if (cpu == UINT_MAX)
return -ENOSPC;
 
@@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
return -ENOSPC;
set_bit(bit, cm->alloc_map);
cm->allocated++;
+   cm->managed_allocated++;
m->total_allocated++;
*mapped_cpu = cpu;
trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
@@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
 
clear_bit(bit, cm->alloc_map);
cm->allocated--;
+   if (managed)
+   cm->managed_allocated--;
 
if (cm->online)
m->total_allocated--;
@@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
seq_printf(sf, "Total allocated:  %6u\n", m->total_allocated);
seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
   m->system_map);
-   seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+   seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " ");
cpus_read_lock();
for_each_online_cpu(cpu) {
struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
 
-   seq_printf(sf, "%*s %4d  %4u  %4u  %4u  %*pbl\n", ind, " ",
-  cpu, cm->available, cm->managed, cm->allocated,
+   seq_printf(sf, "%*s %4d  %4u  %4u  %4u %4u  %*pbl\n", ind, " ",
+  cpu, cm->available, cm->managed,
+  cm->managed_allocated, cm->allocated,
   m->matrix_bits, cm->alloc_map);
}
cpus_read_unlock();
-- 
2.14.1



[tip:irq/core] genirq/affinity: Spread IRQs to all available NUMA nodes

2018-11-05 Thread tip-bot for Long Li
Commit-ID:  b82592199032bf7c778f861b936287e37ebc9f62
Gitweb: https://git.kernel.org/tip/b82592199032bf7c778f861b936287e37ebc9f62
Author: Long Li 
AuthorDate: Fri, 2 Nov 2018 18:02:48 +
Committer:  Thomas Gleixner 
CommitDate: Mon, 5 Nov 2018 12:16:26 +0100

genirq/affinity: Spread IRQs to all available NUMA nodes

If the number of NUMA nodes exceeds the number of MSI/MSI-X interrupts
which are allocated for a device, the interrupt affinity spreading code
fails to spread them across all nodes.

The reason is, that the spreading code starts from node 0 and continues up
to the number of interrupts requested for allocation. This leaves the nodes
past the last interrupt unused.

This results in interrupt concentration on the first nodes which violates
the assumption of the block layer that all nodes are covered evenly. As a
consequence the NUMA nodes above the number of interrupts are all assigned
to hardware queue 0 and therefore NUMA node 0, which results in bad
performance and has CPU hotplug implications, because queue 0 gets shut
down when the last CPU of node 0 is offlined.

Go over all NUMA nodes and assign them round-robin to all requested
interrupts to solve this.
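
The effect of the round-robin walk can be seen with a minimal userspace
sketch, not the kernel code in the diff below; the node and vector counts are
example values and the affd->pre_vectors offset handled by the real code is
ignored. With 16 nodes and 8 vectors every node is folded into some vector
instead of nodes 8-15 being skipped:

/*
 * Userspace sketch of the round-robin spreading: example values only, and
 * the affd->pre_vectors offset of the real code is ignored here.
 */
#include <stdio.h>

#define NODES 16
#define VECS  8

int main(void)
{
	int node_vec[NODES];	/* which vector each node's CPUs are ORed into */
	int n, curvec = 0;

	for (n = 0; n < NODES; n++) {
		node_vec[n] = curvec;	/* cpumask_or(masks + curvec, ..., node n) */
		if (++curvec == VECS)	/* wrap around instead of stopping early */
			curvec = 0;
	}

	for (n = 0; n < NODES; n++)
		printf("node %2d -> vector %d\n", n, node_vec[n]);
	return 0;
}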

[ tglx: Massaged changelog ]

Signed-off-by: Long Li 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Ming Lei 
Cc: Michael Kelley 
Link: https://lkml.kernel.org/r/20181102180248.13583-1-lon...@linuxonhyperv.com

---
 kernel/irq/affinity.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f4f29b9d90ee..e12cdf637c71 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -117,12 +117,11 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 */
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
-   cpumask_copy(masks + curvec, node_to_cpumask[n]);
-   if (++done == numvecs)
-   break;
+   cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
if (++curvec == last_affv)
curvec = affd->pre_vectors;
}
+   done = numvecs;
goto out;
}
 


RE: [PATCH v3] genirq/matrix: Choose CPU for managed IRQs based on how many of them are allocated

2018-11-03 Thread Long Li
> Subject: Re: [PATCH v3] genirq/matrix: Choose CPU for managed IRQs based
> on how many of them are allocated
> 
> On Sat, 3 Nov 2018, Thomas Gleixner wrote:
> > On Fri, 2 Nov 2018, Long Li wrote:
> > >  /**
> > >   * irq_matrix_assign_system - Assign system wide entry in the matrix
> > >   * @m:   Matrix pointer
> > > @@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix
> *m, const struct cpumask *msk,
> > >   if (cpumask_empty(msk))
> > >   return -EINVAL;
> > >
> > > - cpu = matrix_find_best_cpu(m, msk);
> > > + cpu = matrix_find_best_cpu_managed(m, msk);
> > >   if (cpu == UINT_MAX)
> > >   return -ENOSPC;
> > >
> > > @@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix
> *m, const struct cpumask *msk,
> > >   return -ENOSPC;
> > >   set_bit(bit, cm->alloc_map);
> > >   cm->allocated++;
> > > + cm->managed_allocated++;
> > >   m->total_allocated++;
> > >   *mapped_cpu = cpu;
> > >   trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
> >
> > so far so good. But what exactly decrements managed_allocated ?
> 
> Another thing. If we add that counter, then it would be good to expose it in
> the debugfs files as well.

I will send an update to address those.

Long

> 
> Thanks,
> 
>   tglx


[Patch v2] genirq/affinity: Spread IRQs to all available NUMA nodes

2018-11-02 Thread Long Li
From: Long Li 

On systems with a large number of NUMA nodes, there may be more NUMA nodes than
the number of MSI/MSI-X interrupts that a device requests. The current code
always picks up the NUMA nodes starting from node 0, up to the number of
interrupts requested. This may leave some later NUMA nodes unused.

For example, if the system has 16 NUMA nodes and the device requests 8
interrupts, NUMA nodes 0 to 7 are assigned for those interrupts and NUMA nodes
8 to 15 are unused.

There are several problems with this approach:
1. Later, when those managed IRQs are allocated, they cannot be assigned to
NUMA nodes 8 to 15, which may create an IRQ concentration on NUMA nodes 0 to 7.
2. Some upper layers assume the affinity masks have complete coverage over the
NUMA nodes. For example, the block layer uses the affinity masks to decide how
to map CPU queues to hardware queues; missing NUMA nodes in the masks may result
in an uneven mapping of queues. For the above example of 16 NUMA nodes, CPU
queues on NUMA nodes 0 to 7 are assigned to hardware queues 0 to 7, respectively,
but CPU queues on NUMA nodes 8 to 15 are all assigned to hardware queue 0.

Fix this problem by going over all NUMA nodes and assigning them round-robin to
all IRQs.

Change in v2: Removed extra code for calculating "done". (Michael Kelley)

Signed-off-by: Long Li 
---
 kernel/irq/affinity.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f4f29b9d90ee..e12cdf637c71 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -117,12 +117,11 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 */
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
-   cpumask_copy(masks + curvec, node_to_cpumask[n]);
-   if (++done == numvecs)
-   break;
+   cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
if (++curvec == last_affv)
curvec = affd->pre_vectors;
}
+   done = numvecs;
goto out;
}
 
-- 
2.14.1



RE: [PATCH] genirq/affinity: Spread IRQs to all available NUMA nodes

2018-11-02 Thread Long Li
> Subject: RE: [PATCH] genirq/affinity: Spread IRQs to all available NUMA
> nodes
> 
> From: Long Li   Sent: Thursday, November 1, 2018
> 4:52 PM
> >
> > --- a/kernel/irq/affinity.c
> > +++ b/kernel/irq/affinity.c
> > @@ -117,12 +117,13 @@ static int irq_build_affinity_masks(const struct
> irq_affinity *affd,
> >  */
> > if (numvecs <= nodes) {
> > for_each_node_mask(n, nodemsk) {
> > -   cpumask_copy(masks + curvec,
> node_to_cpumask[n]);
> > -   if (++done == numvecs)
> > -   break;
> > +   cpumask_or(masks + curvec, masks + curvec,
> node_to_cpumask[n]);
> > +   done++;
> > if (++curvec == last_affv)
> > curvec = affd->pre_vectors;
> > }
> 
> When the above for loop is exited, 'done' will always be equal to 'nodes'
> since there are no early exits from the loop.  Hence there's no need to be
> incrementing 'done' in the loop.
> 
> > +   if (done > numvecs)
> > +   done = numvecs;
> 
> And if 'done' would always be equal to 'nodes', there is no need for the test.
> Just always set 'done' to 'numvecs'.

Thanks. I will fix this in v2.

> 
> > goto out;
> > }
> >
> > --
> > 2.14.1


[PATCH v3] genirq/matrix: Choose CPU for managed IRQs based on how many of them are allocated

2018-11-01 Thread Long Li
From: Long Li 

On a large system with multiple devices of the same class (e.g. NVMe disks,
using managed IRQs), the kernel tends to concentrate their IRQs on several
CPUs.

The issue is that when NVMe calls irq_matrix_alloc_managed(), the assigned
CPU tends to be the first several CPUs in the cpumask, because they check for
cpumap->available that will not change after managed IRQs are reserved.

For a managed IRQ, it tends to reserve more than one CPU, based on cpumask in
irq_matrix_reserve_managed. But later when actually allocating CPU for this
IRQ, only one CPU is allocated. Because "available" is calculated at the time
managed IRQ is reserved, it tends to indicate a CPU has more IRQs than the actual
number it's assigned.

To get a more even distribution for allocating managed IRQs, we need to keep track
of how many of them are allocated on a given CPU. Introduce "managed_allocated"
in struct cpumap to track those managed IRQs that are allocated on this CPU, and
change the code to use this information for deciding how to allocate CPU for
managed IRQs.

Signed-off-by: Long Li 
---
 kernel/irq/matrix.c | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 6e6d467f3dec..94dd173f24d6 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -14,6 +14,7 @@ struct cpumap {
unsigned intavailable;
unsigned intallocated;
unsigned intmanaged;
+   unsigned intmanaged_allocated;
boolinitialized;
boolonline;
unsigned long   alloc_map[IRQ_MATRIX_SIZE];
@@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
return best_cpu;
 }
 
+/* Find the best CPU which has the lowest number of managed IRQs allocated */
+static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
+   const struct cpumask *msk)
+{
+   unsigned int cpu, best_cpu, allocated = UINT_MAX;
+   struct cpumap *cm;
+
+   best_cpu = UINT_MAX;
+
+   for_each_cpu(cpu, msk) {
+   cm = per_cpu_ptr(m->maps, cpu);
+
+   if (!cm->online || cm->managed_allocated > allocated)
+   continue;
+
+   best_cpu = cpu;
+   allocated = cm->managed_allocated;
+   }
+   return best_cpu;
+}
+
 /**
  * irq_matrix_assign_system - Assign system wide entry in the matrix
  * @m: Matrix pointer
@@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
if (cpumask_empty(msk))
return -EINVAL;
 
-   cpu = matrix_find_best_cpu(m, msk);
+   cpu = matrix_find_best_cpu_managed(m, msk);
if (cpu == UINT_MAX)
return -ENOSPC;
 
@@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
return -ENOSPC;
set_bit(bit, cm->alloc_map);
cm->allocated++;
+   cm->managed_allocated++;
m->total_allocated++;
*mapped_cpu = cpu;
trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
-- 
2.14.1



[PATCH] genirq/affinity: Spread IRQs to all available NUMA nodes

2018-11-01 Thread Long Li
From: Long Li 

On systems with a large number of NUMA nodes, there may be more NUMA nodes than
the number of MSI/MSI-X interrupts that a device requests. The current code
always picks up the NUMA nodes starting from node 0, up to the number of
interrupts requested. This may leave some later NUMA nodes unused.

For example, if the system has 16 NUMA nodes and the device requests 8
interrupts, NUMA nodes 0 to 7 are assigned for those interrupts and NUMA nodes
8 to 15 are unused.

There are several problems with this approach:
1. Later, when those managed IRQs are allocated, they cannot be assigned to
NUMA nodes 8 to 15, which may create an IRQ concentration on NUMA nodes 0 to 7.
2. Some upper layers assume the affinity masks have complete coverage over the
NUMA nodes. For example, the block layer uses the affinity masks to decide how
to map CPU queues to hardware queues; missing NUMA nodes in the masks may result
in an uneven mapping of queues. For the above example of 16 NUMA nodes, CPU
queues on NUMA nodes 0 to 7 are assigned to hardware queues 0 to 7, respectively,
but CPU queues on NUMA nodes 8 to 15 are all assigned to hardware queue 0.

Fix this problem by going over all NUMA nodes and assigning them round-robin to
all IRQs.

Signed-off-by: Long Li 
---
 kernel/irq/affinity.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f4f29b9d90ee..2d08b560d4b6 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -117,12 +117,13 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 */
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
-   cpumask_copy(masks + curvec, node_to_cpumask[n]);
-   if (++done == numvecs)
-   break;
+   cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
+   done++;
if (++curvec == last_affv)
curvec = affd->pre_vectors;
}
+   if (done > numvecs)
+   done = numvecs;
goto out;
}
 
-- 
2.14.1



RE: [Patch v2] genirq/matrix: Choose CPU for assigning interrupts based on allocated IRQs

2018-11-01 Thread Long Li
> Subject: Re: [Patch v2] genirq/matrix: Choose CPU for assigning interrupts
> based on allocated IRQs
> 
> Long,
> 
> On Thu, 1 Nov 2018, Long Li wrote:
> > On a large system with multiple devices of the same class (e.g. NVMe
> > disks, using managed IRQs), the kernel tends to concentrate their IRQs
> > on several CPUs.
> >
> > The issue is that when NVMe calls irq_matrix_alloc_managed(), the
> > assigned CPU tends to be the first several CPUs in the cpumask,
> > because they check for
> > cpumap->available that will not change after managed IRQs are reserved.
> >
> > In irq_matrix->cpumap, "available" is set when IRQs are allocated
> > earlier in the IRQ allocation process. This value is caculated based
> > on
> 
> calculated
> 
> > 1. how many unmanaged IRQs are allocated on this CPU 2. how many
> > managed IRQs are reserved on this CPU
> >
> > But "available" is not accurate in accouting the real IRQs load on a given 
> > CPU.
> >
> > For a managed IRQ, it tends to reserve more than one CPU, based on
> > cpumask in irq_matrix_reserve_managed. But later when actually
> > allocating CPU for this IRQ, only one CPU is allocated. Because
> > "available" is calculated at the time managed IRQ is reserved, it
> > tends to indicate a CPU has more IRQs than it's actually assigned.
> >
> > When a managed IRQ is assigned to a CPU in irq_matrix_alloc_managed(),
> > it decreases "allocated" based on the actually assignment of this IRQ to 
> > this
> CPU.
> 
> decreases?
> 
> > Unmanaged IRQ also decreases "allocated" after allocating an IRQ on this
> CPU.
> 
> ditto
> 
> > For this reason, checking "allocated" is more accurate than checking
> > "available" for a given CPU, and result in a more evenly distributed
> > IRQ across all CPUs.
> 
> Again, this approach is only correct for managed interrupts. Why?
> 
> Assume that total vector space size  = 10
> 
> CPU 0:
>allocated  =  8
>available  =  1
> 
>i.e. there are 2 managed reserved, but not assigned interrupts
> 
> CPU 1:
>allocated  =  7
>available  =  0
> 
>i.e. there are 3 managed reserved, but not assigned interrupts
> 
> Now allocate a non managed interrupt:
> 
> irq_matrix_alloc()
> 
>   cpu = find_best_cpu() <-- returns CPU1
> 
>   ---> FAIL
> 
> The allocation fails because it cannot allocate from the managed reserved
> space. The managed reserved space is guaranteed even if the vectors are not
> assigned. This is required to make hotplug work and to allow late activation
> without breaking the guarantees.
> 
> Non managed has no guarantees, it's a best effort approach, so it can fail.
> But the fail above is just wrong.
> 
> You really need to treat managed and unmanaged CPU selection differently.

Thank you for the explanation. I will send another patch to do it properly.

Long

> 
> Thanks,
> 
>   tglx
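
The failure mode in the quoted reply can be replayed with its own numbers in a
few lines of standalone C; the struct below is a simplification for
illustration, not kernel code. Selecting by the lower "allocated" count lands
on CPU1, which has no available vector left for a non-managed interrupt, while
selecting by "available" lands on CPU0:

#include <stdio.h>

/* Simplified per-CPU state using the numbers from the quoted example. */
struct cpu_state {
	unsigned int allocated;
	unsigned int available;
};

int main(void)
{
	struct cpu_state cpu[2] = {
		{ .allocated = 8, .available = 1 },	/* CPU0 */
		{ .allocated = 7, .available = 0 },	/* CPU1 */
	};

	/* allocated-based selection: CPU1 has fewer allocated vectors ... */
	unsigned int by_alloc = cpu[1].allocated < cpu[0].allocated ? 1 : 0;
	/* ... available-based selection: CPU0 is the only CPU with room left. */
	unsigned int by_avail = cpu[1].available > cpu[0].available ? 1 : 0;

	printf("allocated-based pick: CPU%u (available = %u -> allocation fails)\n",
	       by_alloc, cpu[by_alloc].available);
	printf("available-based pick: CPU%u (available = %u -> allocation succeeds)\n",
	       by_avail, cpu[by_avail].available);
	return 0;
}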


[Patch v2] genirq/matrix: Choose CPU for assigning interrupts based on allocated IRQs

2018-10-31 Thread Long Li
From: Long Li 

On a large system with multiple devices of the same class (e.g. NVMe disks,
using managed IRQs), the kernel tends to concentrate their IRQs on several
CPUs.

The issue is that when NVMe calls irq_matrix_alloc_managed(), the assigned
CPU tends to be the first several CPUs in the cpumask, because they check for
cpumap->available that will not change after managed IRQs are reserved.

In irq_matrix->cpumap, "available" is set when IRQs are allocated earlier
in the IRQ allocation process. This value is caculated based on
1. how many unmanaged IRQs are allocated on this CPU
2. how many managed IRQs are reserved on this CPU

But "available" is not accurate in accouting the real IRQs load on a given CPU.

For a managed IRQ, it tends to reserve more than one CPU, based on cpumask in
irq_matrix_reserve_managed. But later when actually allocating CPU for this
IRQ, only one CPU is allocated. Because "available" is calculated at the time
managed IRQ is reserved, it tends to indicate a CPU has more IRQs than it's
actually assigned.

When a managed IRQ is assigned to a CPU in irq_matrix_alloc_managed(), it
decreases "allocated" based on the actually assignment of this IRQ to this CPU.
Unmanaged IRQ also decreases "allocated" after allocating an IRQ on this CPU.
For this reason, checking "allocated" is more accurate than checking
"available" for a given CPU, and result in a more evenly distributed IRQ
across all CPUs.

Signed-off-by: Long Li 
Reviewed-by: Michael Kelley 
---
 kernel/irq/matrix.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 6e6d467f3dec..a51689e3e7c0 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -128,7 +128,7 @@ static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm,
 static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
const struct cpumask *msk)
 {
-   unsigned int cpu, best_cpu, maxavl = 0;
+   unsigned int cpu, best_cpu, min_allocated = UINT_MAX;
struct cpumap *cm;
 
best_cpu = UINT_MAX;
@@ -136,11 +136,11 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
for_each_cpu(cpu, msk) {
cm = per_cpu_ptr(m->maps, cpu);
 
-   if (!cm->online || cm->available <= maxavl)
+   if (!cm->online || cm->allocated > min_allocated)
continue;
 
best_cpu = cpu;
-   maxavl = cm->available;
+   min_allocated = cm->allocated;
}
return best_cpu;
 }
-- 
2.14.1



[Patch v4 1/3] CIFS: Add support for direct I/O read

2018-10-31 Thread Long Li
From: Long Li 

With direct I/O read, we transfer the data directly from transport layer to
the user data buffer.

Change in v3: add support for kernel AIO

Change in v4:
Refactor common read code to __cifs_readv for direct and non-direct I/O.
Retry on direct I/O failure.

Signed-off-by: Long Li 
---
 fs/cifs/cifsfs.h   |   1 +
 fs/cifs/cifsglob.h |   5 ++
 fs/cifs/file.c | 219 +++--
 3 files changed, 186 insertions(+), 39 deletions(-)

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 5f02318..7fba9aa 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -102,6 +102,7 @@ extern int cifs_open(struct inode *inode, struct file *file);
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
 extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7f62c98..52248dd 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1146,6 +1146,11 @@ struct cifs_aio_ctx {
unsigned intlen;
unsigned inttotal_len;
boolshould_dirty;
+   /*
+* Indicates if this aio_ctx is for direct_io,
+* If yes, iter is a copy of the user passed iov_iter
+*/
+   booldirect_io;
 };
 
 struct cifs_readdata;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 87eece6..daab878 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref *refcount)
kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < rdata->nr_pages; i++) {
put_page(rdata->pages[i]);
-   rdata->pages[i] = NULL;
}
cifs_readdata_release(refcount);
 }
@@ -3092,6 +3091,63 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
return uncached_fill_pages(server, rdata, iter, iter->count);
 }
 
+static int cifs_resend_rdata(struct cifs_readdata *rdata,
+ struct list_head *rdata_list,
+ struct cifs_aio_ctx *ctx)
+{
+   int wait_retry = 0;
+   unsigned int rsize, credits;
+   int rc;
+   struct TCP_Server_Info *server = tlink_tcon(rdata->cfile->tlink)->ses->server;
+
+   /*
+* Try to resend this rdata, waiting for credits up to 3 seconds.
+* Note: we are attempting to resend the whole rdata not in segments
+*/
+   do {
+   rc = server->ops->wait_mtu_credits(server, rdata->bytes,
+   &rsize, &credits);
+
+   if (rc)
+   break;
+
+   if (rsize < rdata->bytes) {
+   add_credits_and_wake_if(server, credits, 0);
+   msleep(1000);
+   wait_retry++;
+   }
+   } while (rsize < rdata->bytes && wait_retry < 3);
+
+   /*
+* If we can't find enough credits to send this rdata
+* release the rdata and return failure, this will pass
+* whatever I/O amount we have finished to VFS.
+*/
+   if (rsize < rdata->bytes) {
+   rc = -EBUSY;
+   goto out;
+   }
+
+   rc = -EAGAIN;
+   while (rc == -EAGAIN)
+   if (!rdata->cfile->invalidHandle ||
+   !(rc = cifs_reopen_file(rdata->cfile, true)))
+   rc = server->ops->async_readv(rdata);
+
+   if (!rc) {
+   /* Add to aio pending list */
+   list_add_tail(&rdata->list, rdata_list);
+   return 0;
+   }
+
+   add_credits_and_wake_if(server, rdata->credits, 0);
+out:
+   kref_put(&rdata->refcount,
+   cifs_uncached_readdata_release);
+
+   return rc;
+}
+
 static int
 cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
@@ -3103,6 +3159,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
int rc;
pid_t pid;
struct TCP_Server_Info *server;
+   struct page **pagevec;
+   size_t start;
+   struct iov_iter direct_iov = ctx->iter;
 
server = tlink_tcon(open_file->tlink)->ses->server;
 
@@ -3111,6 +3170,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
else
pid = current->tgid;
 
+   if (ctx->direct_io)
+   io
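
The cifs_resend_rdata() helper added above retries the whole request while
waiting for the server to grant enough credits, sleeping one second between
attempts and giving up after three tries. A minimal standalone model of that
bounded wait is sketched below; the credit stub is made up and merely stands
in for server->ops->wait_mtu_credits():

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* Stub: pretend the server grants a little more credit on every call. */
static unsigned int wait_mtu_credits_stub(unsigned int want)
{
	static unsigned int granted = 16 * 1024;

	granted += 16 * 1024;
	return granted < want ? granted : want;
}

static int resend_whole_request(unsigned int bytes)
{
	unsigned int rsize = 0;
	int wait_retry = 0;

	do {
		rsize = wait_mtu_credits_stub(bytes);
		if (rsize < bytes) {
			sleep(1);	/* msleep(1000) in the kernel code */
			wait_retry++;
		}
	} while (rsize < bytes && wait_retry < 3);

	if (rsize < bytes)
		return -EBUSY;	/* caller passes what already completed to VFS */

	printf("resending %u bytes in one shot\n", bytes);
	return 0;
}

int main(void)
{
	return resend_whole_request(64 * 1024) ? 1 : 0;
}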

[Patch v4 2/3] CIFS: Add support for direct I/O write

2018-10-31 Thread Long Li
From: Long Li 

With direct I/O write, user supplied buffers are pinned to the memory and data
are transferred directly from user buffers to the transport layer.

Change in v3: add support for kernel AIO

Change in v4:
Refactor common write code to __cifs_writev for direct and non-direct I/O.
Retry on direct I/O failure.

Signed-off-by: Long Li 
---
 fs/cifs/cifsfs.h |   1 +
 fs/cifs/file.c   | 194 +++
 2 files changed, 154 insertions(+), 41 deletions(-)

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7fba9aa..e9c5103 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -105,6 +105,7 @@ extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, loff_t, loff_t, int);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index daab878..1a41c04 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2524,6 +2524,55 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
 }
 
 static int
+cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx)
+{
+   int wait_retry = 0;
+   unsigned int wsize, credits;
+   int rc;
+   struct TCP_Server_Info *server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+
+   /*
+* Try to resend this wdata, waiting for credits up to 3 seconds.
+* Note: we are attempting to resend the whole wdata not in segments
+*/
+   do {
+   rc = server->ops->wait_mtu_credits(server, wdata->bytes, &wsize, &credits);
+
+   if (rc)
+   break;
+
+   if (wsize < wdata->bytes) {
+   add_credits_and_wake_if(server, credits, 0);
+   msleep(1000);
+   wait_retry++;
+   }
+   } while (wsize < wdata->bytes && wait_retry < 3);
+
+   if (wsize < wdata->bytes) {
+   rc = -EBUSY;
+   goto out;
+   }
+
+   rc = -EAGAIN;
+   while (rc == -EAGAIN)
+   if (!wdata->cfile->invalidHandle ||
+   !(rc = cifs_reopen_file(wdata->cfile, false)))
+   rc = server->ops->async_writev(wdata,
+   cifs_uncached_writedata_release);
+
+   if (!rc) {
+   list_add_tail(&wdata->list, wdata_list);
+   return 0;
+   }
+
+   add_credits_and_wake_if(server, wdata->credits, 0);
+out:
+   kref_put(&wdata->refcount, cifs_uncached_writedata_release);
+
+   return rc;
+}
+
+static int
 cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 struct cifsFileInfo *open_file,
 struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
@@ -2537,6 +2586,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
loff_t saved_offset = offset;
pid_t pid;
struct TCP_Server_Info *server;
+   struct page **pagevec;
+   size_t start;
 
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -2553,38 +2604,74 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
if (rc)
break;
 
-   nr_pages = get_numpages(wsize, len, &cur_len);
-   wdata = cifs_writedata_alloc(nr_pages,
+   if (ctx->direct_io) {
+   cur_len = iov_iter_get_pages_alloc(
+   from, &pagevec, wsize, &start);
+   if (cur_len < 0) {
+   cifs_dbg(VFS,
+   "direct_writev couldn't get user pages "
+   "(rc=%zd) iter type %d iov_offset %zd 
count"
+   " %zd\n",
+   cur_len, from->type,
+   from->iov_offset, from->count);
+   dump_stack();
+   break;
+   }
+   iov_iter_advance(from, cur_len);
+
+   nr_pages = (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+   wdata = cifs_writedata_direct_alloc(pagevec,
 cifs_uncached_writev_complete);
-

[Patch v4 3/3] CIFS: Add direct I/O functions to file_operations

2018-10-31 Thread Long Li
From: Long Li 

With direct read/write functions implemented, add them to file_operations.

Direct I/O is used under two conditions (a small user-space example follows below):
1. When mounting with "cache=none", CIFS uses direct I/O for all user file
data transfer.
2. When opening a file with O_DIRECT, CIFS uses direct I/O for all data
transfer on this file.
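
As a usage illustration only (the mount point and sizes below are hypothetical),
the O_DIRECT case can be exercised from user space like this; with "cache=none"
the open would take the same path even without the flag:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* /mnt/cifs is a hypothetical CIFS mount point. */
	int fd = open("/mnt/cifs/testfile", O_RDWR | O_CREAT | O_DIRECT, 0644);
	void *buf;
	ssize_t n;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* O_DIRECT transfers generally want aligned buffers and sizes. */
	if (posix_memalign(&buf, 4096, 4096)) {
		close(fd);
		return 1;
	}
	memset(buf, 0, 4096);

	n = pwrite(fd, buf, 4096, 0);
	printf("wrote %zd bytes via the direct I/O path\n", n);

	free(buf);
	close(fd);
	return 0;
}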

Signed-off-by: Long Li 
---
 fs/cifs/cifsfs.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 62f1662..f18091b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1113,9 +1113,8 @@ const struct file_operations cifs_file_strict_ops = {
 };
 
 const struct file_operations cifs_file_direct_ops = {
-   /* BB reevaluate whether they can be done with directio, no cache */
-   .read_iter = cifs_user_readv,
-   .write_iter = cifs_user_writev,
+   .read_iter = cifs_direct_readv,
+   .write_iter = cifs_direct_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -1169,9 +1168,8 @@ const struct file_operations cifs_file_strict_nobrl_ops = 
{
 };
 
 const struct file_operations cifs_file_direct_nobrl_ops = {
-   /* BB reevaluate whether they can be done with directio, no cache */
-   .read_iter = cifs_user_readv,
-   .write_iter = cifs_user_writev,
+   .read_iter = cifs_direct_readv,
+   .write_iter = cifs_direct_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
-- 
2.7.4



RE: [PATCH] Choose CPU based on allocated IRQs

2018-10-30 Thread Long Li
> Subject: Re: [PATCH] Choose CPU based on allocated IRQs
> 
> Long,
> 
> On Tue, 23 Oct 2018, Long Li wrote:
> 
> thanks for this patch.
> 
> A trivial formal thing ahead. The subject line
> 
>[PATCH] Choose CPU based on allocated IRQs
> 
> is lacking a proper subsystem prefix. In most cases you can figure the prefix
> out by running 'git log path/to/file' which in this case will show you that 
> most
> commits touching this file use the prefix 'genirq/matrix:'.
> 
> So the proper subject would be:
> 
>[PATCH] genirq/matrix: Choose CPU based on allocated IRQs
> 
> Subsystem prefixes are important to see where a patch belongs to right from
> the subject. Without that it could belong to any random part of the kernel
> and needs further inspection of the patch itself. This applies to both email
> and to git shortlog listings.

Thank you. I will send v2 to address this.

> 
> > From: Long Li 
> >
> > In irq_matrix, "available" is set when IRQs are allocated earlier in
> > the IRQ assigning process.
> >
> > Later, when IRQs are activated those values are not good indicators of
> > what CPU to choose to assign to this IRQ.
> 
> Can you please explain why you think that available is the wrong indicator
> and which problem you are trying to solve?
> 
> The WHY is really the most important part of a changelog.

The problem I'm seeing is that on a very large system with multiple devices of 
the same class (e.g. NVMe disks using managed IRQs), the devices tend to use 
interrupts on the same few CPUs of the system. Under heavy load those CPUs are 
busy while the other CPUs are mostly idle. The issue is that when NVMe calls 
irq_matrix_alloc_managed(), the assigned CPU is always the first CPU in the 
cpumask, because the selection checks cpumap->available, which does not change 
after the managed IRQs have been reserved in irq_matrix_reserve_managed() 
(called from the 1st stage of IRQ setup in irq_domain_ops->alloc).

> 
> > Change to choose CPU for an IRQ based on how many IRQs are already
> > allocated on this CPU.
> 
> Looking deeper. The initial values are:
> 
> available = alloc_size - (managed + systembits)
> allocated = 0
> 
> There are two distinct functionalities which modify 'available' and 
> 'allocated'
> (omitting the reverse operations for simplicity):
> 
> 1) managed interrupts
> 
>reserve_managed()
>   managed++;
>   available--;
> 
>alloc_managed()
> allocated++;
> 
> 2) regular interrupts
> 
>alloc()
>   allocated++;
>   available--;
> 
> So 'available' can be lower than 'allocated' depending on the number of
> reserved managed interrupts, which have not yet been activated.
> 
> So for all regular interrupts we really want to look at the number of 
> 'available'
> vectors because the reserved managed ones are already accounted there
> and they need to be taken into account.

I think "reserved managed" may not always be accurate. Reserved managed IRQs 
may not always get activated. For an irq_data, when irq_matrix_reserve_managed 
is called, all the CPUs in the cpumask are reserved. Later, only one of them is 
activated via the call to irq_matrix_alloc_managed(). So we end up with a 
number of "reserved managed" that never get used.

> 
> For the spreading of managed interrupts in alloc_managed() that's indeed a
> different story and 'allocated' is more correct. But even that is not 
> completely
> accurate and can lead to the wrong result. The accurate solution would be to
> account the managed _and_ allocated vectors separately and do the
> spreading for managed interrupts based on that.

I think checking for "allocated" is the best approach for picking which CPU to 
assign for a given irq_data, since we really can't rely on "managed" to decide 
how busy this CPU really is. Checking for "allocated" should work for both 
unmanaged and managed IRQs.
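
To make the difference concrete, a small user-space toy (not the kernel
structures; the numbers are invented) comparing the two selection rules for two
CPUs that sit in the same managed affinity mask:

#include <stdio.h>

/* Toy model of the per-CPU counters, not the kernel's struct cpumap. */
struct toy_cpumap {
	unsigned int available;	/* decremented already at reservation time */
	unsigned int allocated;	/* incremented only when a vector is activated */
};

int main(void)
{
	/*
	 * Both CPUs had 8 managed vectors reserved, so 'available' dropped
	 * by the same amount on each.  CPU0 has since had 4 of those
	 * reservations activated, CPU1 none.
	 */
	struct toy_cpumap cpu[2] = {
		{ .available = 200 - 8, .allocated = 4 },
		{ .available = 200 - 8, .allocated = 0 },
	};

	/* Old rule: keep the CPU with the strictly largest 'available';
	 * on a tie the first CPU in the mask wins, i.e. the busier CPU0. */
	int by_avail = (cpu[1].available > cpu[0].available) ? 1 : 0;

	/* New rule: pick the CPU with the fewest activated vectors. */
	int by_alloc = (cpu[1].allocated < cpu[0].allocated) ? 1 : 0;

	printf("pick by available -> CPU%d\n", by_avail);	/* CPU0 */
	printf("pick by allocated -> CPU%d\n", by_alloc);	/* CPU1 */
	return 0;
}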

> 
> Thanks,
> 
>   tglx


[PATCH] Choose CPU based on allocated IRQs

2018-10-22 Thread Long Li
From: Long Li 

In irq_matrix, "available" is set when IRQs are allocated earlier in the IRQ
assigning process.

Later, when IRQs are activated those values are not good indicators of what
CPU to choose to assign to this IRQ.

Change to choose CPU for an IRQ based on how many IRQs are already allocated
on this CPU.

Signed-off-by: Long Li 
---
 kernel/irq/matrix.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 6e6d467f3dec..a51689e3e7c0 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -128,7 +128,7 @@ static unsigned int matrix_alloc_area(struct irq_matrix *m, 
struct cpumap *cm,
 static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
const struct cpumask *msk)
 {
-   unsigned int cpu, best_cpu, maxavl = 0;
+   unsigned int cpu, best_cpu, min_allocated = UINT_MAX;
struct cpumap *cm;
 
best_cpu = UINT_MAX;
@@ -136,11 +136,11 @@ static unsigned int matrix_find_best_cpu(struct 
irq_matrix *m,
for_each_cpu(cpu, msk) {
cm = per_cpu_ptr(m->maps, cpu);
 
-   if (!cm->online || cm->available <= maxavl)
+   if (!cm->online || cm->allocated > min_allocated)
continue;
 
best_cpu = cpu;
-   maxavl = cm->available;
+   min_allocated = cm->allocated;
}
return best_cpu;
 }
-- 
2.14.1



RE: [PATCH V3 (resend) 3/7] CIFS: Add support for direct I/O read

2018-09-24 Thread Long Li
> Subject: Re: [PATCH V3 (resend) 3/7] CIFS: Add support for direct I/O read
> 
> чт, 20 сент. 2018 г. в 14:22, Long Li :
> >
> > From: Long Li 
> >
> > With direct I/O read, we transfer the data directly from transport
> > layer to the user data buffer.
> >
> > Change in v3: add support for kernel AIO
> >
> > Signed-off-by: Long Li 
> > ---
> >  fs/cifs/cifsfs.h   |   1 +
> >  fs/cifs/cifsglob.h |   5 ++
> >  fs/cifs/file.c | 210
> +
> >  3 files changed, 187 insertions(+), 29 deletions(-)
> >
> > diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index
> > f047e87..ed5479c 100644
> > --- a/fs/cifs/cifsfs.h
> > +++ b/fs/cifs/cifsfs.h
> > @@ -101,6 +101,7 @@ extern int cifs_open(struct inode *inode, struct
> > file *file);  extern int cifs_close(struct inode *inode, struct file
> > *file);  extern int cifs_closedir(struct inode *inode, struct file
> > *file);  extern ssize_t cifs_user_readv(struct kiocb *iocb, struct
> > iov_iter *to);
> > +extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter
> > +*to);
> >  extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter
> > *to);  extern ssize_t cifs_user_writev(struct kiocb *iocb, struct
> > iov_iter *from);  extern ssize_t cifs_strict_writev(struct kiocb
> > *iocb, struct iov_iter *from); diff --git a/fs/cifs/cifsglob.h
> > b/fs/cifs/cifsglob.h index 9dcaed0..2131fec 100644
> > --- a/fs/cifs/cifsglob.h
> > +++ b/fs/cifs/cifsglob.h
> > @@ -1172,6 +1172,11 @@ struct cifs_aio_ctx {
> > unsigned intlen;
> > unsigned inttotal_len;
> > boolshould_dirty;
> > +   /*
> > +* Indicates if this aio_ctx is for direct_io,
> > +* If yes, iter is a copy of the user passed iov_iter
> > +*/
> > +   booldirect_io;
> >  };
> >
> >  struct cifs_readdata;
> > diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8d41ca7..6a939fa
> > 100644
> > --- a/fs/cifs/file.c
> > +++ b/fs/cifs/file.c
> > @@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref
> *refcount)
> > kref_put(>ctx->refcount, cifs_aio_ctx_release);
> > for (i = 0; i < rdata->nr_pages; i++) {
> > put_page(rdata->pages[i]);
> > -   rdata->pages[i] = NULL;
> 
> why is this needed?
It is not needed. The readdata is released right after that loop, so there is 
no point in setting pages[i] to NULL; the assignment is simply removed.
> 
> > }
> > cifs_readdata_release(refcount);  } @@ -3004,7 +3003,7 @@
> > cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
> > return remaining ? -EFAULT : 0;  }
> >
> > -static void collect_uncached_read_data(struct cifs_aio_ctx *ctx);
> > +static void collect_uncached_read_data(struct cifs_readdata *rdata,
> > +struct cifs_aio_ctx *ctx);
> >
> >  static void
> >  cifs_uncached_readv_complete(struct work_struct *work) @@ -3013,7
> > +3012,7 @@ cifs_uncached_readv_complete(struct work_struct *work)
> > struct cifs_readdata,
> > work);
> >
> > complete(>done);
> > -   collect_uncached_read_data(rdata->ctx);
> > +   collect_uncached_read_data(rdata, rdata->ctx);
> > /* the below call can possibly free the last ref to aio ctx */
> > kref_put(>refcount, cifs_uncached_readdata_release);  }
> > @@ -3103,6 +3102,9 @@ cifs_send_async_read(loff_t offset, size_t len,
> struct cifsFileInfo *open_file,
> > int rc;
> > pid_t pid;
> > struct TCP_Server_Info *server;
> > +   struct page **pagevec;
> > +   size_t start;
> > +   struct iov_iter direct_iov = ctx->iter;
> >
> > server = tlink_tcon(open_file->tlink)->ses->server;
> >
> > @@ -3111,6 +3113,9 @@ cifs_send_async_read(loff_t offset, size_t len,
> struct cifsFileInfo *open_file,
> > else
> > pid = current->tgid;
> >
> > +   if (ctx->direct_io)
> > +   iov_iter_advance(_iov, offset - ctx->pos);
> > +
> > do {
> > rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
> >, );
> > @@ -3118,20 +3123,56 @@ cifs_send_async_read(loff_t offset, size_t len,
> struct cifsFileInfo *open_file,
> > 

[PATCH V3 (resend) 3/7] CIFS: Add support for direct I/O read

2018-09-20 Thread Long Li
From: Long Li 

With direct I/O read, we transfer the data directly from the transport layer to
the user data buffer.
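
For orientation, a minimal sketch (the helper name is made up; this is not part
of the patch) of the per-request iterator handling the direct read path uses:
copy the context iterator, advance it to the request offset, then pin at most
rsize bytes:

#include <linux/uio.h>
#include <linux/mm.h>

/*
 * Illustration only: mirror the direct-read segmentation in
 * cifs_send_async_read(), assuming ctx_iter describes the whole user
 * buffer starting at file offset ctx_pos.
 */
static ssize_t pin_read_segment(const struct iov_iter *ctx_iter,
				loff_t ctx_pos, loff_t offset, size_t rsize,
				struct page ***pagevec, size_t *start)
{
	struct iov_iter it = *ctx_iter;			/* per-request copy */

	iov_iter_advance(&it, offset - ctx_pos);	/* skip to this segment */
	return iov_iter_get_pages_alloc(&it, pagevec, rsize, start);
}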

Change in v3: add support for kernel AIO

Signed-off-by: Long Li 
---
 fs/cifs/cifsfs.h   |   1 +
 fs/cifs/cifsglob.h |   5 ++
 fs/cifs/file.c | 210 +
 3 files changed, 187 insertions(+), 29 deletions(-)

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f047e87..ed5479c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -101,6 +101,7 @@ extern int cifs_open(struct inode *inode, struct file 
*file);
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
 extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9dcaed0..2131fec 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1172,6 +1172,11 @@ struct cifs_aio_ctx {
unsigned intlen;
unsigned inttotal_len;
boolshould_dirty;
+   /*
+* Indicates if this aio_ctx is for direct_io,
+* If yes, iter is a copy of the user passed iov_iter
+*/
+   booldirect_io;
 };
 
 struct cifs_readdata;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8d41ca7..6a939fa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2965,7 +2965,6 @@ cifs_uncached_readdata_release(struct kref *refcount)
kref_put(>ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < rdata->nr_pages; i++) {
put_page(rdata->pages[i]);
-   rdata->pages[i] = NULL;
}
cifs_readdata_release(refcount);
 }
@@ -3004,7 +3003,7 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct 
iov_iter *iter)
return remaining ? -EFAULT : 0;
 }
 
-static void collect_uncached_read_data(struct cifs_aio_ctx *ctx);
+static void collect_uncached_read_data(struct cifs_readdata *rdata, struct 
cifs_aio_ctx *ctx);
 
 static void
 cifs_uncached_readv_complete(struct work_struct *work)
@@ -3013,7 +3012,7 @@ cifs_uncached_readv_complete(struct work_struct *work)
struct cifs_readdata, work);
 
complete(>done);
-   collect_uncached_read_data(rdata->ctx);
+   collect_uncached_read_data(rdata, rdata->ctx);
/* the below call can possibly free the last ref to aio ctx */
kref_put(>refcount, cifs_uncached_readdata_release);
 }
@@ -3103,6 +3102,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct 
cifsFileInfo *open_file,
int rc;
pid_t pid;
struct TCP_Server_Info *server;
+   struct page **pagevec;
+   size_t start;
+   struct iov_iter direct_iov = ctx->iter;
 
server = tlink_tcon(open_file->tlink)->ses->server;
 
@@ -3111,6 +3113,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct 
cifsFileInfo *open_file,
else
pid = current->tgid;
 
+   if (ctx->direct_io)
+   iov_iter_advance(_iov, offset - ctx->pos);
+
do {
rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
   , );
@@ -3118,20 +3123,56 @@ cifs_send_async_read(loff_t offset, size_t len, struct 
cifsFileInfo *open_file,
break;
 
cur_len = min_t(const size_t, len, rsize);
-   npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
 
-   /* allocate a readdata struct */
-   rdata = cifs_readdata_alloc(npages,
+   if (ctx->direct_io) {
+
+   cur_len = iov_iter_get_pages_alloc(
+   _iov, ,
+   cur_len, );
+   if (cur_len < 0) {
+   cifs_dbg(VFS,
+   "couldn't get user pages (cur_len=%zd)"
+   " iter type %d"
+   " iov_offset %zd count %zd\n",
+   cur_len, direct_iov.type, 
direct_iov.iov_offset,
+   direct_iov.count);
+   dump_stack();
+   break;
+   }
+   iov_iter_advance(_iov, cur_len);
+
+   rdata = cifs_readdata_direct_alloc(
+   pagevec, cifs_uncached_readv_complete);
+

[PATCH V3 (resend) 4/7] CIFS: Add support for direct I/O write

2018-09-20 Thread Long Li
From: Long Li 

With direct I/O write, user-supplied buffers are pinned in memory and data
are transferred directly from the user buffers to the transport layer.
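
For context, a short worked example (the byte counts are hypothetical) of the
page-vector geometry the direct path computes from the pinned range, matching
the nr_pages/tailsz arithmetic in the hunk below:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	/* Hypothetical pinned range: the user buffer starts 1000 bytes into
	 * its first page (start) and covers 10000 bytes (cur_len). */
	unsigned long start = 1000, cur_len = 10000;

	unsigned long nr_pages = (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned long tailsz = nr_pages > 1 ?
		cur_len - (PAGE_SIZE - start) - (nr_pages - 2) * PAGE_SIZE :
		cur_len;

	/* 1000 + 10000 = 11000 bytes spanned -> 3 pages; the last page holds
	 * 10000 - 3096 - 4096 = 2808 bytes. */
	printf("nr_pages=%lu page_offset=%lu tailsz=%lu\n",
	       nr_pages, start, tailsz);
	return 0;
}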

Change in v3: add support for kernel AIO

Signed-off-by: Long Li 
---
 fs/cifs/cifsfs.h |   1 +
 fs/cifs/file.c   | 196 ++-
 2 files changed, 166 insertions(+), 31 deletions(-)

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ed5479c..cc54051 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -104,6 +104,7 @@ extern ssize_t cifs_user_readv(struct kiocb *iocb, struct 
iov_iter *to);
 extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
 extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
 extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, loff_t, loff_t, int);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6a939fa..2a5d209 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2537,6 +2537,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct 
iov_iter *from,
loff_t saved_offset = offset;
pid_t pid;
struct TCP_Server_Info *server;
+   struct page **pagevec;
+   size_t start;
 
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -2553,38 +2555,74 @@ cifs_write_from_iter(loff_t offset, size_t len, struct 
iov_iter *from,
if (rc)
break;
 
-   nr_pages = get_numpages(wsize, len, _len);
-   wdata = cifs_writedata_alloc(nr_pages,
+   if (ctx->direct_io) {
+   cur_len = iov_iter_get_pages_alloc(
+   from, , wsize, );
+   if (cur_len < 0) {
+   cifs_dbg(VFS,
+   "direct_writev couldn't get user pages "
+   "(rc=%zd) iter type %d iov_offset %zd 
count"
+   " %zd\n",
+   cur_len, from->type,
+   from->iov_offset, from->count);
+   dump_stack();
+   break;
+   }
+   iov_iter_advance(from, cur_len);
+
+   nr_pages = (cur_len + start + PAGE_SIZE - 1) / 
PAGE_SIZE;
+
+   wdata = cifs_writedata_direct_alloc(pagevec,
 cifs_uncached_writev_complete);
-   if (!wdata) {
-   rc = -ENOMEM;
-   add_credits_and_wake_if(server, credits, 0);
-   break;
-   }
+   if (!wdata) {
+   rc = -ENOMEM;
+   add_credits_and_wake_if(server, credits, 0);
+   break;
+   }
 
-   rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
-   if (rc) {
-   kfree(wdata);
-   add_credits_and_wake_if(server, credits, 0);
-   break;
-   }
 
-   num_pages = nr_pages;
-   rc = wdata_fill_from_iovec(wdata, from, _len, _pages);
-   if (rc) {
-   for (i = 0; i < nr_pages; i++)
-   put_page(wdata->pages[i]);
-   kfree(wdata);
-   add_credits_and_wake_if(server, credits, 0);
-   break;
-   }
+   wdata->page_offset = start;
+   wdata->tailsz =
+   nr_pages > 1 ?
+   cur_len - (PAGE_SIZE - start) -
+   (nr_pages - 2) * PAGE_SIZE :
+   cur_len;
+   } else {
+   nr_pages = get_numpages(wsize, len, _len);
+   wdata = cifs_writedata_alloc(nr_pages,
+cifs_uncached_writev_complete);
+   if (!wdata) {
+   rc = -ENOMEM;
+   add_credits_and_wake_if(server, credits, 0);
+   break;
+   }
 
-   /*
-* Bring nr_pages down to the number of pages we actually used,
-* and free any pages that we didn't use.
-*/
-   for ( ; nr_pages > num_p

[PATCH V3 (resend) 2/7] CIFS: SMBD: Do not call ib_dereg_mr on invalidated memory registration

2018-09-20 Thread Long Li
From: Long Li 

It is not necessary to deregister a memory registration after it has been
successfully invalidated.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5fdb9a5..5e28236 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -2295,8 +2295,12 @@ static void smbd_mr_recovery_work(struct work_struct 
*work)
int rc;
 
list_for_each_entry(smbdirect_mr, >mr_list, list) {
-   if (smbdirect_mr->state == MR_INVALIDATED ||
-   smbdirect_mr->state == MR_ERROR) {
+   if (smbdirect_mr->state == MR_INVALIDATED)
+   ib_dma_unmap_sg(
+   info->id->device, smbdirect_mr->sgl,
+   smbdirect_mr->sgl_count,
+   smbdirect_mr->dir);
+   else if (smbdirect_mr->state == MR_ERROR) {
 
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2320,25 +2324,21 @@ static void smbd_mr_recovery_work(struct work_struct 
*work)
smbd_disconnect_rdma_connection(info);
continue;
}
+   } else
+   /* This MR is being used, don't recover it */
+   continue;
 
-   if (smbdirect_mr->state == MR_INVALIDATED)
-   ib_dma_unmap_sg(
-   info->id->device, smbdirect_mr->sgl,
-   smbdirect_mr->sgl_count,
-   smbdirect_mr->dir);
-
-   smbdirect_mr->state = MR_READY;
+   smbdirect_mr->state = MR_READY;
 
-   /* smbdirect_mr->state is updated by this function
-* and is read and updated by I/O issuing CPUs trying
-* to get a MR, the call to atomic_inc_return
-* implicates a memory barrier and guarantees this
-* value is updated before waking up any calls to
-* get_mr() from the I/O issuing CPUs
-*/
-   if (atomic_inc_return(>mr_ready_count) == 1)
-   wake_up_interruptible(>wait_mr);
-   }
+   /* smbdirect_mr->state is updated by this function
+* and is read and updated by I/O issuing CPUs trying
+* to get a MR, the call to atomic_inc_return
+* implicates a memory barrier and guarantees this
+* value is updated before waking up any calls to
+* get_mr() from the I/O issuing CPUs
+*/
+   if (atomic_inc_return(>mr_ready_count) == 1)
+   wake_up_interruptible(>wait_mr);
}
 }
 
-- 
2.7.4



[PATCH V3 (resend) 5/7] CIFS: Add direct I/O functions to file_operations

2018-09-20 Thread Long Li
From: Long Li 

With direct read/write functions implemented, add them to file_operations.

Direct I/O is used under two conditions:
1. When mounting with "cache=none", CIFS uses direct I/O for all user file
data transfer.
2. When opening a file with O_DIRECT, CIFS uses direct I/O for all data
transfer on this file.

Signed-off-by: Long Li 
---
 fs/cifs/cifsfs.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 7065426..3ba44f1 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1159,9 +1159,8 @@ const struct file_operations cifs_file_strict_ops = {
 };
 
 const struct file_operations cifs_file_direct_ops = {
-   /* BB reevaluate whether they can be done with directio, no cache */
-   .read_iter = cifs_user_readv,
-   .write_iter = cifs_user_writev,
+   .read_iter = cifs_direct_readv,
+   .write_iter = cifs_direct_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -1215,9 +1214,8 @@ const struct file_operations cifs_file_strict_nobrl_ops = 
{
 };
 
 const struct file_operations cifs_file_direct_nobrl_ops = {
-   /* BB reevaluate whether they can be done with directio, no cache */
-   .read_iter = cifs_user_readv,
-   .write_iter = cifs_user_writev,
+   .read_iter = cifs_direct_readv,
+   .write_iter = cifs_direct_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
-- 
2.7.4


