[PATCH 1/1] scsi: storvsc: Avoid allocating memory for temp cpumasks

2018-05-17 Thread Michael Kelley
Current code allocates 240 Kbytes (in typical configs) for
each synthetic SCSI controller to use as temp cpumask variables.
Recode to avoid needing the temp cpumask variables and remove the
memory allocation.

Signed-off-by: Michael Kelley <mikel...@microsoft.com>
---

This patch is for the 4.18/scsi-queue branch and improves on commit
2217a47de42f, which was applied on April 20, 2018.

---
 drivers/scsi/storvsc_drv.c | 50 +++++++++++++++++++-------------------------------
 1 file changed, 19 insertions(+), 31 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index d3897c4..a667119 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -474,13 +474,6 @@ struct storvsc_device {
 * Mask of CPUs bound to subchannels.
 */
struct cpumask alloced_cpus;
-   /*
-* Pre-allocated struct cpumask for each hardware queue.
-* struct cpumask is used by selecting out-going channels. It is a
-* big structure, default to 1024k bytes when CONFIG_MAXSMP=y.
-* Pre-allocate it to avoid allocation on the kernel stack.
-*/
-   struct cpumask *cpumask_chns;
/* Used for vsc/vsp channel reset process */
struct storvsc_cmd_request init_request;
struct storvsc_cmd_request reset_request;
@@ -885,13 +878,6 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc)
if (stor_device->stor_chns == NULL)
return -ENOMEM;
 
-   stor_device->cpumask_chns = kcalloc(num_possible_cpus(),
-   sizeof(struct cpumask), GFP_KERNEL);
-   if (stor_device->cpumask_chns == NULL) {
-   kfree(stor_device->stor_chns);
-   return -ENOMEM;
-   }
-
stor_device->stor_chns[device->channel->target_cpu] = device->channel;
cpumask_set_cpu(device->channel->target_cpu,
			&stor_device->alloced_cpus);
@@ -1252,7 +1238,6 @@ static int storvsc_dev_remove(struct hv_device *device)
vmbus_close(device->channel);
 
kfree(stor_device->stor_chns);
-   kfree(stor_device->cpumask_chns);
kfree(stor_device);
return 0;
 }
@@ -1262,7 +1247,7 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device,
 {
u16 slot = 0;
u16 hash_qnum;
-	struct cpumask *alloced_mask = &stor_device->cpumask_chns[q_num];
+   const struct cpumask *node_mask;
int num_channels, tgt_cpu;
 
if (stor_device->num_sc == 0)
@@ -1278,10 +1263,13 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device,
 * III. Mapping is persistent.
 */
 
-	cpumask_and(alloced_mask, &stor_device->alloced_cpus,
-   cpumask_of_node(cpu_to_node(q_num)));
+   node_mask = cpumask_of_node(cpu_to_node(q_num));
 
-   num_channels = cpumask_weight(alloced_mask);
+   num_channels = 0;
+	for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) {
+   if (cpumask_test_cpu(tgt_cpu, node_mask))
+   num_channels++;
+   }
if (num_channels == 0)
return stor_device->device->channel;
 
@@ -1289,7 +1277,9 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device,
while (hash_qnum >= num_channels)
hash_qnum -= num_channels;
 
-   for_each_cpu(tgt_cpu, alloced_mask) {
+	for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) {
+   if (!cpumask_test_cpu(tgt_cpu, node_mask))
+   continue;
if (slot == hash_qnum)
break;
slot++;
@@ -1308,7 +1298,7 @@ static int storvsc_do_io(struct hv_device *device,
struct vstor_packet *vstor_packet;
struct vmbus_channel *outgoing_channel, *channel;
int ret = 0;
-   struct cpumask *alloced_mask;
+   const struct cpumask *node_mask;
int tgt_cpu;
 
	vstor_packet = &request->vstor_packet;
@@ -1329,11 +1319,11 @@ static int storvsc_do_io(struct hv_device *device,
 * Ideally, we want to pick a different channel if
 * available on the same NUMA node.
 */
-	alloced_mask = &stor_device->cpumask_chns[q_num];
-	cpumask_and(alloced_mask, &stor_device->alloced_cpus,
-   cpumask_of_node(cpu_to_node(q_num)));
-
-   for_each_cpu_wrap(tgt_cpu, alloced_mask, q_num + 1) {
+   node_mask = cpumask_of_node(cpu_to_node(q_num));
+   for_each_cpu_wrap(tgt_cpu,
+				  &stor_device->alloced_cpus, q_num + 1) {
+   if (!cpumask_test_cpu(tgt_cpu, node_mask))
+   continue;
if (tgt_cpu == q_num)
  

RE: [Patch v2] Storvsc: Select channel based on available percentage of ring buffer to write

2018-04-22 Thread Michael Kelley (EOSG)
> -----Original Message-----
> From: linux-kernel-ow...@vger.kernel.org On Behalf Of Long Li
> Sent: Thursday, April 19, 2018 2:54 PM
> To: KY Srinivasan; Haiyang Zhang; Stephen Hemminger; James E. J. Bottomley;
> Martin K. Petersen; de...@linuxdriverproject.org; linux-s...@vger.kernel.org;
> linux-ker...@vger.kernel.org
> Cc: Long Li
> Subject: [Patch v2] Storvsc: Select channel based on available percentage of
> ring buffer to write
> 
> From: Long Li 
> 
> This is a best effort for estimating on how busy the ring buffer is for
> that channel, based on available buffer to write in percentage. It is still
> possible that at the time of actual ring buffer write, the space may not be
> available due to other processes may be writing at the time.
> 
> Selecting a channel based on how full it is can reduce the possibility that
> a ring buffer write will fail, and avoid the situation a channel is over
> busy.
> 
> Now it's possible that storvsc can use a smaller ring buffer size
> (e.g. 40k bytes) to take advantage of cache locality.
> 
> Changes.
> v2: Pre-allocate struct cpumask on the heap.
> Struct cpumask is a big structure (1k bytes) when CONFIG_NR_CPUS=8192 (default
> value when CONFIG_MAXSMP=y). Don't use kernel stack for it by pre-allocating
> them using kmalloc when channels are first initialized.
> 
> Signed-off-by: Long Li 
> ---
>  drivers/scsi/storvsc_drv.c | 90 
> --
>  1 file changed, 72 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index a2ec0bc9e9fa..2a9fff94dd1a 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -395,6 +395,12 @@ MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)");
> 
>  module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO);
>  MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels");
> +
> +static int ring_avail_percent_lowater = 10;
> +module_param(ring_avail_percent_lowater, int, S_IRUGO);
> +MODULE_PARM_DESC(ring_avail_percent_lowater,
> + "Select a channel if available ring size > this in percent");
> +
>  /*
>   * Timeout in seconds for all devices managed by this driver.
>   */
> @@ -468,6 +474,13 @@ struct storvsc_device {
>* Mask of CPUs bound to subchannels.
>*/
>   struct cpumask alloced_cpus;
> + /*
> +  * Pre-allocated struct cpumask for each hardware queue.
> +  * struct cpumask is used by selecting out-going channels. It is a
> +  * big structure, default to 1024k bytes when CONFIG_MAXSMP=y.

I think you mean "1024 bytes" or "1k bytes" in the above comment.

> +  * Pre-allocate it to avoid allocation on the kernel stack.
> +  */
> + struct cpumask *cpumask_chns;
>   /* Used for vsc/vsp channel reset process */
>   struct storvsc_cmd_request init_request;
>   struct storvsc_cmd_request reset_request;
> @@ -872,6 +885,13 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc)
>   if (stor_device->stor_chns == NULL)
>   return -ENOMEM;
> 
> + stor_device->cpumask_chns = kcalloc(num_possible_cpus(),
> + sizeof(struct cpumask), GFP_KERNEL);

Note that num_possible_cpus() is 240 for a Hyper-V 2016 guest unless
overridden on the kernel boot line, so this is going to allocate 240 Kbytes
for each synthetic SCSI controller.  On an Azure VM, which has two IDE and
two SCSI controllers, that's nearly 1 Mbyte.  It's unfortunate to allocate
that much memory for what is essentially a temporary variable.  Further down
in these comments, I've proposed an alternate implementation (sketched below)
that avoids the need for the temporary variable, and hence avoids the need
for this allocation.
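
For illustration, a minimal sketch of that alternative, using the same
stor_device fields as above (this mirrors the follow-up patch at the top of
this thread rather than being a drop-in replacement):

	/* Iterate alloced_cpus directly and skip CPUs outside the local
	 * NUMA node, instead of building the intersection in a temporary
	 * per-queue cpumask.
	 */
	const struct cpumask *node_mask = cpumask_of_node(cpu_to_node(q_num));
	int tgt_cpu;

	for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, q_num + 1) {
		if (!cpumask_test_cpu(tgt_cpu, node_mask))
			continue;
		/* tgt_cpu is bound to a channel on this NUMA node */
	}
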

> + if (stor_device->cpumask_chns == NULL) {
> + kfree(stor_device->stor_chns);
> + return -ENOMEM;
> + }
> +
>   stor_device->stor_chns[device->channel->target_cpu] = device->channel;
>   cpumask_set_cpu(device->channel->target_cpu,
> 			&stor_device->alloced_cpus);
> @@ -1232,6 +1252,7 @@ static int storvsc_dev_remove(struct hv_device *device)
>   vmbus_close(device->channel);
> 
>   kfree(stor_device->stor_chns);
> + kfree(stor_device->cpumask_chns);
>   kfree(stor_device);
>   return 0;
>  }
> @@ -1241,7 +1262,7 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device,
>  {
>   u16 slot = 0;
>   u16 hash_qnum;
> - struct cpumask alloced_mask;
> +	struct cpumask *alloced_mask = &stor_device->cpumask_chns[q_num];
>   int num_channels, tgt_cpu;
> 
>   if (stor_device->num_sc == 0)
> @@ -1257,10 

RE: [Resend Patch 2/3] Netvsc: Use the vmbus functiton to calculate ring buffer percentage

2018-04-13 Thread Michael Kelley (EOSG)
> -----Original Message-----
> From: linux-kernel-ow...@vger.kernel.org <linux-kernel-ow...@vger.kernel.org> On Behalf Of Long Li
> Sent: Tuesday, March 27, 2018 5:49 PM
> To: KY Srinivasan <k...@microsoft.com>; Haiyang Zhang <haiya...@microsoft.com>;
> Stephen Hemminger <sthem...@microsoft.com>; James E. J. Bottomley <jbottom...@odin.com>;
> Martin K. Petersen <martin.peter...@oracle.com>; de...@linuxdriverproject.org;
> linux-s...@vger.kernel.org; linux-ker...@vger.kernel.org; net...@vger.kernel.org
> Cc: Long Li <lon...@microsoft.com>
> Subject: [Resend Patch 2/3] Netvsc: Use the vmbus functiton to calculate ring
> buffer percentage
> 
> From: Long Li <lon...@microsoft.com>
> 
> In Vmbus, we have defined a function to calculate available ring buffer
> percentage to write.
> 
> Use that function and remove netvsc's private version.
> 
> Signed-off-by: Long Li <lon...@microsoft.com>

Reviewed-by:  Michael Kelley <mikel...@microsoft.com>

> ---
>  drivers/net/hyperv/hyperv_net.h |  1 -
>  drivers/net/hyperv/netvsc.c | 17 +++--
>  drivers/net/hyperv/netvsc_drv.c |  3 ---
>  3 files changed, 3 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index cd538d5a7986..a0199ab13d67 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -189,7 +189,6 @@ struct netvsc_device;
>  struct net_device_context;
> 
>  extern u32 netvsc_ring_bytes;
> -extern struct reciprocal_value netvsc_ring_reciprocal;
> 
>  struct netvsc_device *netvsc_device_add(struct hv_device *device,
>   const struct netvsc_device_info *info);
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index 0265d703eb03..8af0069e4d8c 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -31,7 +31,6 @@
>  #include 
>  #include 
>  #include 
> -#include <linux/reciprocal_div.h>
> 
>  #include 
> 
> @@ -590,17 +589,6 @@ void netvsc_device_remove(struct hv_device *device)
>  #define RING_AVAIL_PERCENT_HIWATER 20
>  #define RING_AVAIL_PERCENT_LOWATER 10
> 
> -/*
> - * Get the percentage of available bytes to write in the ring.
> - * The return value is in range from 0 to 100.
> - */
> -static u32 hv_ringbuf_avail_percent(const struct hv_ring_buffer_info 
> *ring_info)
> -{
> - u32 avail_write = hv_get_bytes_to_write(ring_info);
> -
> - return reciprocal_divide(avail_write  * 100, netvsc_ring_reciprocal);
> -}
> -
>  static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
>u32 index)
>  {
> @@ -649,7 +637,8 @@ static void netvsc_send_tx_complete(struct netvsc_device
> *net_device,
> 		wake_up(&net_device->wait_drain);
> 
>   if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
> -	    (hv_ringbuf_avail_percent(&channel->outbound) > RING_AVAIL_PERCENT_HIWATER ||
> +	    (hv_get_avail_to_write_percent(&channel->outbound) >
> +	     RING_AVAIL_PERCENT_HIWATER ||
>queue_sends < 1)) {
>   netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx));
>   ndev_ctx->eth_stats.wake_queue++;
> @@ -757,7 +746,7 @@ static inline int netvsc_send_pkt(
>   struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
>   u64 req_id;
>   int ret;
> -	u32 ring_avail = hv_ringbuf_avail_percent(&out_channel->outbound);
> +	u32 ring_avail = hv_get_avail_to_write_percent(&out_channel->outbound);
> 
>   nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
>   if (skb)
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index faea0be18924..b0b1c2fd2b7b 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -35,7 +35,6 @@
>  #include 
>  #include 
>  #include 
> -#include <linux/reciprocal_div.h>
> 
>  #include 
>  #include 
> @@ -55,7 +54,6 @@ static unsigned int ring_size __ro_after_init = 128;
>  module_param(ring_size, uint, S_IRUGO);
>  MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
>  unsigned int netvsc_ring_bytes __ro_after_init;
> -struct reciprocal_value netvsc_ring_reciprocal __ro_after_init;
> 
>  static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
>   NETIF_MSG_LINK | NETIF_MSG_IFUP |
> @@ -2186,7 +2184,6 @@ static int __init netvsc_drv_init(void)
>   ring_size);
>   }
>   netvsc_ring_bytes = ring_size * PAGE_SIZE;
> - netvsc_ring_reciprocal = reciprocal_value(netvsc_ring_bytes);
> 
>   ret = vmbus_driver_register(_drv);
>   if (ret)
> --
> 2.14.1



RE: [Resend Patch 3/3] Storvsc: Select channel based on available percentage of ring buffer to write

2018-04-13 Thread Michael Kelley (EOSG)
> -----Original Message-----
> From: linux-kernel-ow...@vger.kernel.org On Behalf Of Long Li
> Sent: Tuesday, March 27, 2018 5:49 PM
> To: KY Srinivasan; Haiyang Zhang; Stephen Hemminger; James E. J. Bottomley;
> Martin K. Petersen; de...@linuxdriverproject.org; linux-s...@vger.kernel.org;
> linux-ker...@vger.kernel.org; net...@vger.kernel.org
> Cc: Long Li
> Subject: [Resend Patch 3/3] Storvsc: Select channel based on available
> percentage of ring buffer to write
> 
> From: Long Li 
> 
> This is a best effort for estimating on how busy the ring buffer is for
> that channel, based on available buffer to write in percentage. It is still
> possible that at the time of actual ring buffer write, the space may not be
> available due to other processes may be writing at the time.
> 
> Selecting a channel based on how full it is can reduce the possibility that
> a ring buffer write will fail, and avoid the situation a channel is over
> busy.
> 
> Now it's possible that storvsc can use a smaller ring buffer size
> (e.g. 40k bytes) to take advantage of cache locality.
> 
> Signed-off-by: Long Li 
> ---
>  drivers/scsi/storvsc_drv.c | 62 
> +-
>  1 file changed, 50 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index a2ec0bc9e9fa..b1a87072b3ab 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -395,6 +395,12 @@ MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)");
> 
>  module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO);
>  MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels");
> +
> +static int ring_avail_percent_lowater = 10;

Reserving 10% of each ring buffer by default seems like more than is needed
in the storvsc driver.  That would be about 4Kbytes for the 40K ring buffer
you suggest, and even more for a ring buffer of 128K.  Each outgoing record
is only about 344 bytes (I'd have to check exactly).  With the new channel
selection algorithm below, the only time we use a channel that is already
below the low water mark is when no channel could be found that is above
the low water mark.   There could be a case of two or more threads deciding
that a channel is above the low water mark at the same time and both
choosing it, but that's likely to be rare.  So it seems like we could set the
default low water mark to 5 percent or even 3 percent, which will let more
of the ring buffer be used, and let a channel be assigned according to the
algorithm, rather than falling through to the default because all channels
appear to be "full".
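
As a rough illustration of what the low-water check reserves (a sketch only,
not driver code; hv_get_avail_to_write_percent() and the channel variables
follow the patch quoted below, and the byte counts assume the ~344-byte
record size mentioned above):

	/* A channel qualifies when its available-to-write percentage is
	 * above the low-water mark.
	 */
	if (hv_get_avail_to_write_percent(&channel->outbound) >
	    ring_avail_percent_lowater) {
		/* With a 40K ring, a 10% low-water mark reserves ~4 Kbytes
		 * (roughly 12 records of ~344 bytes); a 3% mark reserves
		 * ~1.2 Kbytes, still about 3 records of headroom.
		 */
		outgoing_channel = channel;
	}
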

> +module_param(ring_avail_percent_lowater, int, S_IRUGO);
> +MODULE_PARM_DESC(ring_avail_percent_lowater,
> + "Select a channel if available ring size > this in percent");
> +
>  /*
>   * Timeout in seconds for all devices managed by this driver.
>   */
> @@ -1285,9 +1291,9 @@ static int storvsc_do_io(struct hv_device *device,
>  {
>   struct storvsc_device *stor_device;
>   struct vstor_packet *vstor_packet;
> - struct vmbus_channel *outgoing_channel;
> + struct vmbus_channel *outgoing_channel, *channel;
>   int ret = 0;
> - struct cpumask alloced_mask;
> + struct cpumask alloced_mask, other_numa_mask;
>   int tgt_cpu;
> 
> 	vstor_packet = &request->vstor_packet;
> @@ -1301,22 +1307,53 @@ static int storvsc_do_io(struct hv_device *device,
>   /*
>* Select an an appropriate channel to send the request out.
>*/
> -
>   if (stor_device->stor_chns[q_num] != NULL) {
>   outgoing_channel = stor_device->stor_chns[q_num];
> - if (outgoing_channel->target_cpu == smp_processor_id()) {
> + if (outgoing_channel->target_cpu == q_num) {
>   /*
>* Ideally, we want to pick a different channel if
>* available on the same NUMA node.
>*/
> 			cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
> 					cpumask_of_node(cpu_to_node(q_num)));
> -			for_each_cpu_wrap(tgt_cpu, &alloced_mask,
> - outgoing_channel->target_cpu + 1) {
> - if (tgt_cpu != outgoing_channel->target_cpu) {
> - outgoing_channel =
> - stor_device->stor_chns[tgt_cpu];
> - break;
> +
> +			for_each_cpu_wrap(tgt_cpu, &alloced_mask, q_num + 1) {
> + if (tgt_cpu == q_num)
> + continue;
> + channel = 

RE: [PATCH v2] storvsc: Set up correct queue depth values for IDE devices

2018-03-24 Thread Michael Kelley (EOSG)
> -----Original Message-----
> From: linux-kernel-ow...@vger.kernel.org <linux-kernel-ow...@vger.kernel.org> On Behalf Of Long Li
> Sent: Thursday, March 22, 2018 2:47 PM
> To: KY Srinivasan <k...@microsoft.com>; Haiyang Zhang <haiya...@microsoft.com>;
> Stephen Hemminger <sthem...@microsoft.com>; James E. J. Bottomley <jbottom...@odin.com>;
> Martin K. Petersen <martin.peter...@oracle.com>; de...@linuxdriverproject.org;
> linux-s...@vger.kernel.org; linux-ker...@vger.kernel.org
> Cc: Long Li <lon...@microsoft.com>
> Subject: [PATCH v2] storvsc: Set up correct queue depth values for IDE devices
> 
> From: Long Li <lon...@microsoft.com>
> 
> Unlike SCSI and FC, we don't use multiple channels for IDE.
> Also fix the calculation for sub-channels.
> 
> Change log:
> v2: Addressed comment on incorrect number of sub-channels.
> (Michael Kelley <michael.h.kel...@microsoft.com>)
> 
> Signed-off-by: Long Li <lon...@microsoft.com>

Reviewed-by: Michael Kelley <mikel...@microsoft.com>

> ---
>  drivers/scsi/storvsc_drv.c | 7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index 8c51d628b52e..a2ec0bc9e9fa 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -1722,11 +1722,14 @@ static int storvsc_probe(struct hv_device *device,
>   max_targets = STORVSC_MAX_TARGETS;
>   max_channels = STORVSC_MAX_CHANNELS;
>   /*
> -  * On Windows8 and above, we support sub-channels for storage.
> +  * On Windows8 and above, we support sub-channels for storage
> +  * on SCSI and FC controllers.
>* The number of sub-channels offerred is based on the number of
>* VCPUs in the guest.
>*/
> - max_sub_channels = (num_cpus / storvsc_vcpus_per_sub_channel);
> + if (!dev_is_ide)
> + max_sub_channels =
> + (num_cpus - 1) / storvsc_vcpus_per_sub_channel;
>   }
> 
>   scsi_driver.can_queue = (max_outstanding_req_per_channel *
> --
> 2.14.1



RE: [PATCH] storvsc: Set up correct queue depth values for IDE devices

2018-03-16 Thread Michael Kelley (EOSG)
> -----Original Message-----
> From: linux-kernel-ow...@vger.kernel.org On Behalf Of Long Li
> Sent: Thursday, March 15, 2018 4:52 PM
> To: KY Srinivasan; Haiyang Zhang; Stephen Hemminger; James E. J. Bottomley;
> Martin K. Petersen; de...@linuxdriverproject.org; linux-s...@vger.kernel.org;
> linux-ker...@vger.kernel.org
> Cc: Long Li
> Subject: [PATCH] storvsc: Set up correct queue depth values for IDE devices
> 
> From: Long Li 
> 
> Unlike SCSI and FC, we don't use multiple channels for IDE. So set queue depth
> correctly for IDE.
> 
> Also set the correct cmd_per_lun for all devices.
> 
> Signed-off-by: Long Li 
> ---
>  drivers/scsi/storvsc_drv.c | 8 ++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index 8c51d628b52e..fba170640e9c 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -1722,15 +1722,19 @@ static int storvsc_probe(struct hv_device *device,
>   max_targets = STORVSC_MAX_TARGETS;
>   max_channels = STORVSC_MAX_CHANNELS;
>   /*
> -  * On Windows8 and above, we support sub-channels for storage.
> +  * On Windows8 and above, we support sub-channels for storage
> +  * on SCSI and FC controllers.
>* The number of sub-channels offerred is based on the number of
>* VCPUs in the guest.
>*/
> - max_sub_channels = (num_cpus / storvsc_vcpus_per_sub_channel);
> + if (!dev_is_ide)
> + max_sub_channels =
> + num_cpus / storvsc_vcpus_per_sub_channel;

This calculation of the # of sub-channels doesn't get the right answer (and it
didn't before this patch either).  storvsc_vcpus_per_sub_channel defaults to 4.
If num_cpus is 8, max_sub_channels will be 2, but it should be 1.  The
sub-channel count should not include the main channel since we add 1 to the
sub-channel count below when calculating can_queue.

Furthermore, this calculation is just a guess, in the sense that we're
replicating the algorithm we think Hyper-V is using to determine the number
of sub-channels to offer.   It turns out Hyper-V is changing that algorithm for
particular devices in an upcoming new Azure VM size.  But the only use of
max_sub_channels is in the calculation of can_queue below, so the impact
is minimal.
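
To make the arithmetic concrete (using the defaults discussed above): with
num_cpus = 8 and storvsc_vcpus_per_sub_channel = 4,

    num_cpus / storvsc_vcpus_per_sub_channel        = 8 / 4 = 2
    (num_cpus - 1) / storvsc_vcpus_per_sub_channel  = 7 / 4 = 1

so the (num_cpus - 1) form yields 1 sub-channel, and the "+ 1" in the
can_queue calculation below then accounts for the primary channel.
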

>   }
> 
>   scsi_driver.can_queue = (max_outstanding_req_per_channel *
>(max_sub_channels + 1));
> + scsi_driver.cmd_per_lun = scsi_driver.can_queue;

can_queue is defined as "int", while cmd_per_lun is defined as "short".
The calculated value of can_queue could easily be over 32,767 with
15 sub-channels and max_outstanding_req_per_channel being 3036
for the default 1 Mbyte ring buffer.  So the assignment to cmd_per_lun
could produce truncation and even a negative number.
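
To make the risk concrete with the numbers above: 3036 outstanding requests
per channel times (15 + 1) channels gives can_queue = 48,576, which is well
above the signed short maximum of 32,767, so the assignment to cmd_per_lun
would wrap to a negative value.
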

More broadly, since max_outstanding_req_per_channel is based on
the ring buffer size, these calculations imply that Hyper-V storvsp's
queuing capacity is based on the ring buffer size.  I don't think that's
the case.  From conversations with the storvsp folks, I think Hyper-V
always removes entries from the guest->host ring buffer and then
lets storvsp queue them separately.   So we don't want to be linking
cmd_per_lun (or even can_queue, for that matter) to the ring buffer
size.  The current default ring buffer size of 1 Mbyte is probably 10x
bigger than needed, and we want to be able to adjust that without
ending up with can_queue and cmd_per_lun values that are too
small.

We would probably do better to set can_queue to a constant, and
leave cmd_per_lun at its current value of 2048.   The can_queue
value is already capped at 10240 in the blk-mq layer, so maybe
that's a reasonable constant to use.
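
For illustration, the suggestion would amount to something like the following
in storvsc_probe() (a sketch only, not a tested change; 10240 is the blk-mq
cap mentioned above):

	/* Decouple queue depth from the ring buffer size. */
	scsi_driver.can_queue = 10240;
	/* cmd_per_lun stays at its template value of 2048. */
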

Thoughts?

> 
>   host = scsi_host_alloc(_driver,
>  sizeof(struct hv_host_device));
> --
> 2.14.1



RE: [PATCH 1/1] scsi: storvsc: Spread interrupts when picking a channel for I/O requests

2018-01-31 Thread Michael Kelley (EOSG)
> From: Long Li
> Sent: Wednesday, January 31, 2018 12:23 PM
> To: Michael Kelley (EOSG) <michael.h.kel...@microsoft.com>; KY Srinivasan
> <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> martin.peter...@oracle.com; de...@linuxdriverproject.org; 
> linux-ker...@vger.kernel.org;
> linux-scsi@vger.kernel.org; James E . J . Bottomley <j...@linux.vnet.ibm.com>
> Subject: RE: [PATCH 1/1] scsi: storvsc: Spread interrupts when picking a channel for I/O requests
> 
> > Subject: RE: [PATCH 1/1] scsi: storvsc: Spread interrupts when picking a
> > channel for I/O requests
> >
> > Updated/corrected two email addresses ...
> >
> > > -Original Message-
> > > From: Michael Kelley (EOSG)
> > > Sent: Wednesday, January 24, 2018 2:14 PM
> > > To: KY Srinivasan <k...@microsoft.com>; Stephen Hemminger
> > > <sthem...@microsoft.com>; martin.peter...@oracle.com;
> > > lo...@microsoft.com; jbottom...@odin.com;
> > > de...@linuxdriverproject.org; linux-ker...@vger.kernel.org;
> > > linux-scsi@vger.kernel.org
> > > Cc: Michael Kelley (EOSG) <michael.h.kel...@microsoft.com>
> > > Subject: [PATCH 1/1] scsi: storvsc: Spread interrupts when picking a
> > > channel for I/O requests
> > >
> > > Update the algorithm in storvsc_do_io to look for a channel starting
> > > with the current CPU + 1 and wrap around (within the current NUMA
> > > node). This spreads VMbus interrupts more evenly across CPUs. Previous
> > > code always started with first CPU in the current NUMA node, skewing
> > > the interrupt load to that CPU.
> > >
> > > Signed-off-by: Michael Kelley <mikel...@microsoft.com>
> > > ---
> > >  drivers/scsi/storvsc_drv.c | 3 ++-
> > >  1 file changed, 2 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> > > index e07907d..f3264c4 100644
> > > --- a/drivers/scsi/storvsc_drv.c
> > > +++ b/drivers/scsi/storvsc_drv.c
> > > @@ -1310,7 +1310,8 @@ static int storvsc_do_io(struct hv_device *device,
> > >*/
> > > 			cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
> > > 					cpumask_of_node(cpu_to_node(q_num)));
> > > -			for_each_cpu(tgt_cpu, &alloced_mask) {
> > > +			for_each_cpu_wrap(tgt_cpu, &alloced_mask,
> > > +					  outgoing_channel->target_cpu + 1) {
> 
> Does it work when target_cpu is the last CPU on the system?
> 
> Otherwise, looking good.

Yes, it works.  for_each_cpu_wrap() correctly wraps in the case where
the 3rd parameter ('start') is one past the end of the mask.  Arguably,
we shouldn't rely on that, and should do the wrap to 0 before calling
for_each_cpu_wrap().
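
A minimal sketch of doing the wrap explicitly (illustrative only; the variable
names follow the patch being discussed):

	int start = outgoing_channel->target_cpu + 1;

	/* Don't rely on for_each_cpu_wrap() accepting a start index that is
	 * one past the end of the mask; wrap to 0 ourselves.
	 */
	if (start >= nr_cpu_ids)
		start = 0;
	for_each_cpu_wrap(tgt_cpu, &alloced_mask, start) {
		/* pick a channel as before */
	}
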

> 
> > > 				if (tgt_cpu != outgoing_channel->target_cpu) {
> > > 					outgoing_channel =
> > > 						stor_device->stor_chns[tgt_cpu];
> > > --
> > > 1.8.3.1


[PATCH 1/1] scsi: storvsc: Increase cmd_per_lun for higher speed devices

2018-01-24 Thread Michael Kelley (EOSG)
Increase cmd_per_lun to allow more I/Os in progress per device,
particularly for NVMe's.  The Hyper-V host side can handle the
higher count with no issues.

Signed-off-by: Michael Kelley <mikel...@microsoft.com>
---
 drivers/scsi/storvsc_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index f3264c4..6205107 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1657,7 +1657,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
.eh_timed_out = storvsc_eh_timed_out,
.slave_alloc =  storvsc_device_alloc,
.slave_configure =  storvsc_device_configure,
-   .cmd_per_lun =  255,
+   .cmd_per_lun =  2048,
.this_id =  -1,
.use_clustering =   ENABLE_CLUSTERING,
/* Make sure we dont get a sg segment crosses a page boundary */
-- 
1.8.3.1


RE: [PATCH 1/1] scsi: storvsc: Spread interrupts when picking a channel for I/O requests

2018-01-24 Thread Michael Kelley (EOSG)
Updated/corrected two email addresses ...

> -----Original Message-----
> From: Michael Kelley (EOSG)
> Sent: Wednesday, January 24, 2018 2:14 PM
> To: KY Srinivasan <k...@microsoft.com>; Stephen Hemminger <sthem...@microsoft.com>;
> martin.peter...@oracle.com; lo...@microsoft.com; jbottom...@odin.com;
> de...@linuxdriverproject.org; linux-ker...@vger.kernel.org; linux-scsi@vger.kernel.org
> Cc: Michael Kelley (EOSG) <michael.h.kel...@microsoft.com>
> Subject: [PATCH 1/1] scsi: storvsc: Spread interrupts when picking a channel
> for I/O requests
> 
> Update the algorithm in storvsc_do_io to look for a channel
> starting with the current CPU + 1 and wrap around (within the
> current NUMA node). This spreads VMbus interrupts more evenly
> across CPUs. Previous code always started with first CPU in
> the current NUMA node, skewing the interrupt load to that CPU.
> 
> Signed-off-by: Michael Kelley <mikel...@microsoft.com>
> ---
>  drivers/scsi/storvsc_drv.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index e07907d..f3264c4 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -1310,7 +1310,8 @@ static int storvsc_do_io(struct hv_device *device,
>*/
> 			cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
> 					cpumask_of_node(cpu_to_node(q_num)));
> -			for_each_cpu(tgt_cpu, &alloced_mask) {
> +			for_each_cpu_wrap(tgt_cpu, &alloced_mask,
> + outgoing_channel->target_cpu + 1) {
>   if (tgt_cpu != outgoing_channel->target_cpu) {
>   outgoing_channel =
>   stor_device->stor_chns[tgt_cpu];
> --
> 1.8.3.1


[PATCH 1/1] scsi: storvsc: Spread interrupts when picking a channel for I/O requests

2018-01-24 Thread Michael Kelley (EOSG)
Update the algorithm in storvsc_do_io to look for a channel
starting with the current CPU + 1 and wrap around (within the
current NUMA node). This spreads VMbus interrupts more evenly
across CPUs. Previous code always started with first CPU in
the current NUMA node, skewing the interrupt load to that CPU.

Signed-off-by: Michael Kelley <mikel...@microsoft.com>
---
 drivers/scsi/storvsc_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index e07907d..f3264c4 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1310,7 +1310,8 @@ static int storvsc_do_io(struct hv_device *device,
 */
			cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
					cpumask_of_node(cpu_to_node(q_num)));
-			for_each_cpu(tgt_cpu, &alloced_mask) {
+			for_each_cpu_wrap(tgt_cpu, &alloced_mask,
+   outgoing_channel->target_cpu + 1) {
if (tgt_cpu != outgoing_channel->target_cpu) {
outgoing_channel =
stor_device->stor_chns[tgt_cpu];
-- 
1.8.3.1