Quoting from commit 7527810573436f ("Drivers: hv: vmbus: Introduce
the CHANNELMSG_MODIFYCHANNEL message type"),

  "[...] Hyper-V can *not* currently ACK CHANNELMSG_MODIFYCHANNEL
   messages with the promise that (after the ACK is sent) the
   channel won't send any more interrupts to the "old" CPU.

   The peculiarity of the CHANNELMSG_MODIFYCHANNEL messages is
   problematic if the user want to take a CPU offline, since we
   don't want to take a CPU offline (and, potentially, "lose"
   channel interrupts on such CPU) if the host is still processing
   a CHANNELMSG_MODIFYCHANNEL message associated to that CPU."

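Introduce the CHANNELMSG_MODIFYCHANNEL_RESPONSE(24) message type, which
Hyper-V uses to ACK CHANNELMSG_MODIFYCHANNEL requests starting from VMBus
protocol version 5.3.  When this version is negotiated, the guest waits
for the ACK in vmbus_send_modifychannel() and, when a CPU is taken
offline, checks that no channel interrupt is still pending in the event
flags page of that CPU.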

Signed-off-by: Andrea Parri (Microsoft) <parri.and...@gmail.com>
---
 drivers/hv/channel.c      | 108 ++++++++++++++++++++++++++++++++------
 drivers/hv/channel_mgmt.c |  42 +++++++++++++++
 drivers/hv/connection.c   |   3 +-
 drivers/hv/hv.c           |  52 ++++++++++++++++++
 drivers/hv/hv_trace.h     |  15 ++++++
 drivers/hv/vmbus_drv.c    |   4 +-
 include/linux/hyperv.h    |  13 ++++-
 7 files changed, 218 insertions(+), 19 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index fbdda9938039a..6801d89a20051 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -209,31 +209,107 @@ int vmbus_send_tl_connect_request(const guid_t 
*shv_guest_servie_id,
 }
 EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);
 
+static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 
target_vp)
+{
+       struct vmbus_channel_modifychannel msg;
+       int ret;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+       msg.child_relid = channel->offermsg.child_relid;
+       msg.target_vp = target_vp;
+       /* Post the request without waiting for any response from the host. */
+       ret = vmbus_post_msg(&msg, sizeof(msg), true);
+       trace_vmbus_send_modifychannel(&msg, ret);
+
+       return ret;
+}
+
+/*
+ * Send CHANNELMSG_MODIFYCHANNEL and wait for CHANNELMSG_MODIFYCHANNEL_RESPONSE.
+ * The caller must hold vmbus_connection.channel_mutex; it is dropped while
+ * waiting and re-acquired before returning.
+ * Return: 0 on success, -ENOMEM on allocation failure, -ENODEV if the channel
+ * was rescinded, -EAGAIN on a non-zero host status, or a vmbus_post_msg() error.
+ */
+static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+       struct vmbus_channel_modifychannel *msg;
+       struct vmbus_channel_msginfo *info;
+       unsigned long flags;
+       int ret;
+
+       info = kzalloc(sizeof(*info) + sizeof(*msg), GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+
+       init_completion(&info->waitevent);
+       info->waiting_channel = channel;
+
+       msg = (struct vmbus_channel_modifychannel *)info->msg;
+       msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+       msg->child_relid = channel->offermsg.child_relid;
+       msg->target_vp = target_vp;
+
+       spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+       list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list);
+       spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+       if (channel->rescind) {
+               /* info is already on chn_msg_list: unlink it before freeing. */
+               ret = -ENODEV;
+               goto clean_msglist;
+       }
+
+       ret = vmbus_post_msg(msg, sizeof(*msg), true);
+       trace_vmbus_send_modifychannel(msg, ret);
+       if (ret != 0)
+               goto clean_msglist;
+
+       /*
+        * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on
+        * the mutex and be unable to signal the completion.
+        */
+       mutex_unlock(&vmbus_connection.channel_mutex);
+       wait_for_completion(&info->waitevent);
+       mutex_lock(&vmbus_connection.channel_mutex);
+
+       spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+       list_del(&info->msglistentry);
+       spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+       /* ret is 0 here; report a rescind or a host-side failure, if any. */
+       if (channel->rescind)
+               ret = -ENODEV;
+       else if (info->response.modify_response.status)
+               ret = -EAGAIN;
+       goto free_info;
+
+clean_msglist:
+       spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+       list_del(&info->msglistentry);
+       spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+free_info:
+       kfree(info);
+       return ret;
+}
+
 /*
  * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt.
  *
  * CHANNELMSG_MODIFYCHANNEL messages are aynchronous.  Also, Hyper-V does not
- * ACK such messages.  IOW we can't know when the host will stop interrupting
- * the "old" vCPU and start interrupting the "new" vCPU for the given channel.
+ * ACK such messages before VERSION_WIN10_V5_3.  Without ACK, we can not know
+ * when the host will stop interrupting the "old" vCPU and start interrupting
+ * the "new" vCPU for the given channel.
  *
  * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
  * VERSION_WIN10_V4_1.
  */
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp)
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp)
 {
-       struct vmbus_channel_modifychannel conn_msg;
-       int ret;
-
-       memset(&conn_msg, 0, sizeof(conn_msg));
-       conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
-       conn_msg.child_relid = child_relid;
-       conn_msg.target_vp = target_vp;
-
-       ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true);
-
-       trace_vmbus_send_modifychannel(&conn_msg, ret);
-
-       return ret;
+       if (vmbus_proto_version >= VERSION_WIN10_V5_3)
+               return send_modifychannel_with_ack(channel, target_vp);
+       return send_modifychannel_without_ack(channel, target_vp);
 }
 EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
 
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 1d44bb635bb84..8fcb66d623ba4 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -1200,6 +1200,46 @@ static void vmbus_onopen_result(struct 
vmbus_channel_message_header *hdr)
        spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 }
 
+/*
+ * vmbus_onmodifychannel_response - Modify Channel response handler.
+ *
+ * This is invoked when we receive a response to our channel modify request.
+ * Find the matching request, copy the response and signal the requesting thread.
+ */
+static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header 
*hdr)
+{
+       struct vmbus_channel_modifychannel_response *response;
+       struct vmbus_channel_msginfo *msginfo;
+       unsigned long flags;
+
+       response = (struct vmbus_channel_modifychannel_response *)hdr;
+
+       trace_vmbus_onmodifychannel_response(response);
+
+       /*
+        * Find the pending request for this relid, copy the response and wake the waiter.
+        */
+       spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+       /* A response that matches no pending request is dropped. */
+       list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, 
msglistentry) {
+               struct vmbus_channel_message_header *responseheader =
+                               (struct vmbus_channel_message_header 
*)msginfo->msg;
+
+               if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) {
+                       struct vmbus_channel_modifychannel *modifymsg;
+
+                       modifymsg = (struct vmbus_channel_modifychannel 
*)msginfo->msg;
+                       if (modifymsg->child_relid == response->child_relid) {
+                               memcpy(&msginfo->response.modify_response, 
response,
+                                      sizeof(struct 
vmbus_channel_modifychannel_response));
+                               complete(&msginfo->waitevent);
+                               break;
+                       }
+               }
+       }
+       spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+}
+
 /*
  * vmbus_ongpadl_created - GPADL created handler.
  *
@@ -1366,6 +1406,8 @@ channel_message_table[CHANNELMSG_COUNT] = {
        { CHANNELMSG_TL_CONNECT_REQUEST,        0, NULL, 0},
        { CHANNELMSG_MODIFYCHANNEL,             0, NULL, 0},
        { CHANNELMSG_TL_CONNECT_RESULT,         0, NULL, 0},
+       { CHANNELMSG_MODIFYCHANNEL_RESPONSE,    1, 
vmbus_onmodifychannel_response,
+               sizeof(struct vmbus_channel_modifychannel_response)},
 };
 
 /*
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 11170d9a2e1a5..cdf41c504d914 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -45,6 +45,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
  * Table of VMBus versions listed from newest to oldest.
  */
 static __u32 vmbus_versions[] = {
+       VERSION_WIN10_V5_3,
        VERSION_WIN10_V5_2,
        VERSION_WIN10_V5_1,
        VERSION_WIN10_V5,
@@ -60,7 +61,7 @@ static __u32 vmbus_versions[] = {
  * Maximal VMBus protocol version guests can negotiate.  Useful to cap the
  * VMBus version for testing and debugging purpose.
  */
-static uint max_version = VERSION_WIN10_V5_2;
+static uint max_version = VERSION_WIN10_V5_3;
 
 module_param(max_version, uint, S_IRUGO);
 MODULE_PARM_DESC(max_version,
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 0cde10fe0e71f..35f240f4c833e 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -16,6 +16,7 @@
 #include <linux/version.h>
 #include <linux/random.h>
 #include <linux/clockchips.h>
+#include <linux/delay.h>
 #include <clocksource/hyperv_timer.h>
 #include <asm/mshyperv.h>
 #include "hyperv_vmbus.h"
@@ -237,6 +238,40 @@ void hv_synic_disable_regs(unsigned int cpu)
        hv_set_synic_state(sctrl.as_uint64);
 }
 
+#define HV_MAX_TRIES 3
+/*
+ * Scan the event flags page of 'this' CPU for any set bit other than bit 0 (VMBus
+ * channel protocol messages).  While one is found, sleep for 10-20 ms and rescan, up
+ * to HV_MAX_TRIES times.  Return 'true' if a bit is still set at the last scan.
+ */
+static bool hv_synic_event_pending(void)
+{
+       struct hv_per_cpu_context *hv_cpu = 
this_cpu_ptr(hv_context.cpu_context);
+       union hv_synic_event_flags *event =
+               (union hv_synic_event_flags *)hv_cpu->synic_event_page + 
VMBUS_MESSAGE_SINT;
+       unsigned long *recv_int_page = event->flags;
+       bool pending;
+       u32 relid;
+       int tries = 0;
+
+retry:
+       pending = false;
+       for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
+               /* Special case - VMBus channel protocol messages */
+               if (relid == 0)
+                       continue;
+               if (sync_test_bit(relid, recv_int_page)) {
+                       pending = true;
+                       break;
+               }
+       }
+       if (pending && tries++ < HV_MAX_TRIES) {
+               usleep_range(10000, 20000);
+               goto retry;
+       }
+       return pending;
+}
+
 int hv_synic_cleanup(unsigned int cpu)
 {
        struct vmbus_channel *channel, *sc;
@@ -276,6 +311,23 @@ int hv_synic_cleanup(unsigned int cpu)
        if (channel_found && vmbus_connection.conn_state == CONNECTED)
                return -EBUSY;
 
+       if (vmbus_proto_version >= VERSION_WIN10_V5_3) {
+               /*
+                * channel_found == false means that any channels that were previously
+                * assigned to the CPU have been reassigned elsewhere.  Since we have
+                * received a ModifyChannel ACK from Hyper-V for all such reassignments,
+                * we know that Hyper-V won't set any new bits in the event flags page.
+                * However, there may be existing bits set in this page that have not
+                * been processed by vmbus_chan_sched().  We scan the event flags page
+                * looking for any bits that are set and wait (with a timeout) for
+                * vmbus_chan_sched() to process such bits.  If bits are still set after
+                * this operation (and VMBus is connected), we fail the CPU offlining
+                * operation.
+                */
+               if (hv_synic_event_pending() && vmbus_connection.conn_state == 
CONNECTED)
+                       return -EBUSY;
+       }
+
        hv_stimer_legacy_cleanup(cpu);
 
        hv_synic_disable_regs(cpu);
diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h
index 6063bb21bb137..3e83c24856dbe 100644
--- a/drivers/hv/hv_trace.h
+++ b/drivers/hv/hv_trace.h
@@ -86,6 +86,21 @@ TRACE_EVENT(vmbus_onopen_result,
                    )
        );
 
+TRACE_EVENT(vmbus_onmodifychannel_response,
+           TP_PROTO(const struct vmbus_channel_modifychannel_response 
*response),
+           TP_ARGS(response),
+           TP_STRUCT__entry(
+                   __field(u32, child_relid)
+                   __field(u32, status)
+                   ),
+           TP_fast_assign(__entry->child_relid = response->child_relid;
+                          __entry->status = response->status;
+                   ),
+           TP_printk("child_relid 0x%x, status %d",
+                     __entry->child_relid,  __entry->status
+                   )
+       );
+
 TRACE_EVENT(vmbus_ongpadl_created,
            TP_PROTO(const struct vmbus_channel_gpadl_created *gpadlcreated),
            TP_ARGS(gpadlcreated),
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 4fad3e6745e53..3e1cd5e8f769e 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1767,13 +1767,15 @@ static ssize_t target_cpu_store(struct vmbus_channel 
*channel,
        if (target_cpu == origin_cpu)
                goto cpu_store_unlock;
 
-       if (vmbus_send_modifychannel(channel->offermsg.child_relid,
+       if (vmbus_send_modifychannel(channel,
                                     hv_cpu_number_to_vp_number(target_cpu))) {
                ret = -EIO;
                goto cpu_store_unlock;
        }
 
        /*
+        * For version before VERSION_WIN10_V5_3, the following warning holds:
+        *
         * Warning.  At this point, there is *no* guarantee that the host will
         * have successfully processed the vmbus_send_modifychannel() request.
         * See the header comment of vmbus_send_modifychannel() for more info.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 1ce131f29f3b4..808acf4c3fe61 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -234,6 +234,7 @@ static inline u32 hv_get_avail_to_write_percent(
  * 5 . 0  (Newer Windows 10)
  * 5 . 1  (Windows 10 RS4)
  * 5 . 2  (Windows Server 2019, RS5)
+ * 5 . 3  (Windows Server 2021) // FIXME: use proper version number/name
  */
 
 #define VERSION_WS2008  ((0 << 16) | (13))
@@ -245,6 +246,7 @@ static inline u32 hv_get_avail_to_write_percent(
 #define VERSION_WIN10_V5 ((5 << 16) | (0))
 #define VERSION_WIN10_V5_1 ((5 << 16) | (1))
 #define VERSION_WIN10_V5_2 ((5 << 16) | (2))
+#define VERSION_WIN10_V5_3 ((5 << 16) | (3))
 
 /* Make maximum size of pipe payload of 16K */
 #define MAX_PIPE_DATA_PAYLOAD          (sizeof(u8) * 16384)
@@ -475,6 +477,7 @@ enum vmbus_channel_message_type {
        CHANNELMSG_TL_CONNECT_REQUEST           = 21,
        CHANNELMSG_MODIFYCHANNEL                = 22,
        CHANNELMSG_TL_CONNECT_RESULT            = 23,
+       CHANNELMSG_MODIFYCHANNEL_RESPONSE       = 24,
        CHANNELMSG_COUNT
 };
 
@@ -588,6 +591,13 @@ struct vmbus_channel_open_result {
        u32 status;
 } __packed;
 
+/* Modify Channel Response parameters (CHANNELMSG_MODIFYCHANNEL_RESPONSE) */
+struct vmbus_channel_modifychannel_response {
+       struct vmbus_channel_message_header header;
+       u32 child_relid;        /* matched against the relid of the pending request */
+       u32 status;             /* non-zero is reported to the requester as -EAGAIN */
+} __packed;
+
 /* Close channel parameters; */
 struct vmbus_channel_close_channel {
        struct vmbus_channel_message_header header;
@@ -720,6 +730,7 @@ struct vmbus_channel_msginfo {
                struct vmbus_channel_gpadl_torndown gpadl_torndown;
                struct vmbus_channel_gpadl_created gpadl_created;
                struct vmbus_channel_version_response version_response;
+               struct vmbus_channel_modifychannel_response modify_response;
        } response;
 
        u32 msgsize;
@@ -1562,7 +1573,7 @@ extern __u32 vmbus_proto_version;
 
 int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
                                  const guid_t *shv_host_servie_id);
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp);
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp);
 void vmbus_set_event(struct vmbus_channel *channel);
 
 /* Get the start of the ring buffer. */
-- 
2.25.1

Reply via email to