RE: [PATCH v7 net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-16 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Friday, April 16, 2021 12:40 PM
> To: Dexuan Cui 
> Cc: da...@davemloft.net; k...@kernel.org; KY Srinivasan
> ; Haiyang Zhang ; Stephen
> Hemminger ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; l...@kernel.org;
> and...@lunn.ch; be...@petrovitsch.priv.at; rdun...@infradead.org;
> Shachar Raindel ; linux-kernel@vger.kernel.org;
> linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH v7 net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> On Thu, 15 Apr 2021 23:07:05 -0700
> Dexuan Cui  wrote:
> 
> > diff --git a/drivers/net/hyperv/netvsc_drv.c
> b/drivers/net/hyperv/netvsc_drv.c
> > index 7349a70af083..f682a5572d84 100644
> > --- a/drivers/net/hyperv/netvsc_drv.c
> > +++ b/drivers/net/hyperv/netvsc_drv.c
> > @@ -2297,6 +2297,7 @@ static struct net_device *get_netvsc_byslot(const
> struct net_device *vf_netdev)
> >  {
> > struct device *parent = vf_netdev->dev.parent;
> > struct net_device_context *ndev_ctx;
> > +   struct net_device *ndev;
> > struct pci_dev *pdev;
> > u32 serial;
> >
> > @@ -2319,8 +2320,17 @@ static struct net_device
> *get_netvsc_byslot(const struct net_device *vf_netdev)
> > if (!ndev_ctx->vf_alloc)
> > continue;
> >
> > -   if (ndev_ctx->vf_serial == serial)
> > -   return hv_get_drvdata(ndev_ctx->device_ctx);
> > +   if (ndev_ctx->vf_serial != serial)
> > +   continue;
> > +
> > +   ndev = hv_get_drvdata(ndev_ctx->device_ctx);
> > +   if (ndev->addr_len != vf_netdev->addr_len ||
> > +   memcmp(ndev->perm_addr, vf_netdev->perm_addr,
> > +  ndev->addr_len) != 0)
> > +   continue;
> > +
> > +   return ndev;
> > +
> > }
> >
> > netdev_notice(vf_netdev,
> 
> 
> This probably should be a separate patch.
> I think it is trying to address the case of VF discovery in Hyper-V/Azure 
> where
> the reported
> VF from Hypervisor is bogus or confused.

This is for the Multi vPorts feature of the MANA driver, which allows one VF to 
create multiple vPorts (NICs). They share the same PCI device and the same VF 
serial number, but have different MACs. 

So we put the change in one patch to avoid distro vendors missing this 
change when backporting the MANA driver.

Thanks,
- Haiyang



RE: [PATCH v6 net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-15 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Thursday, April 15, 2021 5:08 PM
> To: Dexuan Cui 
> Cc: da...@davemloft.net; k...@kernel.org; KY Srinivasan
> ; Haiyang Zhang ; Stephen
> Hemminger ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; l...@kernel.org;
> and...@lunn.ch; be...@petrovitsch.priv.at; rdun...@infradead.org;
> Shachar Raindel ; linux-kernel@vger.kernel.org;
> linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH v6 net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> On Wed, 14 Apr 2021 22:45:19 -0700
> Dexuan Cui  wrote:
> 
> > +static int mana_query_vport_cfg(struct mana_port_context *apc, u32
> vport_index,
> > +   u32 *max_sq, u32 *max_rq, u32
> *num_indir_entry) {
> > +   struct mana_query_vport_cfg_resp resp = {};
> > +   struct mana_query_vport_cfg_req req = {};
> > +   int err;
> > +
> > +   mana_gd_init_req_hdr(, MANA_QUERY_VPORT_CONFIG,
> > +sizeof(req), sizeof(resp));
> > +
> > +   req.vport_index = vport_index;
> > +
> > +   err = mana_send_request(apc->ac, , sizeof(req), ,
> > +   sizeof(resp));
> > +   if (err)
> > +   return err;
> > +
> > +   err = mana_verify_resp_hdr(,
> MANA_QUERY_VPORT_CONFIG,
> > +  sizeof(resp));
> > +   if (err)
> > +   return err;
> > +
> > +   if (resp.hdr.status)
> > +   return -EPROTO;
> > +
> > +   *max_sq = resp.max_num_sq;
> > +   *max_rq = resp.max_num_rq;
> > +   *num_indir_entry = resp.num_indirection_ent;
> > +
> > +   apc->port_handle = resp.vport;
> > +   memcpy(apc->mac_addr, resp.mac_addr, ETH_ALEN);
> 
> You could use ether_addr_copy here.
> 
> 
> > +int mana_do_attach(struct net_device *ndev, enum mana_attach_caller
> > +caller) {
> > +   struct mana_port_context *apc = netdev_priv(ndev);
> > +   struct gdma_dev *gd = apc->ac->gdma_dev;
> > +   u32 max_txq, max_rxq, max_queues;
> > +   int port_idx = apc->port_idx;
> > +   u32 num_indirect_entries;
> > +   int err;
> > +
> > +   if (caller == MANA_OPEN)
> > +   goto start_open;
> > +
> > +   err = mana_init_port_context(apc);
> > +   if (err)
> > +   return err;
> > +
> > +   err = mana_query_vport_cfg(apc, port_idx, _txq, _rxq,
> > +  _indirect_entries);
> > +   if (err) {
> > +   netdev_err(ndev, "Failed to query info for vPort 0\n");
> > +   goto reset_apc;
> > +   }
> > +
> > +   max_queues = min_t(u32, max_txq, max_rxq);
> > +   if (apc->max_queues > max_queues)
> > +   apc->max_queues = max_queues;
> > +
> > +   if (apc->num_queues > apc->max_queues)
> > +   apc->num_queues = apc->max_queues;
> > +
> > +   memcpy(ndev->dev_addr, apc->mac_addr, ETH_ALEN);
> 
> And here use ether_addr_copy().

Thanks, I will update these.

- Haiyang


RE: [PATCH v4 net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-12 Thread Haiyang Zhang



> -Original Message-
> From: Andrew Lunn 
> Sent: Monday, April 12, 2021 8:32 AM
> To: Dexuan Cui 
> Cc: da...@davemloft.net; k...@kernel.org; KY Srinivasan
> ; Haiyang Zhang ; Stephen
> Hemminger ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; l...@kernel.org;
> be...@petrovitsch.priv.at; rdun...@infradead.org; Shachar Raindel
> ; linux-kernel@vger.kernel.org; linux-
> hyp...@vger.kernel.org
> Subject: Re: [PATCH v4 net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> > +static void mana_gd_deregiser_irq(struct gdma_queue *queue) {
> > +   struct gdma_dev *gd = queue->gdma_dev;
> > +   struct gdma_irq_context *gic;
> > +   struct gdma_context *gc;
> > +   struct gdma_resource *r;
> > +   unsigned int msix_index;
> > +   unsigned long flags;
> > +
> > +   /* At most num_online_cpus() + 1 interrupts are used. */
> > +   msix_index = queue->eq.msix_index;
> > +   if (WARN_ON(msix_index > num_online_cpus()))
> > +   return;
> 
> Do you handle hot{un}plug of CPUs?
We don't support the CPU hot{un}plug feature yet.

> 
> > +static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue
> *q_self,
> > +   struct gdma_event *event)
> > +{
> > +   struct hw_channel_context *hwc = ctx;
> > +   struct gdma_dev *gd = hwc->gdma_dev;
> > +   union hwc_init_type_data type_data;
> > +   union hwc_init_eq_id_db eq_db;
> > +   u32 type, val;
> > +
> > +   switch (event->type) {
> > +   case GDMA_EQE_HWC_INIT_EQ_ID_DB:
> > +   eq_db.as_uint32 = event->details[0];
> > +   hwc->cq->gdma_eq->id = eq_db.eq_id;
> > +   gd->doorbell = eq_db.doorbell;
> > +   break;
> > +
> > +   case GDMA_EQE_HWC_INIT_DATA:
> > +
> > +   type_data.as_uint32 = event->details[0];
> > +
> > +   case GDMA_EQE_HWC_INIT_DONE:
> > +   complete(>hwc_init_eqe_comp);
> > +   break;
> 
> ...
> 
> > +   default:
> > +   WARN_ON(1);
> > +   break;
> > +   }
> 
> Are these events from the firmware? If you have newer firmware with an
> older driver, are you going to spam the kernel log with WARN_ON dumps?
For a protocol version mismatch, the host and guest will either negotiate the 
highest common version, or fail to probe. So this kind of warning is not 
expected.

> 
> > +static int mana_move_wq_tail(struct gdma_queue *wq, u32 num_units) {
> > +   u32 used_space_old;
> > +   u32 used_space_new;
> > +
> > +   used_space_old = wq->head - wq->tail;
> > +   used_space_new = wq->head - (wq->tail + num_units);
> > +
> > +   if (used_space_new > used_space_old) {
> > +   WARN_ON(1);
> > +   return -ERANGE;
> > +   }
> 
> You could replace the 1 by the condition. There are a couple of these.
Will do.

Thanks,
- Haiyang


RE: [PATCH v4 net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-12 Thread Haiyang Zhang



> -Original Message-
> From: Andrew Lunn 
> Sent: Monday, April 12, 2021 8:16 AM
> To: Dexuan Cui 
> Cc: da...@davemloft.net; k...@kernel.org; KY Srinivasan
> ; Haiyang Zhang ; Stephen
> Hemminger ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; l...@kernel.org;
> be...@petrovitsch.priv.at; rdun...@infradead.org; Shachar Raindel
> ; linux-kernel@vger.kernel.org; linux-
> hyp...@vger.kernel.org
> Subject: Re: [PATCH v4 net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> > +static inline bool is_gdma_msg(const void *req) {
> > +   struct gdma_req_hdr *hdr = (struct gdma_req_hdr *)req;
> > +
> > +   if (hdr->req.hdr_type == GDMA_STANDARD_HEADER_TYPE &&
> > +   hdr->resp.hdr_type == GDMA_STANDARD_HEADER_TYPE &&
> > +   hdr->req.msg_size >= sizeof(struct gdma_req_hdr) &&
> > +   hdr->resp.msg_size >= sizeof(struct gdma_resp_hdr) &&
> > +   hdr->req.msg_type != 0 && hdr->resp.msg_type != 0)
> > +   return true;
> > +
> > +   return false;
> > +}
> > +
> > +static inline bool is_gdma_msg_len(const u32 req_len, const u32 resp_len,
> > +  const void *req)
> > +{
> > +   struct gdma_req_hdr *hdr = (struct gdma_req_hdr *)req;
> > +
> > +   if (req_len >= sizeof(struct gdma_req_hdr) &&
> > +   resp_len >= sizeof(struct gdma_resp_hdr) &&
> > +   req_len >= hdr->req.msg_size && resp_len >= hdr->resp.msg_size
> &&
> > +   is_gdma_msg(req)) {
> > +   return true;
> > +   }
> > +
> > +   return false;
> > +}
> 
> You missed adding the mana_ prefix here. There might be others.
> 
> > +#define CQE_POLLING_BUFFER 512
> > +struct ana_eq {
> > +   struct gdma_queue *eq;
> > +   struct gdma_comp cqe_poll[CQE_POLLING_BUFFER]; };
> 
> > +static int ana_poll(struct napi_struct *napi, int budget) {
> 
> You also have a few cases of ana_, not mana_. There might be others.

We will rename the remaining ana_ prefixes to mana_.
We will also rename the driver version to the protocol version, per your previous comments.

Thanks,
- Haiyang


RE: [PATCH v3 net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-08 Thread Haiyang Zhang



> -Original Message-
> From: David Miller 
> Sent: Thursday, April 8, 2021 8:41 PM
> To: Dexuan Cui 
> Cc: k...@kernel.org; KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; l...@kernel.org;
> and...@lunn.ch; be...@petrovitsch.priv.at; rdun...@infradead.org; linux-
> ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH v3 net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> From: Dexuan Cui 
> Date: Fri, 9 Apr 2021 00:24:51 +
> 
> >> From: David Miller 
> >> Sent: Thursday, April 8, 2021 4:46 PM
> >> ...
> >> > +struct gdma_msg_hdr {
> >> > +u32 hdr_type;
> >> > +u32 msg_type;
> >> > +u16 msg_version;
> >> > +u16 hwc_msg_id;
> >> > +u32 msg_size;
> >> > +} __packed;
> >> > +
> >> > +struct gdma_dev_id {
> >> > +union {
> >> > +struct {
> >> > +u16 type;
> >> > +u16 instance;
> >> > +};
> >> > +
> >> > +u32 as_uint32;
> >> > +};
> >> > +} __packed;
> >>
> >> Please don't  use __packed unless absolutely necessary.  It generates
> >> suboptimal code (byte at a time
> >> accesses etc.) and for many of these you don't even need it.
> >
> > In the driver code, all the structs/unions marked by __packed are used to
> > talk with the hardware, so I think __packed is necessary here?
> 
> It actually isan't in many cases, check with and without the __packed
> directive
> and see if anything chasnges.
> 
> > Do you think if it's better if we remove all the __packed, and add
> > static_assert(sizeof(struct XXX) == YYY) instead? e.g.
> >
> > @@ -105,7 +105,8 @@ struct gdma_msg_hdr {
> > u16 msg_version;
> > u16 hwc_msg_id;
> > u32 msg_size;
> > -} __packed;
> > +};
> > +static_assert(sizeof(struct gdma_msg_hdr) == 16);
> 
> This won't make sure the structure member offsets are what you expect.
> 
> I think you'll have to go through the structures one-by-one by hand to
> figure out which ones really require the __packed attribute and which do not.

For the structs containing variables of the same size, or variables that are 
already size-aligned, we know the __packed attribute has no effect. And for these 
structs, it doesn't cause a performance impact either, correct? 

But in the future, if variables of different sizes are added, the __packed may 
become necessary again. To prevent anyone from accidentally forgetting to add 
__packed when adding new variables to these structs, can we keep the __packed for 
all messages going through the "wire"?

Thanks,
- Haiyang




RE: [PATCH v2 net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-08 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Thursday, April 8, 2021 12:52 PM
> To: Randy Dunlap 
> Cc: Dexuan Cui ; da...@davemloft.net;
> k...@kernel.org; KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; l...@kernel.org;
> and...@lunn.ch; be...@petrovitsch.priv.at; linux-kernel@vger.kernel.org;
> linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH v2 net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> On Thu, 8 Apr 2021 09:22:57 -0700
> Randy Dunlap  wrote:
> 
> > On 4/8/21 2:15 AM, Dexuan Cui wrote:
> > > diff --git a/drivers/net/ethernet/microsoft/Kconfig
> > > b/drivers/net/ethernet/microsoft/Kconfig
> > > new file mode 100644
> > > index ..12ef6b581566
> > > --- /dev/null
> > > +++ b/drivers/net/ethernet/microsoft/Kconfig
> > > @@ -0,0 +1,30 @@
> > > +#
> > > +# Microsoft Azure network device configuration #
> > > +
> > > +config NET_VENDOR_MICROSOFT
> > > + bool "Microsoft Azure Network Device"
> >
> > Seems to me that should be generalized, more like:
> >
> > bool "Microsoft Network Devices"
> 
> Yes, that is what it should be at this level.
> 
> >
> >
> > > + default y
> 
> This follows the existing policy for network vendor level
> 
> > > + help
> > > +   If you have a network (Ethernet) device belonging to this class, say 
> > > Y.
> > > +
> > > +   Note that the answer to this question doesn't directly affect the
> > > +   kernel: saying N will just cause the configurator to skip the
> > > +   question about Microsoft Azure network device. If you say Y, you
> >
> >about Microsoft networking devices.
> >
> > > +   will be asked for your specific device in the following question.
> > > +
> > > +if NET_VENDOR_MICROSOFT
> > > +
> > > +config MICROSOFT_MANA
> > > + tristate "Microsoft Azure Network Adapter (MANA) support"
> > > + default m
> >
> > Please drop the default m. We don't randomly add drivers to be built.
> 
> Yes, it should be no (or no default which is the default for default)
> 
> > Or leave this as is and change NET_VENDOR_MICROSOFT to be default n.
> >
> >
> > > + depends on PCI_MSI && X86_64
> > > + select PCI_HYPERV
> > > + help
> > > +   This driver supports Microsoft Azure Network Adapter (MANA).
> > > +   So far, the driver is only validated on X86_64.
> >
> > validated how?
> 
> Maybe change validated to supported?

Sounds better. We will change it to "supported".
We will also make the other suggested changes.

Thanks,
- Haiyang


RE: [PATCH v2 net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-08 Thread Haiyang Zhang



> -Original Message-
> From: Andrew Lunn 
> Sent: Thursday, April 8, 2021 12:45 PM
> To: Haiyang Zhang 
> Cc: Randy Dunlap ; Dexuan Cui
> ; da...@davemloft.net; k...@kernel.org; KY
> Srinivasan ; Stephen Hemminger
> ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; l...@kernel.org;
> be...@petrovitsch.priv.at; linux-kernel@vger.kernel.org; linux-
> hyp...@vger.kernel.org
> Subject: Re: [PATCH v2 net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> > > > diff --git a/drivers/net/ethernet/microsoft/Kconfig
> > > b/drivers/net/ethernet/microsoft/Kconfig
> > > > new file mode 100644
> > > > index ..12ef6b581566
> > > > --- /dev/null
> > > > +++ b/drivers/net/ethernet/microsoft/Kconfig
> > > > @@ -0,0 +1,30 @@
> > > > +#
> > > > +# Microsoft Azure network device configuration
> > > > +#
> > > > +
> > > > +config NET_VENDOR_MICROSOFT
> > > > +   bool "Microsoft Azure Network Device"
> > >
> > > Seems to me that should be generalized, more like:
> > >
> > >   bool "Microsoft Network Devices"
> > This device is planned for Azure cloud at this time.
> > We will update the wording if things change.
> 
> This section is about the Vendor. Broadcom, Marvell, natsemi, toshiba,
> etc. Microsoft is the Vendor here and all Microsoft Ethernet drivers
> belong here. It does not matter what platform they are for.

Thanks. We will update the wording.

- Haiyang


RE: [PATCH v2 net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-08 Thread Haiyang Zhang


> -Original Message-
> From: Randy Dunlap 
> Sent: Thursday, April 8, 2021 12:23 PM
> To: Dexuan Cui ; da...@davemloft.net;
> k...@kernel.org; KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; l...@kernel.org;
> and...@lunn.ch; be...@petrovitsch.priv.at
> Cc: linux-kernel@vger.kernel.org; linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH v2 net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> On 4/8/21 2:15 AM, Dexuan Cui wrote:
> > diff --git a/drivers/net/ethernet/microsoft/Kconfig
> b/drivers/net/ethernet/microsoft/Kconfig
> > new file mode 100644
> > index ..12ef6b581566
> > --- /dev/null
> > +++ b/drivers/net/ethernet/microsoft/Kconfig
> > @@ -0,0 +1,30 @@
> > +#
> > +# Microsoft Azure network device configuration
> > +#
> > +
> > +config NET_VENDOR_MICROSOFT
> > +   bool "Microsoft Azure Network Device"
> 
> Seems to me that should be generalized, more like:
> 
>   bool "Microsoft Network Devices"
This device is planned for Azure cloud at this time.
We will update the wording if things change.

> 
> 
> > +   default y
> > +   help
> > + If you have a network (Ethernet) device belonging to this class, say 
> > Y.
> > +
> > + Note that the answer to this question doesn't directly affect the
> > + kernel: saying N will just cause the configurator to skip the
> > + question about Microsoft Azure network device. If you say Y, you
> 
>  about Microsoft networking devices.
(ditto)

> 
> > + will be asked for your specific device in the following question.
> > +
> > +if NET_VENDOR_MICROSOFT
> > +
> > +config MICROSOFT_MANA
> > +   tristate "Microsoft Azure Network Adapter (MANA) support"
> > +   default m
> 
> Please drop the default m. We don't randomly add drivers to be built.
We will.

> 
> Or leave this as is and change NET_VENDOR_MICROSOFT to be default n.
> 
> 
> > +   depends on PCI_MSI && X86_64
> > +   select PCI_HYPERV
> > +   help
> > + This driver supports Microsoft Azure Network Adapter (MANA).
> > + So far, the driver is only validated on X86_64.
> 
> validated how?
On our pre-release HW.

Thanks,
- Haiyang


RE: [PATCH net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-07 Thread Haiyang Zhang



> -Original Message-
> From: Wei Liu 
> Sent: Wednesday, April 7, 2021 11:01 AM
> To: Haiyang Zhang 
> Cc: Wei Liu ; Dexuan Cui ;
> da...@davemloft.net; k...@kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> Wei Liu ; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> On Wed, Apr 07, 2021 at 02:34:01PM +, Haiyang Zhang wrote:
> >
> >
> > > -Original Message-
> > > From: Wei Liu 
> > > Sent: Wednesday, April 7, 2021 9:17 AM
> > > To: Dexuan Cui 
> > > Cc: da...@davemloft.net; k...@kernel.org; KY Srinivasan
> > > ; Haiyang Zhang ;
> Stephen
> > > Hemminger ; wei@kernel.org; Wei Liu
> > > ; net...@vger.kernel.org; linux-
> > > ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> > > Subject: Re: [PATCH net-next] net: mana: Add a driver for Microsoft
> > > Azure Network Adapter (MANA)
> > >
> > > On Tue, Apr 06, 2021 at 04:23:21PM -0700, Dexuan Cui wrote:
> > > [...]
> > > > +config MICROSOFT_MANA
> > > > +   tristate "Microsoft Azure Network Adapter (MANA) support"
> > > > +   default m
> > > > +   depends on PCI_MSI
> > > > +   select PCI_HYPERV
> > >
> > > OOI which part of the code requires PCI_HYPERV?
> > >
> > > Asking because I can't immediately find code that looks to be
> > > Hyper-V specific (searching for vmbus etc). This device looks like
> > > any other PCI devices to me.
> >
> > It depends on the VF nic's PCI config space which is presented by the
> pci_hyperv driver.
> 
> I think all it matters is the PCI bus is able to handle the configuration 
> space
> access, right? Assuming there is an emulated PCI root complex which
> exposes the config space to the driver, will this driver still work?
> 
> I'm trying to understand how tightly coupled with Hyper-V PCI this driver is.
> In an alternative universe, Microsft may suddenly decide to sell this
> hardware and someone wants to passthrough an VF via VFIO. I don't see
> how this driver wouldn't work, hence the original question.
> 
> There is no need to change the code. I'm just curious about a tiny detail in
> the implementation.

Currently, the PCI config space only comes from pci_hyperv, so we have this 
dependency.

If the PCI config space is presented in other ways in an "alternative 
universe", 
we may need to add other dependencies. And yes, the VF should continue to work :)

Thanks,
- Haiyang


RE: [PATCH net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-07 Thread Haiyang Zhang



> -Original Message-
> From: Leon Romanovsky 
> Sent: Wednesday, April 7, 2021 10:55 AM
> To: Haiyang Zhang 
> Cc: Dexuan Cui ; da...@davemloft.net;
> k...@kernel.org; KY Srinivasan ; Stephen Hemminger
> ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> On Wed, Apr 07, 2021 at 02:41:45PM +, Haiyang Zhang wrote:
> >
> >
> > > -Original Message-
> > > From: Leon Romanovsky 
> > > Sent: Wednesday, April 7, 2021 8:51 AM
> > > To: Dexuan Cui 
> > > Cc: da...@davemloft.net; k...@kernel.org; KY Srinivasan
> > > ; Haiyang Zhang ;
> Stephen
> > > Hemminger ; wei@kernel.org; Wei Liu
> > > ; net...@vger.kernel.org; linux-
> > > ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> > > Subject: Re: [PATCH net-next] net: mana: Add a driver for Microsoft
> > > Azure Network Adapter (MANA)
> > >
> > > On Wed, Apr 07, 2021 at 08:40:13AM +, Dexuan Cui wrote:
> > > > > From: Leon Romanovsky 
> > > > > Sent: Wednesday, April 7, 2021 1:10 AM
> > > > >
> > > > > <...>
> > > > >
> > > > > > +int gdma_verify_vf_version(struct pci_dev *pdev) {
> > > > > > +   struct gdma_context *gc = pci_get_drvdata(pdev);
> > > > > > +   struct gdma_verify_ver_req req = { 0 };
> > > > > > +   struct gdma_verify_ver_resp resp = { 0 };
> > > > > > +   int err;
> > > > > > +
> > > > > > +   gdma_init_req_hdr(,
> GDMA_VERIFY_VF_DRIVER_VERSION,
> > > > > > + sizeof(req), sizeof(resp));
> > > > > > +
> > > > > > +   req.protocol_ver_min = GDMA_PROTOCOL_FIRST;
> > > > > > +   req.protocol_ver_max = GDMA_PROTOCOL_LAST;
> > > > > > +
> > > > > > +   err = gdma_send_request(gc, sizeof(req), , sizeof(resp),
> );
> > > > > > +   if (err || resp.hdr.status) {
> > > > > > +   pr_err("VfVerifyVersionOutput: %d, status=0x%x\n",
> err,
> > > > > > +  resp.hdr.status);
> > > > > > +   return -EPROTO;
> > > > > > +   }
> > > > > > +
> > > > > > +   return 0;
> > > > > > +}
> > > > >
> > > > > <...>
> > > > > > +   err = gdma_verify_vf_version(pdev);
> > > > > > +   if (err)
> > > > > > +   goto remove_irq;
> > > > >
> > > > > Will this VF driver be used in the guest VM? What will prevent
> > > > > from users
> > > to
> > > > > change it?
> > > > > I think that such version negotiation scheme is not allowed.
> > > >
> > > > Yes, the VF driver is expected to run in a Linux VM that runs on Azure.
> > > >
> > > > Currently gdma_verify_vf_version() just tells the PF driver that
> > > > the VF
> > > driver
> > > > is only able to support GDMA_PROTOCOL_V1, and want to use
> > > > GDMA_PROTOCOL_V1's message formats to talk to the PF driver later.
> > > >
> > > > enum {
> > > > GDMA_PROTOCOL_UNDEFINED = 0,
> > > > GDMA_PROTOCOL_V1 = 1,
> > > > GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1,
> > > > GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1,
> > > > GDMA_PROTOCOL_VALUE_MAX
> > > > };
> > > >
> > > > The PF driver is supposed to always support GDMA_PROTOCOL_V1, so I
> > > expect
> > > > here gdma_verify_vf_version() should succeed. If a user changes
> > > > the Linux
> > > VF
> > > > driver and try to use a protocol version not supported by the PF
> > > > driver,
> > > then
> > > > gdma_verify_vf_version() will fail; later, if the VF driver tries
> > > > to talk to the
> > > PF
> > > > driver using an unsupported message format, the PF driver will
> > > > return a
> > > failure.
> > >
> > > The worry is not for the current code, but for the future one when
> > > you will support v2, v3 e.t.c. First, your code will look like a
> > > spaghetti and second, users will try and mix vX with "unsupported"
> > > commands just for the fun.
> >
> > In the future, if the protocol version updated on the host side,
> > guests need to support different host versions because not all hosts
> > are updated (simultaneously). So this negotiation is necessary to know
> > the supported version, and decide the proper command version to use.
> 
> And how do other paravirtual drivers solve this negotiation scheme?

I saw some other drivers use version negotiation too, for example:

/**
 *  ixgbevf_negotiate_api_version_vf - Negotiate supported API version
 *  @hw: pointer to the HW structure
 *  @api: integer containing requested API version
 **/
static int ixgbevf_negotiate_api_version_vf(struct ixgbe_hw *hw, int api)
{

Thanks,
- Haiyang


RE: [PATCH net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-07 Thread Haiyang Zhang



> -Original Message-
> From: Leon Romanovsky 
> Sent: Wednesday, April 7, 2021 8:51 AM
> To: Dexuan Cui 
> Cc: da...@davemloft.net; k...@kernel.org; KY Srinivasan
> ; Haiyang Zhang ; Stephen
> Hemminger ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> On Wed, Apr 07, 2021 at 08:40:13AM +, Dexuan Cui wrote:
> > > From: Leon Romanovsky 
> > > Sent: Wednesday, April 7, 2021 1:10 AM
> > >
> > > <...>
> > >
> > > > +int gdma_verify_vf_version(struct pci_dev *pdev)
> > > > +{
> > > > +   struct gdma_context *gc = pci_get_drvdata(pdev);
> > > > +   struct gdma_verify_ver_req req = { 0 };
> > > > +   struct gdma_verify_ver_resp resp = { 0 };
> > > > +   int err;
> > > > +
> > > > +   gdma_init_req_hdr(, GDMA_VERIFY_VF_DRIVER_VERSION,
> > > > + sizeof(req), sizeof(resp));
> > > > +
> > > > +   req.protocol_ver_min = GDMA_PROTOCOL_FIRST;
> > > > +   req.protocol_ver_max = GDMA_PROTOCOL_LAST;
> > > > +
> > > > +   err = gdma_send_request(gc, sizeof(req), , sizeof(resp), 
> > > > );
> > > > +   if (err || resp.hdr.status) {
> > > > +   pr_err("VfVerifyVersionOutput: %d, status=0x%x\n", err,
> > > > +  resp.hdr.status);
> > > > +   return -EPROTO;
> > > > +   }
> > > > +
> > > > +   return 0;
> > > > +}
> > >
> > > <...>
> > > > +   err = gdma_verify_vf_version(pdev);
> > > > +   if (err)
> > > > +   goto remove_irq;
> > >
> > > Will this VF driver be used in the guest VM? What will prevent from users
> to
> > > change it?
> > > I think that such version negotiation scheme is not allowed.
> >
> > Yes, the VF driver is expected to run in a Linux VM that runs on Azure.
> >
> > Currently gdma_verify_vf_version() just tells the PF driver that the VF
> driver
> > is only able to support GDMA_PROTOCOL_V1, and want to use
> > GDMA_PROTOCOL_V1's message formats to talk to the PF driver later.
> >
> > enum {
> > GDMA_PROTOCOL_UNDEFINED = 0,
> > GDMA_PROTOCOL_V1 = 1,
> > GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1,
> > GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1,
> > GDMA_PROTOCOL_VALUE_MAX
> > };
> >
> > The PF driver is supposed to always support GDMA_PROTOCOL_V1, so I
> expect
> > here gdma_verify_vf_version() should succeed. If a user changes the Linux
> VF
> > driver and try to use a protocol version not supported by the PF driver,
> then
> > gdma_verify_vf_version() will fail; later, if the VF driver tries to talk 
> > to the
> PF
> > driver using an unsupported message format, the PF driver will return a
> failure.
> 
> The worry is not for the current code, but for the future one when you will
> support v2, v3 e.t.c. First, your code will look like a spaghetti and
> second, users will try and mix vX with "unsupported" commands just for the
> fun.

In the future, if the protocol version is updated on the host side, guests will 
need to support different host versions because not all hosts are updated 
(simultaneously). So this negotiation is necessary to learn the supported 
version, and decide the proper command version to use. 

If any user tries "unsupported commands just for the fun", the host will deny 
them and return an error.

Thanks,
- Haiyang


RE: [PATCH net-next] net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)

2021-04-07 Thread Haiyang Zhang



> -Original Message-
> From: Wei Liu 
> Sent: Wednesday, April 7, 2021 9:17 AM
> To: Dexuan Cui 
> Cc: da...@davemloft.net; k...@kernel.org; KY Srinivasan
> ; Haiyang Zhang ; Stephen
> Hemminger ; wei@kernel.org; Wei Liu
> ; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> Subject: Re: [PATCH net-next] net: mana: Add a driver for Microsoft Azure
> Network Adapter (MANA)
> 
> On Tue, Apr 06, 2021 at 04:23:21PM -0700, Dexuan Cui wrote:
> [...]
> > +config MICROSOFT_MANA
> > +   tristate "Microsoft Azure Network Adapter (MANA) support"
> > +   default m
> > +   depends on PCI_MSI
> > +   select PCI_HYPERV
> 
> OOI which part of the code requires PCI_HYPERV?
> 
> Asking because I can't immediately find code that looks to be Hyper-V
> specific (searching for vmbus etc). This device looks like any other PCI 
> devices
> to me.

It depends on the VF NIC's PCI config space, which is presented by the 
pci_hyperv driver.

Thanks,
- Haiyang


RE: [PATCH net-next] hv_netvsc: Add error handling while switching data path

2021-03-30 Thread Haiyang Zhang



> -Original Message-
> From: Vitaly Kuznetsov 
> Sent: Tuesday, March 30, 2021 7:43 AM
> To: Haiyang Zhang ; linux-
> hyp...@vger.kernel.org; net...@vger.kernel.org
> Cc: Haiyang Zhang ; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; da...@davemloft.net; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next] hv_netvsc: Add error handling while switching
> data path
> 
> Haiyang Zhang  writes:
> 
> > Add error handling in case of failure to send switching data path message
> > to the host.
> >
> > Reported-by: Shachar Raindel 
> > Signed-off-by: Haiyang Zhang 
> >
> > ---
> >  drivers/net/hyperv/hyperv_net.h |  6 +-
> >  drivers/net/hyperv/netvsc.c | 35 +-
> ---
> >  drivers/net/hyperv/netvsc_drv.c | 18 +++--
> >  3 files changed, 48 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/net/hyperv/hyperv_net.h
> b/drivers/net/hyperv/hyperv_net.h
> > index 59ac04a610ad..442c520ab8f3 100644
> > --- a/drivers/net/hyperv/hyperv_net.h
> > +++ b/drivers/net/hyperv/hyperv_net.h
> > @@ -269,7 +269,7 @@ int rndis_filter_receive(struct net_device *ndev,
> >  int rndis_filter_set_device_mac(struct netvsc_device *ndev,
> > const char *mac);
> >
> > -void netvsc_switch_datapath(struct net_device *nv_dev, bool vf);
> > +int netvsc_switch_datapath(struct net_device *nv_dev, bool vf);
> >
> >  #define NVSP_INVALID_PROTOCOL_VERSION  ((u32)0x)
> >
> > @@ -1718,4 +1718,8 @@ struct rndis_message {
> >  #define TRANSPORT_INFO_IPV6_TCP 0x10
> >  #define TRANSPORT_INFO_IPV6_UDP 0x20
> >
> > +#define RETRY_US_LO5000
> > +#define RETRY_US_HI1
> > +#define RETRY_MAX  2000/* >10 sec */
> > +
> >  #endif /* _HYPERV_NET_H */
> > diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> > index 5bce24731502..9d07c9ce4be2 100644
> > --- a/drivers/net/hyperv/netvsc.c
> > +++ b/drivers/net/hyperv/netvsc.c
> > @@ -31,12 +31,13 @@
> >   * Switch the data path from the synthetic interface to the VF
> >   * interface.
> >   */
> > -void netvsc_switch_datapath(struct net_device *ndev, bool vf)
> > +int netvsc_switch_datapath(struct net_device *ndev, bool vf)
> >  {
> > struct net_device_context *net_device_ctx = netdev_priv(ndev);
> > struct hv_device *dev = net_device_ctx->device_ctx;
> > struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx-
> >nvdev);
> > struct nvsp_message *init_pkt = _dev->channel_init_pkt;
> > +   int ret, retry = 0;
> >
> > /* Block sending traffic to VF if it's about to be gone */
> > if (!vf)
> > @@ -51,15 +52,41 @@ void netvsc_switch_datapath(struct net_device
> *ndev, bool vf)
> > init_pkt->msg.v4_msg.active_dp.active_datapath =
> > NVSP_DATAPATH_SYNTHETIC;
> >
> > +again:
> > trace_nvsp_send(ndev, init_pkt);
> >
> > -   vmbus_sendpacket(dev->channel, init_pkt,
> > +   ret = vmbus_sendpacket(dev->channel, init_pkt,
> >sizeof(struct nvsp_message),
> > -  (unsigned long)init_pkt,
> > -  VM_PKT_DATA_INBAND,
> > +  (unsigned long)init_pkt, VM_PKT_DATA_INBAND,
> >
> VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
> > +
> > +   /* If failed to switch to/from VF, let data_path_is_vf stay false,
> > +* so we use synthetic path to send data.
> > +*/
> > +   if (ret) {
> > +   if (ret != -EAGAIN) {
> > +   netdev_err(ndev,
> > +  "Unable to send sw datapath msg,
> err: %d\n",
> > +  ret);
> > +   return ret;
> > +   }
> > +
> > +   if (retry++ < RETRY_MAX) {
> > +   usleep_range(RETRY_US_LO, RETRY_US_HI);
> > +   goto again;
> > +   } else {
> > +   netdev_err(
> > +   ndev,
> > +   "Retry failed to send sw datapath msg,
> err: %d\n",
> > +   ret);
> 
> err is always -EAGAIN here, right?
> 
> > +   return ret;
> > +   }
> 
> Nitpicking: I think we can simplify the above a bit:
> 
>   if (ret) {
>   if (ret == -EAGAIN && retry++ < RETRY_MAX) {
>   usleep_range(RETRY_US_LO, RETRY_US_HI);
>   goto again;
>   }
>   netdev_err(ndev, "Unable to send sw datapath msg,
> err: %d\n", ret);
>   return ret;
>   }
> 

Yes this looks cleaner.

Thanks,

Haiyang


[PATCH net-next] hv_netvsc: Add error handling while switching data path

2021-03-29 Thread Haiyang Zhang
Add error handling in case of failure to send switching data path message
to the host.

Reported-by: Shachar Raindel 
Signed-off-by: Haiyang Zhang 

---
 drivers/net/hyperv/hyperv_net.h |  6 +-
 drivers/net/hyperv/netvsc.c | 35 +
 drivers/net/hyperv/netvsc_drv.c | 18 +++--
 3 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 59ac04a610ad..442c520ab8f3 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -269,7 +269,7 @@ int rndis_filter_receive(struct net_device *ndev,
 int rndis_filter_set_device_mac(struct netvsc_device *ndev,
const char *mac);
 
-void netvsc_switch_datapath(struct net_device *nv_dev, bool vf);
+int netvsc_switch_datapath(struct net_device *nv_dev, bool vf);
 
 #define NVSP_INVALID_PROTOCOL_VERSION  ((u32)0xFFFFFFFF)
 
@@ -1718,4 +1718,8 @@ struct rndis_message {
 #define TRANSPORT_INFO_IPV6_TCP 0x10
 #define TRANSPORT_INFO_IPV6_UDP 0x20
 
+#define RETRY_US_LO	5000
+#define RETRY_US_HI	10000
+#define RETRY_MAX	2000	/* >10 sec */
+
 #endif /* _HYPERV_NET_H */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 5bce24731502..9d07c9ce4be2 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -31,12 +31,13 @@
  * Switch the data path from the synthetic interface to the VF
  * interface.
  */
-void netvsc_switch_datapath(struct net_device *ndev, bool vf)
+int netvsc_switch_datapath(struct net_device *ndev, bool vf)
 {
struct net_device_context *net_device_ctx = netdev_priv(ndev);
struct hv_device *dev = net_device_ctx->device_ctx;
struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev);
struct nvsp_message *init_pkt = _dev->channel_init_pkt;
+   int ret, retry = 0;
 
/* Block sending traffic to VF if it's about to be gone */
if (!vf)
@@ -51,15 +52,41 @@ void netvsc_switch_datapath(struct net_device *ndev, bool 
vf)
init_pkt->msg.v4_msg.active_dp.active_datapath =
NVSP_DATAPATH_SYNTHETIC;
 
+again:
trace_nvsp_send(ndev, init_pkt);
 
-   vmbus_sendpacket(dev->channel, init_pkt,
+   ret = vmbus_sendpacket(dev->channel, init_pkt,
   sizeof(struct nvsp_message),
-  (unsigned long)init_pkt,
-  VM_PKT_DATA_INBAND,
+  (unsigned long)init_pkt, VM_PKT_DATA_INBAND,
   VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+
+   /* If failed to switch to/from VF, let data_path_is_vf stay false,
+* so we use synthetic path to send data.
+*/
+   if (ret) {
+   if (ret != -EAGAIN) {
+   netdev_err(ndev,
+  "Unable to send sw datapath msg, err: %d\n",
+  ret);
+   return ret;
+   }
+
+   if (retry++ < RETRY_MAX) {
+   usleep_range(RETRY_US_LO, RETRY_US_HI);
+   goto again;
+   } else {
+   netdev_err(
+   ndev,
+   "Retry failed to send sw datapath msg, err: 
%d\n",
+   ret);
+   return ret;
+   }
+   }
+
wait_for_completion(_dev->channel_init_wait);
net_device_ctx->data_path_is_vf = vf;
+
+   return 0;
 }
 
 /* Worker to setup sub channels on initial setup
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 97b5c9b60503..7349a70af083 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -38,9 +38,6 @@
 #include "hyperv_net.h"
 
 #define RING_SIZE_MIN  64
-#define RETRY_US_LO	5000
-#define RETRY_US_HI	10000
-#define RETRY_MAX	2000	/* >10 sec */
 
 #define LINKCHANGE_INT (2 * HZ)
 #define VF_TAKEOVER_INT (HZ / 10)
@@ -2402,6 +2399,7 @@ static int netvsc_vf_changed(struct net_device 
*vf_netdev, unsigned long event)
struct netvsc_device *netvsc_dev;
struct net_device *ndev;
bool vf_is_up = false;
+   int ret;
 
if (event != NETDEV_GOING_DOWN)
vf_is_up = netif_running(vf_netdev);
@@ -2418,9 +2416,17 @@ static int netvsc_vf_changed(struct net_device 
*vf_netdev, unsigned long event)
if (net_device_ctx->data_path_is_vf == vf_is_up)
return NOTIFY_OK;
 
-   netvsc_switch_datapath(ndev, vf_is_up);
-   netdev_info(ndev, "Data path switched %s VF: %s\n",
-   vf_is_up ? "to" : "from", vf_netdev->name);
+   ret = netvsc_switch_datapath(ndev, vf_is_up);
+
+   if

[PATCH net-next] hv_netvsc: Add a comment clarifying batching logic

2021-03-12 Thread Haiyang Zhang
From: Shachar Raindel 

The batching logic in netvsc_send is non-trivial, due to
a combination of the Linux API and the underlying hypervisor
interface. Add a comment explaining why the code is written this
way.

Signed-off-by: Shachar Raindel 
Signed-off-by: Haiyang Zhang 
---
 .../ethernet/microsoft/netvsc.rst | 14 -
 drivers/net/hyperv/netvsc.c   | 20 +++
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git 
a/Documentation/networking/device_drivers/ethernet/microsoft/netvsc.rst 
b/Documentation/networking/device_drivers/ethernet/microsoft/netvsc.rst
index c3f51c672a68..fc5acd427a5d 100644
--- a/Documentation/networking/device_drivers/ethernet/microsoft/netvsc.rst
+++ b/Documentation/networking/device_drivers/ethernet/microsoft/netvsc.rst
@@ -87,11 +87,15 @@ Receive Buffer
   contain one or more packets. The number of receive sections may be changed
   via ethtool Rx ring parameters.
 
-  There is a similar send buffer which is used to aggregate packets for 
sending.
-  The send area is broken into chunks of 6144 bytes, each of section may
-  contain one or more packets. The send buffer is an optimization, the driver
-  will use slower method to handle very large packets or if the send buffer
-  area is exhausted.
+  There is a similar send buffer which is used to aggregate packets
+  for sending.  The send area is broken into chunks, typically of 6144
+  bytes, each of section may contain one or more packets. Small
+  packets are usually transmitted via copy to the send buffer. However,
+  if the buffer is temporarily exhausted, or the packet to be transmitted is
+  an LSO packet, the driver will provide the host with pointers to the data
+  from the SKB. This attempts to achieve a balance between the overhead of
+  data copy and the impact of remapping VM memory to be accessible by the
+  host.
 
 XDP support
 ---
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index dc3f73c3b33e..dc333dceb055 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1006,6 +1006,26 @@ static inline void move_pkt_msd(struct hv_netvsc_packet 
**msd_send,
 }
 
 /* RCU already held by caller */
+/* Batching/bouncing logic is designed to attempt to optimize
+ * performance.
+ *
+ * For small, non-LSO packets we copy the packet to a send buffer
+ * which is pre-registered with the Hyper-V side. This enables the
+ * hypervisor to avoid remapping the aperture to access the packet
+ * descriptor and data.
+ *
+ * If we already started using a buffer and the netdev is transmitting
+ * a burst of packets, keep on copying into the buffer until it is
+ * full or we are done collecting a burst. If there is an existing
+ * buffer with space for the RNDIS descriptor but not the packet, copy
+ * the RNDIS descriptor to the buffer, keeping the packet in place.
+ *
+ * If we do batching and send more than one packet using a single
+ * NetVSC message, free the SKBs of the packets copied, except for the
+ * last packet. This is done to streamline the handling of the case
+ * where the last packet only had the RNDIS descriptor copied to the
+ * send buffer, with the data pointers included in the NetVSC message.
+ */
 int netvsc_send(struct net_device *ndev,
struct hv_netvsc_packet *packet,
struct rndis_message *rndis_msg,
-- 
2.25.1



RE: [PATCH v2 4/4] hv_netvsc: Restrict configurations on isolated guests

2021-01-26 Thread Haiyang Zhang



> -Original Message-
> From: Andrea Parri (Microsoft) 
> Sent: Tuesday, January 26, 2021 6:57 AM
> To: linux-kernel@vger.kernel.org
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; Michael Kelley
> ; linux-hyp...@vger.kernel.org; Tianyu Lan
> ; Saruhan Karademir
> ; Juan Vazquez ; Andrea
> Parri (Microsoft) ; Jakub Kicinski
> ; David S. Miller ;
> net...@vger.kernel.org
> Subject: [PATCH v2 4/4] hv_netvsc: Restrict configurations on isolated guests
> 
> Restrict the NVSP protocol version(s) that will be negotiated with the host to
> be NVSP_PROTOCOL_VERSION_61 or greater if the guest is running isolated.
> Moreover, do not advertise the SR-IOV capability and ignore
> NVSP_MSG_4_TYPE_SEND_VF_ASSOCIATION messages in isolated guests,
> which are not supposed to support SR-IOV.  This reduces the footprint of the
> code that will be exercised by Confidential VMs and hence the exposure to
> bugs and vulnerabilities.
> 
> Signed-off-by: Andrea Parri (Microsoft) 
> Acked-by: Jakub Kicinski 
> Cc: "David S. Miller" 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org

Reviewed-by: Haiyang Zhang 
Thanks.


RE: [PATCH 4/4] hv_netvsc: Restrict configurations on isolated guests

2021-01-21 Thread Haiyang Zhang



> -Original Message-
> From: Andrea Parri 
> Sent: Wednesday, January 20, 2021 11:05 PM
> To: Haiyang Zhang 
> Cc: linux-kernel@vger.kernel.org; KY Srinivasan ;
> Stephen Hemminger ; Wei Liu
> ; Michael Kelley ; Tianyu Lan
> ; Saruhan Karademir
> ; Juan Vazquez ; linux-
> hyp...@vger.kernel.org; David S. Miller ; Jakub
> Kicinski ; net...@vger.kernel.org
> Subject: Re: [PATCH 4/4] hv_netvsc: Restrict configurations on isolated
> guests
> 
> > > @@ -544,7 +545,8 @@ static int negotiate_nvsp_ver(struct hv_device
> > > *device,
> > >   init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;
> > >
> > >   if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) {
> > > - init_packet->msg.v2_msg.send_ndis_config.capability.sriov =
> > > 1;
> > > + if (!hv_is_isolation_supported())
> > > + init_packet-
> > > >msg.v2_msg.send_ndis_config.capability.sriov = 1;
> >
> > Please also add a log there stating we don't support sriov in this case.
> Otherwise,
> > customers will ask why vf not showing up.
> 
> IIUC, you're suggesting that I append something like:
> 
> + else
> + netdev_info(ndev, "SR-IOV not advertised: isolation
> supported\n");
> 
> I've added this locally; please let me know if you had something else
> /better in mind.

This message explains the failure reason better:
  "SR-IOV not advertised by guests on the host supporting isolation"

> 
> 
> > > @@ -563,6 +565,13 @@ static int negotiate_nvsp_ver(struct hv_device
> > > *device,
> > >   return ret;
> > >  }
> > >
> > > +static bool nvsp_is_valid_version(u32 version)
> > > +{
> > > +   if (hv_is_isolation_supported())
> > > +   return version >= NVSP_PROTOCOL_VERSION_61;
> > > +   return true;
> > Hosts support isolation should run nvsp 6.1+. This error is not expected.
> > Instead of fail silently, we should log an error to explain why it's 
> > failed, and
> the current version and expected version.
> 
> Please see my next comment below.
> 
> 
> > > +}
> > > +
> > >  static int netvsc_connect_vsp(struct hv_device *device,
> > > struct netvsc_device *net_device,
> > > const struct netvsc_device_info *device_info)
> > > @@ -579,12 +588,17 @@ static int netvsc_connect_vsp(struct hv_device
> > > *device,
> > >   init_packet = _device->channel_init_pkt;
> > >
> > >   /* Negotiate the latest NVSP protocol supported */
> > > - for (i = ARRAY_SIZE(ver_list) - 1; i >= 0; i--)
> > > + for (i = ARRAY_SIZE(ver_list) - 1; i >= 0; i--) {
> > > + if (!nvsp_is_valid_version(ver_list[i])) {
> > > + ret = -EPROTO;
> > > + goto cleanup;
> > > + }
> >
> > This code can catch the invalid, but cannot get the current host nvsp
> version.
> > I'd suggest move this check after version negotiation is done. So we can log
> what's
> > the current host nvsp version, and why we fail it (the expected nvsp ver).
> 
> Mmh, invalid versions are not negotiated.  How about I simply add the
> following logging right before the above 'ret = -EPROTO' say?
> 
> + netdev_err(ndev, "Invalid NVSP version %x
> (expected >= %x): isolation supported\n",
> +ver_list[i], NVSP_PROTOCOL_VERSION_61);
> 
> (or something along these lines)

The negotiation process runs from the latest to oldest. If the host is 5, your 
code 
will fail before trying v6.0, and log:
"Invalid NVSP version 60000 (expected >= 60001): isolation supported"
This will make user think the NVSP version is 6.0.

Since you will let the NIC fail and cleanup, there is no harm to check the 
version 
after negotiation. And this case is unexpected from a "normal" host. So I 
suggest 
move the check after negotiation is done, then we know the actual host nvsp 
version that causing this issue. And we can bring the accurate info to host 
team 
for better diagnosability.

Please point out this invalid version is caused by the host side, like this:
"Invalid NVSP version 0x50000 (expected >= 0x60001) from the host with 
isolation support"
Also please use "0x%x" for hexadecimal numbers.

> 
> 
> > > @@ -1357,7 +1371,8 @@ static void netvsc_receive_inband(struct
> > > net_device *ndev,
> > >   break;
> > >
> > >   case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION:
> > > - netvsc_send_vf(ndev, nvmsg, msglen);
> > > + if (!hv_is_isolation_supported())
> > > + netvsc_send_vf(ndev, nvmsg, msglen);
> >
> > When the driver doesn't advertise SRIOV, this message is not expected.
> > Instead of ignore silently, we should log an error.
> 
> I've appended:
> 
> + else
> + netdev_err(ndev, "Unexpected VF message:
> isolation supported\n");

Please log the msg type:
  "Ignore VF_ASSOCIATION msg from the host supporting isolation"

Thanks,
- Haiyang


RE: [PATCH 4/4] hv_netvsc: Restrict configurations on isolated guests

2021-01-20 Thread Haiyang Zhang



> -Original Message-
> From: Andrea Parri (Microsoft) 
> Sent: Tuesday, January 19, 2021 12:59 PM
> To: linux-kernel@vger.kernel.org
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; Michael Kelley
> ; Tianyu Lan ;
> Saruhan Karademir ; Juan Vazquez
> ; linux-hyp...@vger.kernel.org; Andrea Parri
> (Microsoft) ; David S. Miller
> ; Jakub Kicinski ;
> net...@vger.kernel.org
> Subject: [PATCH 4/4] hv_netvsc: Restrict configurations on isolated guests
> 
> Restrict the NVSP protocol version(s) that will be negotiated with the
> host to be NVSP_PROTOCOL_VERSION_61 or greater if the guest is running
> isolated.  Moreover, do not advertise the SR-IOV capability and ignore
> NVSP_MSG_4_TYPE_SEND_VF_ASSOCIATION messages in isolated guests,
> which
> are not supposed to support SR-IOV.  This reduces the footprint of the
> code that will be exercised by Confidential VMs and hence the exposure
> to bugs and vulnerabilities.
> 
> Signed-off-by: Andrea Parri (Microsoft) 
> Cc: "David S. Miller" 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org
> ---
>  drivers/net/hyperv/netvsc.c | 21 ++---
>  1 file changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index 1510a236aa341..8027d553cb67d 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -22,6 +22,7 @@
>  #include 
> 
>  #include 
> +#include 
> 
>  #include "hyperv_net.h"
>  #include "netvsc_trace.h"
> @@ -544,7 +545,8 @@ static int negotiate_nvsp_ver(struct hv_device
> *device,
>   init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;
> 
>   if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) {
> - init_packet->msg.v2_msg.send_ndis_config.capability.sriov =
> 1;
> + if (!hv_is_isolation_supported())
> + init_packet-
> >msg.v2_msg.send_ndis_config.capability.sriov = 1;

Please also add a log there stating we don't support sriov in this case. 
Otherwise,
customers will ask why vf not showing up.

> 
>   /* Teaming bit is needed to receive link speed updates */
>   init_packet-
> >msg.v2_msg.send_ndis_config.capability.teaming = 1;
> @@ -563,6 +565,13 @@ static int negotiate_nvsp_ver(struct hv_device
> *device,
>   return ret;
>  }
> 
> +static bool nvsp_is_valid_version(u32 version)
> +{
> +   if (hv_is_isolation_supported())
> +   return version >= NVSP_PROTOCOL_VERSION_61;
> +   return true;
Hosts support isolation should run nvsp 6.1+. This error is not expected.
Instead of fail silently, we should log an error to explain why it's failed, 
and the current version and expected version.


> +}
> +
>  static int netvsc_connect_vsp(struct hv_device *device,
> struct netvsc_device *net_device,
> const struct netvsc_device_info *device_info)
> @@ -579,12 +588,17 @@ static int netvsc_connect_vsp(struct hv_device
> *device,
>   init_packet = _device->channel_init_pkt;
> 
>   /* Negotiate the latest NVSP protocol supported */
> - for (i = ARRAY_SIZE(ver_list) - 1; i >= 0; i--)
> + for (i = ARRAY_SIZE(ver_list) - 1; i >= 0; i--) {
> + if (!nvsp_is_valid_version(ver_list[i])) {
> + ret = -EPROTO;
> + goto cleanup;
> + }

This code can catch the invalid, but cannot get the current host nvsp version.
I'd suggest move this check after version negotiation is done. So we can log 
what's
the current host nvsp version, and why we fail it (the expected nvsp ver).

>   if (negotiate_nvsp_ver(device, net_device, init_packet,
>  ver_list[i])  == 0) {
>   net_device->nvsp_version = ver_list[i];
>   break;
>   }
> + }
> 
>   if (i < 0) {
>   ret = -EPROTO;
> @@ -1357,7 +1371,8 @@ static void netvsc_receive_inband(struct
> net_device *ndev,
>   break;
> 
>   case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION:
> - netvsc_send_vf(ndev, nvmsg, msglen);
> + if (!hv_is_isolation_supported())
> + netvsc_send_vf(ndev, nvmsg, msglen);

When the driver doesn't advertise SRIOV, this message is not expected.
Instead of ignore silently, we should log an error.

Thanks,
- Haiyang



RE: [PATCH 3/3] hv_netvsc: Process NETDEV_GOING_DOWN on VF hot remove

2021-01-06 Thread Haiyang Zhang



> -Original Message-
> From: Long Li 
> Sent: Tuesday, January 5, 2021 8:16 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; David S. Miller
> ; Jakub Kicinski ; linux-
> hyp...@vger.kernel.org; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org
> Cc: Long Li 
> Subject: [PATCH 3/3] hv_netvsc: Process NETDEV_GOING_DOWN on VF hot
> remove
> 
> From: Long Li 
> 
> On VF hot remove, NETDEV_GOING_DOWN is sent to notify the VF is about
> to go down. At this time, the VF is still sending/receiving traffic and we
> request the VSP to switch datapath.
> 
> On completion, the datapath is switched to synthetic and we can proceed
> with VF hot remove.
> 
> Signed-off-by: Long Li 

Reviewed-by: Haiyang Zhang 


RE: [PATCH 2/3] hv_netvsc: Wait for completion on request NVSP_MSG4_TYPE_SWITCH_DATA_PATH

2021-01-06 Thread Haiyang Zhang



> -Original Message-
> From: Long Li 
> Sent: Tuesday, January 5, 2021 8:16 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; David S. Miller
> ; Jakub Kicinski ; linux-
> hyp...@vger.kernel.org; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org
> Cc: Long Li 
> Subject: [PATCH 2/3] hv_netvsc: Wait for completion on request
> NVSP_MSG4_TYPE_SWITCH_DATA_PATH
> 
> From: Long Li 
> 
> The completion indicates if NVSP_MSG4_TYPE_SWITCH_DATA_PATH has
> been
> processed by the VSP. The traffic is steered to VF or synthetic after we
> receive this completion.
> 
> Signed-off-by: Long Li 

Reviewed-by: Haiyang Zhang 


RE: [PATCH 1/3] hv_netvsc: Check VF datapath when sending traffic to VF

2021-01-06 Thread Haiyang Zhang



> -Original Message-
> From: Long Li 
> Sent: Tuesday, January 5, 2021 8:16 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; David S. Miller
> ; Jakub Kicinski ; linux-
> hyp...@vger.kernel.org; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org
> Cc: Long Li 
> Subject: [PATCH 1/3] hv_netvsc: Check VF datapath when sending traffic to
> VF
> 
> From: Long Li 
> 
> The driver needs to check if the datapath has been switched to VF before
> sending traffic to VF.
> 
> Signed-off-by: Long Li 
Reviewed-by: Haiyang Zhang 


RE: [PATCH] video: hyperv_fb: Fix the cache type when mapping the VRAM

2020-11-18 Thread Haiyang Zhang



> -Original Message-
> From: Dexuan Cui 
> Sent: Tuesday, November 17, 2020 7:03 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; wei@kernel.org;
> b.zolnier...@samsung.com; linux-hyp...@vger.kernel.org; dri-
> de...@lists.freedesktop.org; linux-fb...@vger.kernel.org; linux-
> ker...@vger.kernel.org; Michael Kelley 
> Cc: Wei Hu ; Dexuan Cui 
> Subject: [PATCH] video: hyperv_fb: Fix the cache type when mapping the
> VRAM
> 
> x86 Hyper-V used to essentially always overwrite the effective cache type of
> guest memory accesses to WB. This was problematic in cases where there is
> a physical device assigned to the VM, since that often requires that the VM
> should have control over cache types. Thus, on newer Hyper-V since 2018,
> Hyper-V always honors the VM's cache type, but unexpectedly Linux VM
> users start to complain that Linux VM's VRAM becomes very slow, and it
> turns out that Linux VM should not map the VRAM uncacheable by ioremap().
> Fix this slowness issue by using ioremap_cache().
> 
> On ARM64, ioremap_cache() is also required as the host also maps the VRAM
> cacheable, otherwise VM Connect can't display properly with ioremap() or
> ioremap_wc().
> 
> With this change, the VRAM on new Hyper-V is as fast as regular RAM, so it's
> no longer necessary to use the hacks we added to mitigate the slowness, i.e.
> we no longer need to allocate physical memory and use it to back up the
> VRAM in Generation-1 VM, and we also no longer need to allocate physical
> memory to back up the framebuffer in a Generation-2 VM and copy the
> framebuffer to the real VRAM. A further big change will address these for
> v5.11.
> 
> Fixes: 68a2d20b79b1 ("drivers/video: add Hyper-V Synthetic Video Frame
> Buffer Driver")
> Tested-by: Boqun Feng 
> Signed-off-by: Dexuan Cui 
> ---
> 
> Hi Wei Liu, can you please pick this up into the hyperv/linux.git tree's 
> hyperv-
> fixes branch? I really hope this patch can be in v5.10 since it fixes a
> longstanding issue:
> https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgith
> ub.com%2FLIS%2Flis-
> next%2Fissues%2F655data=04%7C01%7Chaiyangz%40microsoft.com%
> 7C7e371bb6f79f41aae12208d88b556c85%7C72f988bf86f141af91ab2d7cd011d
> b47%7C1%7C0%7C637412546297591335%7CUnknown%7CTWFpbGZsb3d8eyJ
> WIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
> 7C1000sdata=StqnT%2Fx1XVoVWUZbJz5BNjaCIdtuNmSf2JoyLSt0c%2B
> Q%3Dreserved=0
> 
>  drivers/video/fbdev/hyperv_fb.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/video/fbdev/hyperv_fb.c
> b/drivers/video/fbdev/hyperv_fb.c index 5bc86f481a78..c8b0ae676809
> 100644
> --- a/drivers/video/fbdev/hyperv_fb.c
> +++ b/drivers/video/fbdev/hyperv_fb.c
> @@ -1093,7 +1093,12 @@ static int hvfb_getmem(struct hv_device *hdev,
> struct fb_info *info)
>   goto err1;
>   }
> 
> - fb_virt = ioremap(par->mem->start, screen_fb_size);
> + /*
> +  * Map the VRAM cacheable for performance. This is also required
> for
> +  * VM Connect to display properly for ARM64 Linux VM, as the host
> also
> +  * maps the VRAM cacheable.
> +  */
> + fb_virt = ioremap_cache(par->mem->start, screen_fb_size);
>   if (!fb_virt)
>   goto err2;

Reviewed-by: Haiyang Zhang 
Thank you!


RE: [PATCH] hv_netvsc: Validate number of allocated sub-channels

2020-11-18 Thread Haiyang Zhang



> -Original Message-
> From: Andrea Parri (Microsoft) 
> Sent: Wednesday, November 18, 2020 10:33 AM
> To: linux-kernel@vger.kernel.org
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; linux-
> hyp...@vger.kernel.org; Michael Kelley ; Juan
> Vazquez ; Saruhan Karademir
> ; Andrea Parri (Microsoft)
> ; David S. Miller ; Jakub
> Kicinski ; net...@vger.kernel.org
> Subject: [PATCH] hv_netvsc: Validate number of allocated sub-channels
> 
> Lack of validation could lead to out-of-bound reads and information leaks (cf.
> usage of nvdev->chan_table[]).  Check that the number of allocated sub-
> channels fits into the expected range.
> 
> Suggested-by: Saruhan Karademir 
> Signed-off-by: Andrea Parri (Microsoft) 
> Cc: "David S. Miller" 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org
> ---
> Based on hyperv-next.
> 
>  drivers/net/hyperv/rndis_filter.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/net/hyperv/rndis_filter.c
> b/drivers/net/hyperv/rndis_filter.c
> index 3835d9bea1005..c5a709f67870f 100644
> --- a/drivers/net/hyperv/rndis_filter.c
> +++ b/drivers/net/hyperv/rndis_filter.c
> @@ -1226,6 +1226,11 @@ int rndis_set_subchannel(struct net_device *ndev,
>   return -EIO;
>   }
> 
> + /* Check that number of allocated sub channel is within the expected
> range */
> + if (init_packet->msg.v5_msg.subchn_comp.num_subchannels >
> nvdev->num_chn - 1) {
> + netdev_err(ndev, "invalid number of allocated sub
> channel\n");
> + return -EINVAL;
> + }
>   nvdev->num_chn = 1 +
>   init_packet->msg.v5_msg.subchn_comp.num_subchannels;

Reviewed-by: Haiyang Zhang 
Thank you.


RE: [PATCH v3] hv_netvsc: Add validation for untrusted Hyper-V values

2020-09-16 Thread Haiyang Zhang



> -Original Message-
> From: Andrea Parri (Microsoft) 
> Sent: Wednesday, September 16, 2020 5:47 AM
> To: linux-kernel@vger.kernel.org
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; linux-
> hyp...@vger.kernel.org; Andres Beltran ; Michael
> Kelley ; Saruhan Karademir
> ; Juan Vazquez ; Andrea
> Parri ; David S. Miller ;
> Jakub Kicinski ; net...@vger.kernel.org
> Subject: [PATCH v3] hv_netvsc: Add validation for untrusted Hyper-V values
> 
> From: Andres Beltran 
> 
> For additional robustness in the face of Hyper-V errors or malicious
> behavior, validate all values that originate from packets that Hyper-V
> has sent to the guest in the host-to-guest ring buffer. Ensure that
> invalid values cannot cause indexing off the end of an array, or
> subvert an existing validation via integer overflow. Ensure that
> outgoing packets do not have any leftover guest memory that has not
> been zeroed out.
> 
> Signed-off-by: Andres Beltran 
> Co-developed-by: Andrea Parri (Microsoft) 
> Signed-off-by: Andrea Parri (Microsoft) 
> Cc: "David S. Miller" 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org
> ---
> Changes in v3:
>   - Include header size in the estimate for hv_pkt_datalen (Haiyang)
> Changes in v2:
>   - Replace size check on struct nvsp_message with sub-checks (Haiyang)
> 
>  drivers/net/hyperv/hyperv_net.h   |   4 +
>  drivers/net/hyperv/netvsc.c   | 124 ++
>  drivers/net/hyperv/netvsc_drv.c   |   7 ++
>  drivers/net/hyperv/rndis_filter.c |  73 --
>  4 files changed, 188 insertions(+), 20 deletions(-)

Reviewed-by: Haiyang Zhang 



RE: [PATCH v2] hv_netvsc: Add validation for untrusted Hyper-V values

2020-09-10 Thread Haiyang Zhang



> -Original Message-
> From: Andrea Parri (Microsoft) 
> Sent: Thursday, September 10, 2020 8:48 AM
> To: linux-kernel@vger.kernel.org
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; linux-
> hyp...@vger.kernel.org; Andres Beltran ; Michael
> Kelley ; Saruhan Karademir
> ; Juan Vazquez ; Andrea
> Parri ; David S. Miller ;
> Jakub Kicinski ; net...@vger.kernel.org
> Subject: [PATCH v2] hv_netvsc: Add validation for untrusted Hyper-V values
> 
> From: Andres Beltran 
> 
> For additional robustness in the face of Hyper-V errors or malicious
> behavior, validate all values that originate from packets that Hyper-V
> has sent to the guest in the host-to-guest ring buffer. Ensure that
> invalid values cannot cause indexing off the end of an array, or
> subvert an existing validation via integer overflow. Ensure that
> outgoing packets do not have any leftover guest memory that has not
> been zeroed out.
> 
> Signed-off-by: Andres Beltran 
> Co-developed-by: Andrea Parri (Microsoft) 
> Signed-off-by: Andrea Parri (Microsoft) 
> Cc: "David S. Miller" 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org
> ---
> Changes in v2:
>   - Replace size check on struct nvsp_message with sub-checks (Haiyang)
> 
>  drivers/net/hyperv/hyperv_net.h   |   4 +
>  drivers/net/hyperv/netvsc.c   | 120 ++
>  drivers/net/hyperv/netvsc_drv.c   |   7 ++
>  drivers/net/hyperv/rndis_filter.c |  73 --
>  4 files changed, 184 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/net/hyperv/hyperv_net.h
> b/drivers/net/hyperv/hyperv_net.h
> index 4d2b2d48ff2a1..da78bd0fb2aa2 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -860,6 +860,10 @@ static inline u32 netvsc_rqstor_size(unsigned long
> ringbytes)
>  ringbytes / NETVSC_MIN_IN_MSG_SIZE;
>  }
> 
> +#define NETVSC_XFER_HEADER_SIZE(rng_cnt) \
> + (offsetof(struct vmtransfer_page_packet_header, ranges) +
> \
> + (rng_cnt) * sizeof(struct vmtransfer_page_range))
> +
>  struct multi_send_data {
>   struct sk_buff *skb; /* skb containing the pkt */
>   struct hv_netvsc_packet *pkt; /* netvsc pkt pending */
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index 03e93e3ddbad8..90b7a39c2dc78 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -388,6 +388,15 @@ static int netvsc_init_buf(struct hv_device *device,
>   net_device->recv_section_size = resp->sections[0].sub_alloc_size;
>   net_device->recv_section_cnt = resp->sections[0].num_sub_allocs;
> 
> + /* Ensure buffer will not overflow */
> + if (net_device->recv_section_size < NETVSC_MTU_MIN ||
> (u64)net_device->recv_section_size *
> + (u64)net_device->recv_section_cnt > (u64)buf_size) {
> + netdev_err(ndev, "invalid recv_section_size %u\n",
> +net_device->recv_section_size);
> + ret = -EINVAL;
> + goto cleanup;
> + }
> +
>   /* Setup receive completion ring.
>* Add 1 to the recv_section_cnt because at least one entry in a
>* ring buffer has to be empty.
> @@ -460,6 +469,12 @@ static int netvsc_init_buf(struct hv_device *device,
>   /* Parse the response */
>   net_device->send_section_size = init_packet->msg.
> 
>   v1_msg.send_send_buf_complete.section_size;
> + if (net_device->send_section_size < NETVSC_MTU_MIN) {
> + netdev_err(ndev, "invalid send_section_size %u\n",
> +net_device->send_section_size);
> + ret = -EINVAL;
> + goto cleanup;
> + }
> 
>   /* Section count is simply the size divided by the section size. */
>   net_device->send_section_cnt = buf_size / net_device-
> >send_section_size;
> @@ -740,12 +755,45 @@ static void netvsc_send_completion(struct
> net_device *ndev,
>  int budget)
>  {
>   const struct nvsp_message *nvsp_packet = hv_pkt_data(desc);
> + u32 msglen = hv_pkt_datalen(desc);
> +
> + /* Ensure packet is big enough to read header fields */
> + if (msglen < sizeof(struct nvsp_message_header)) {
> + netdev_err(ndev, "nvsp_message length too small: %u\n",
> msglen);
> + return;
> + }
> 
>   switch (nvsp_packet->hdr.msg_type) {
>   case NVSP_MSG_TYPE_INIT_COMPLETE:
> + if (msglen < sizeof(struct nvsp_message_init_complete)) {

This and other similar places should include header size:
if (msglen < sizeof(struct nvsp_message_header) + sizeof(struct 
nvsp_message_init_complete)) {

Thanks,
- Haiyang


[PATCH net, 2/2] hv_netvsc: Fix the queue_mapping in netvsc_vf_xmit()

2020-08-20 Thread Haiyang Zhang
netvsc_vf_xmit() / dev_queue_xmit() will call VF NIC’s ndo_select_queue
or netdev_pick_tx() again. They will use skb_get_rx_queue() to get the
queue number, so the “skb->queue_mapping - 1” will be used. This may
cause the last queue of VF not been used.

Use skb_record_rx_queue() here, so that the skb_get_rx_queue() called
later will get the correct queue number, and VF will be able to use
all queues.

Fixes: b3bf5666a510 ("hv_netvsc: defer queue selection to VF")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 0029292cdb9f..64b0a74c1523 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -502,7 +502,7 @@ static int netvsc_vf_xmit(struct net_device *net, struct 
net_device *vf_netdev,
int rc;
 
skb->dev = vf_netdev;
-   skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+   skb_record_rx_queue(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
 
rc = dev_queue_xmit(skb);
if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
-- 
2.25.1



[PATCH net, 1/2] hv_netvsc: Remove "unlikely" from netvsc_select_queue

2020-08-20 Thread Haiyang Zhang
When using vf_ops->ndo_select_queue, the number of queues of VF is
usually bigger than the synthetic NIC. This condition may happen
often.
Remove "unlikely" from the comparison of ndev->real_num_tx_queues.

Fixes: b3bf5666a510 ("hv_netvsc: defer queue selection to VF")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 787f17e2a971..0029292cdb9f 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -367,7 +367,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, 
struct sk_buff *skb,
}
rcu_read_unlock();
 
-   while (unlikely(txq >= ndev->real_num_tx_queues))
+   while (txq >= ndev->real_num_tx_queues)
txq -= ndev->real_num_tx_queues;
 
return txq;
-- 
2.25.1



[PATCH net, 0/2] hv_netvsc: Some fixes for the select_queue

2020-08-20 Thread Haiyang Zhang
This patch set includes two fixes for the select_queue process.

Haiyang Zhang (2):
  hv_netvsc: Remove "unlikely" from netvsc_select_queue
  hv_netvsc: Fix the queue_mapping in netvsc_vf_xmit()

 drivers/net/hyperv/netvsc_drv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

-- 
2.25.1



RE: [PATCH] hv_netvsc: Add validation for untrusted Hyper-V values

2020-08-02 Thread Haiyang Zhang



> -Original Message-
> From: Andres Beltran 
> Sent: Tuesday, July 28, 2020 6:53 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> wei@kernel.org
> Cc: linux-hyp...@vger.kernel.org; linux-kernel@vger.kernel.org; Michael
> Kelley ; parri.and...@gmail.com; Saruhan
> Karademir ; Andres Beltran ;
> David S . Miller ; Jakub Kicinski ;
> net...@vger.kernel.org
> Subject: [PATCH] hv_netvsc: Add validation for untrusted Hyper-V values
> 
> For additional robustness in the face of Hyper-V errors or malicious
> behavior, validate all values that originate from packets that Hyper-V
> has sent to the guest in the host-to-guest ring buffer. Ensure that
> invalid values cannot cause indexing off the end of an array, or
> subvert an existing validation via integer overflow. Ensure that
> outgoing packets do not have any leftover guest memory that has not
> been zeroed out.
> 
> Cc: David S. Miller 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org
> Signed-off-by: Andres Beltran 
> ---
>  drivers/net/hyperv/hyperv_net.h   |  4 ++
>  drivers/net/hyperv/netvsc.c   | 99 +++
>  drivers/net/hyperv/netvsc_drv.c   |  7 +++
>  drivers/net/hyperv/rndis_filter.c | 73 ---
>  4 files changed, 163 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index f43b614f2345..7df5943fa46f 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -860,6 +860,10 @@ static inline u32 netvsc_rqstor_size(unsigned long
> ringbytes)
>  ringbytes / NETVSC_MIN_IN_MSG_SIZE;
>  }
> 
> +#define NETVSC_XFER_HEADER_SIZE(rng_cnt) \
> + (offsetof(struct vmtransfer_page_packet_header, ranges) + \
> + (rng_cnt) * sizeof(struct vmtransfer_page_range))
> +
>  struct multi_send_data {
>   struct sk_buff *skb; /* skb containing the pkt */
>   struct hv_netvsc_packet *pkt; /* netvsc pkt pending */
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index 79b907a29433..7aa5276a1f36 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -398,6 +398,15 @@ static int netvsc_init_buf(struct hv_device *device,
>   net_device->recv_section_size = resp->sections[0].sub_alloc_size;
>   net_device->recv_section_cnt = resp->sections[0].num_sub_allocs;
> 
> + /* Ensure buffer will not overflow */
> + if (net_device->recv_section_size < NETVSC_MTU_MIN ||
> (u64)net_device->recv_section_size *
> + (u64)net_device->recv_section_cnt > (u64)buf_size) {
> + netdev_err(ndev, "invalid recv_section_size %u\n",
> +net_device->recv_section_size);
> + ret = -EINVAL;
> + goto cleanup;
> + }
> +
>   /* Setup receive completion ring.
>* Add 1 to the recv_section_cnt because at least one entry in a
>* ring buffer has to be empty.
> @@ -479,6 +488,12 @@ static int netvsc_init_buf(struct hv_device *device,
>   /* Parse the response */
>   net_device->send_section_size = init_packet->msg.
>   v1_msg.send_send_buf_complete.section_size;
> + if (net_device->send_section_size < NETVSC_MTU_MIN) {
> + netdev_err(ndev, "invalid send_section_size %u\n",
> +net_device->send_section_size);
> + ret = -EINVAL;
> + goto cleanup;
> + }
> 
>   /* Section count is simply the size divided by the section size. */
>   net_device->send_section_cnt = buf_size / net_device-
> >send_section_size;
> @@ -770,12 +785,24 @@ static void netvsc_send_completion(struct
> net_device *ndev,
>  int budget)
>  {
>   const struct nvsp_message *nvsp_packet = hv_pkt_data(desc);
> + u32 msglen = hv_pkt_datalen(desc);
> +
> + /* Ensure packet is big enough to read header fields */
> + if (msglen < sizeof(struct nvsp_message_header)) {
> + netdev_err(ndev, "nvsp_message length too small: %u\n",
> msglen);
> + return;
> + }
> 
>   switch (nvsp_packet->hdr.msg_type) {
>   case NVSP_MSG_TYPE_INIT_COMPLETE:
>   case NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE:
>   case NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE:
>   case NVSP_MSG5_TYPE_SUBCHANNEL:
> + if (msglen < sizeof(struct nvsp_message)) {
> + netdev_err(ndev, "nvsp_msg5 length too small: %u\n",
> +msglen);
> + return;
> + }

struct nvsp_message includes all message types, so its length is that of the
longest type. The messages from older host versions do not necessarily reach
sizeof(struct nvsp_message).

Testing on both new and older hosts is recommended, in case I didn't find
all the issues like this one.

Thanks,
- Haiyang


RE: [PATCH] Drivers: hv: vmbus: Fix variable assignments in hv_ringbuffer_read()

2020-07-26 Thread Haiyang Zhang


> -Original Message-
> From: Andres Beltran 
> Sent: Friday, July 24, 2020 7:04 PM
> To: Stephen Hemminger 
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> Wei Liu ; linux-hyp...@vger.kernel.org; linux-
> ker...@vger.kernel.org; Michael Kelley ; Andrea
> Parri ; Saruhan Karademir
> 
> Subject: Re: [PATCH] Drivers: hv: vmbus: Fix variable assignments in
> hv_ringbuffer_read()
> 
> On Fri, Jul 24, 2020 at 1:10 PM Stephen Hemminger
>  wrote:
> > What is the rationale for this change, it may break other code.
> >
> > A common API model in Windows world where this originated
> > is to have a call where caller first
> > makes request and then if the requested buffer is not big enough the
> > caller look at the actual length and allocate a bigger buffer.
> >
> > Did you audit all the users of this API to make sure they aren't doing that.
> >
> 
> The rationale for the change was to solve instances like the one
> @Haiyang Zhang pointed out, especially in hv_utils, which needs
> additional hardening. Unfortunately, there is an instance in
> hv_pci_onchannelcallback() that does what you just described. Thus,
> the fix will have to be made to all the callers of vmbus_recvpacket()
> and vmbus_recvpacket_raw() to make sure they check the return value,
> which most callers are not doing now. Thanks for pointing out this
> behavior. I was not aware that the length can be checked by callers to
> allocate a bigger buffer.

To prevent future coding error, please add code comments for 
hv_ringbuffer_read() to indicate that the buffer_actual_len may be 
nonzero when the function fails, and should not be used to 
determine if the function succeeds or not.

Thanks,
- Haiyang



RE: [PATCH] Drivers: hv: vmbus: Fix variable assignments in hv_ringbuffer_read()

2020-07-24 Thread Haiyang Zhang



> -Original Message-
> From: Stephen Hemminger 
> Sent: Friday, July 24, 2020 1:11 PM
> To: Andres Beltran 
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> wei@kernel.org; linux-hyp...@vger.kernel.org; linux-
> ker...@vger.kernel.org; Michael Kelley ;
> parri.and...@gmail.com; Saruhan Karademir 
> Subject: Re: [PATCH] Drivers: hv: vmbus: Fix variable assignments in
> hv_ringbuffer_read()
> 
> On Fri, 24 Jul 2020 09:46:06 -0700
> "Andres Beltran"  wrote:
> 
> > Assignments to buffer_actual_len and requestid happen before packetlen
> > is checked to be within buflen. If this condition is true,
> > hv_ringbuffer_read() returns with these variables already set to some
> > value even though no data is actually read. This might create
> > inconsistencies in any routine calling hv_ringbuffer_read(). Assign values
> > to such pointers after the packetlen check.
> >
> > Signed-off-by: Andres Beltran 
> > ---
> >  drivers/hv/ring_buffer.c | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
> > index 356e22159e83..e277ce7372a4 100644
> > --- a/drivers/hv/ring_buffer.c
> > +++ b/drivers/hv/ring_buffer.c
> > @@ -350,12 +350,13 @@ int hv_ringbuffer_read(struct vmbus_channel
> > *channel,
> >
> > offset = raw ? 0 : (desc->offset8 << 3);
> > packetlen = (desc->len8 << 3) - offset;
> > -   *buffer_actual_len = packetlen;
> > -   *requestid = desc->trans_id;
> >
> > if (unlikely(packetlen > buflen))
> > return -ENOBUFS;
> >
> > +   *buffer_actual_len = packetlen;
> > +   *requestid = desc->trans_id;
> > +
> > /* since ring is double mapped, only one copy is necessary */
> > memcpy(buffer, (const char *)desc + offset, packetlen);
> >
> 
> What is the rationale for this change, it may break other code.
> 
> A common API model in Windows world where this originated
> is to have a call where caller first
> makes request and then if the requested buffer is not big enough the
> caller look at the actual length and allocate a bigger buffer.
> 
> Did you audit all the users of this API to make sure they aren't doing that.

I took a quick look. I saw a case that treats the ringbuffer read as
successful if buffer_actual_len > 0. So if hv_ringbuffer_read() should
not be fixed this way, then all callers like balloon_onchannelcallback()
need a fix.

1476 static void balloon_onchannelcallback(void *context)
1477 {
1478 struct hv_device *dev = context;
1479 u32 recvlen;
1480 u64 requestid;
1481 struct dm_message *dm_msg;
1482 struct dm_header *dm_hdr;
1483 struct hv_dynmem_device *dm = hv_get_drvdata(dev);
1484 struct dm_balloon *bal_msg;
1485 struct dm_hot_add *ha_msg;
1486 union dm_mem_page_range *ha_pg_range;
1487 union dm_mem_page_range *ha_region;
1488
1489 memset(recv_buffer, 0, sizeof(recv_buffer));
1490 vmbus_recvpacket(dev->channel, recv_buffer,
1491  HV_HYP_PAGE_SIZE, , );
1492
1493 if (recvlen > 0) {
1494 dm_msg = (struct dm_message *)recv_buffer;
1495 dm_hdr = _msg->hdr;

Thanks,
- Haiyang



RE: [PATCH v8 net-next] net: hyperv: dump TX indirection table to ethtool regs

2020-07-24 Thread Haiyang Zhang



> -Original Message-
> From: Chi Song 
> Sent: Friday, July 24, 2020 12:14 AM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> Wei Liu ; David S. Miller ; Jakub
> Kicinski 
> Cc: linux-hyp...@vger.kernel.org; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org
> Subject: [PATCH v8 net-next] net: hyperv: dump TX indirection table to ethtool
> regs
> 
> An imbalanced TX indirection table causes netvsc to have low
> performance. This table is created and managed during runtime. To help
> better diagnose performance issues caused by imbalanced tables, it needs
> to make TX indirection tables visible.
> 
> Because the TX indirection table is driver-specific information,
> display it via ethtool register dump.
> 
> Signed-off-by: Chi Song 

Reviewed-by: Haiyang Zhang 



RE: [PATCH v7 net-next] net: hyperv: dump TX indirection table to ethtool regs

2020-07-23 Thread Haiyang Zhang



> -Original Message-
> From: Michal Kubecek 
> Sent: Thursday, July 23, 2020 3:36 PM
> To: Chi Song 
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> Wei Liu ; David S. Miller ; Jakub
> Kicinski ; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v7 net-next] net: hyperv: dump TX indirection table to
> ethtool regs
> 
> On Wed, Jul 22, 2020 at 11:59:09PM -0700, Chi Song wrote:
> > An imbalanced TX indirection table causes netvsc to have low
> > performance. This table is created and managed during runtime. To help
> > better diagnose performance issues caused by imbalanced tables, it needs
> > to make TX indirection tables visible.
> >
> > Because TX indirection table is driver specified information, so
> > display it via ethtool register dump.
> 
> Is the Tx indirection table really unique to netvsc or can we expect
> other drivers to support similar feature? Also, would it make sense to
> allow also setting the table with ethtool? (AFAICS it can be only set
> from hypervisor at the moment.)

Currently, the TX indirection table is only used by the Hyper-V synthetic NIC.
I'm not aware of any other NIC planning to use this.
This table is created by the host dynamically based on host-side CPU usage,
and provided to the VM periodically. Our protocol doesn't let the guest side
change it.

Thanks,
- Haiyang



RE: [PATCH v6] hv_netvsc: add support for vlans in AF_PACKET mode

2020-07-22 Thread Haiyang Zhang



> -Original Message-
> From: Sriram Krishnan 
> Sent: Wednesday, July 22, 2020 11:39 AM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> Wei Liu 
> Cc: mbumg...@cisco.com; u...@cisco.com; n...@cisco.com; xe-linux-
> exter...@cisco.com; David S. Miller ; Jakub Kicinski
> ; linux-hyp...@vger.kernel.org; net...@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: [PATCH v6] hv_netvsc: add support for vlans in AF_PACKET mode
> 
> Vlan tagged packets are getting dropped when used with DPDK that uses
> the AF_PACKET interface on a hyperV guest.
> 
> The packet layer uses the tpacket interface to communicate the vlans
> information to the upper layers. On Rx path, these drivers can read the
> vlan info from the tpacket header but on the Tx path, this information
> is still within the packet frame and requires the paravirtual drivers to
> push this back into the NDIS header which is then used by the host OS to
> form the packet.
> 
> This transition from the packet frame to NDIS header is currently missing
> hence causing the host OS to drop the all vlan tagged packets sent by
> the drivers that use AF_PACKET (ETH_P_ALL) such as DPDK.
> 
> Here is an overview of the changes in the vlan header in the packet path:
> 
> The RX path (userspace handles everything):
>   1. RX VLAN packet is stripped by HOST OS and placed in NDIS header
>   2. Guest Kernel RX hv_netvsc packets and moves VLAN info from NDIS
>  header into kernel SKB
>   3. Kernel shares packets with user space application with PACKET_MMAP.
>  The SKB VLAN info is copied to tpacket layer and indication set
>  TP_STATUS_VLAN_VALID.
>   4. The user space application will re-insert the VLAN info into the frame
> 
> The TX path:
>   1. The user space application has the VLAN info in the frame.
>   2. Guest kernel gets packets from the application with PACKET_MMAP.
>   3. The kernel later sends the frame to the hv_netvsc driver. The only way
>  to send VLANs is when the SKB is setup & the VLAN is stripped from the
>  frame.
>   4. TX VLAN is re-inserted by HOST OS based on the NDIS header. If it sees
>  a VLAN in the frame the packet is dropped.
> 
> Cc: xe-linux-exter...@cisco.com
> Cc: Sriram Krishnan 
> Signed-off-by: Sriram Krishnan 

Thank you!
Reviewed-by: Haiyang Zhang 


RE: [PATCH v3] net: hyperv: add support for vlans in netvsc driver

2020-07-21 Thread Haiyang Zhang


> -Original Message-
> From: Sriram Krishnan (srirakr2) 
> Sent: Tuesday, July 21, 2020 3:10 AM
> To: David Miller 
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> wei@kernel.org; Malcolm Bumgardner (mbumgard)
> ; Umesha G M (ugm) ; Niranjan M
> M (nimm) ; xe-linux-external(mailer list)  exter...@cisco.com>; k...@kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] net: hyperv: add support for vlans in netvsc driver
> 
> 
> 
> On 21/07/20, 4:57 AM, "David Miller"  wrote:
> 
> From: Sriram Krishnan 
> Date: Mon, 20 Jul 2020 22:15:51 +0530
> 
> > +   if (skb->protocol == htons(ETH_P_8021Q)) {
> > +   u16 vlan_tci = 0;
> > +   skb_reset_mac_header(skb);
> 
>> Please place an empty line between basic block local variable 
> declarations
>> and actual code.
> 
> > +   netdev_err(net,"Pop vlan err 
> %x\n",pop_err);
> 
> > A space is necessary before "pop_err".
> 
> Consolidated list of comments addressed:
> > 1. Blank line between declaration and code.
> Done
> 
> > 2. Error handling is different than other parts of this code.
> >   probably just need a goto drop on error.
> Done
> 
> > It seems like you are putting into message, then driver is putting it
> > into meta-data in next code block. Maybe it should be combined?
> Not done
> This was on purpose. Merging the two code blocks might break existing
> functionality.
> There could be other modes where the packet arrives with 802.1q already in
> the Skb and the skb->protocol needn’t be 802.1q.
> 
> > packet->total_bytes should be updated too.
> Not done.
> The total_bytes needs be the total length of packet after the host OS adds the
> 802.1q header back in before tx. Updating the total_bytes to -= VLAN_HEADER
> will lead to packet drop in the Host OS driver.

If you make this change, did you see any drop in a live test? The
"packet->total_bytes" in struct hv_netvsc_packet  is for book keeping 
only, which is used for stats info, and not visible by the host at all.

I made this suggestion because the "regular" vlan packet length was
counted in bytes without VLAN_HLEN (4) -- the vlan tag is kept
in the skb metadata, separate from the ethernet header. I want the
statistical data for AF_PACKET mode to be consistent with the regular case.

struct hv_netvsc_packet {
/* Bookkeeping stuff */
u8 cp_partial; /* partial copy into send buffer */

u8 rmsg_size; /* RNDIS header and PPI size */
u8 rmsg_pgcnt; /* page count of RNDIS header and PPI */
u8 page_buf_cnt;

u16 q_idx;
u16 total_packets;

u32 total_bytes;
u32 send_buf_index;
u32 total_data_buflen;
};

Thanks
- Haiyang


RE: [PATCH v6 net-next] net: hyperv: Add attributes to show TX indirection table

2020-07-21 Thread Haiyang Zhang



> -Original Message-
> From: Chi Song 
> Sent: Tuesday, July 21, 2020 12:59 AM
> To: Stephen Hemminger ; David Miller
> ; Haiyang Zhang ; KY
> Srinivasan ; Stephen Hemminger
> ; wei@kernel.org; k...@kernel.org; linux-
> hyp...@vger.kernel.org
> Cc: net...@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: [PATCH v6 net-next] net: hyperv: Add attributes to show TX 
> indirection
> table
> 
> An imbalanced TX indirection table causes netvsc to have low performance.
> This table is created and managed during runtime. To help better diagnose
> performance issues caused by imbalanced tables, add device attributes to show
> the content of TX indirection tables.
> 
> Signed-off-by: Chi Song 

Thank you!
Reviewed-by: Haiyang Zhang 




RE: [PATCH v3] net: hyperv: add support for vlans in netvsc driver

2020-07-20 Thread Haiyang Zhang



> -Original Message-
> From: Sriram Krishnan 
> Sent: Monday, July 20, 2020 12:46 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> Wei Liu 
> Cc: mbumg...@cisco.com; u...@cisco.com; n...@cisco.com; xe-linux-
> exter...@cisco.com; David S. Miller ; Jakub Kicinski
> ; linux-hyp...@vger.kernel.org; net...@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: [PATCH v3] net: hyperv: add support for vlans in netvsc driver

Also netvsc already supports vlan in "regular" cases. Please be more specific 
in the subject.
Suggested subject: hv_netvsc: add support for vlans in AF_PACKET mode

Thanks,
- Haiyang


RE: [PATCH v3] net: hyperv: add support for vlans in netvsc driver

2020-07-20 Thread Haiyang Zhang



> -Original Message-
> From: Sriram Krishnan 
> Sent: Monday, July 20, 2020 12:46 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> Wei Liu 
> Cc: mbumg...@cisco.com; u...@cisco.com; n...@cisco.com; xe-linux-
> exter...@cisco.com; David S. Miller ; Jakub Kicinski
> ; linux-hyp...@vger.kernel.org; net...@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: [PATCH v3] net: hyperv: add support for vlans in netvsc driver
> 
> Vlan tagged packets are getting dropped when used with DPDK that uses
> the AF_PACKET interface on a hyperV guest.
> 
> The packet layer uses the tpacket interface to communicate the vlans
> information to the upper layers. On Rx path, these drivers can read the
> vlan info from the tpacket header but on the Tx path, this information
> is still within the packet frame and requires the paravirtual drivers to
> push this back into the NDIS header which is then used by the host OS to
> form the packet.
> 
> This transition from the packet frame to NDIS header is currently missing
> hence causing the host OS to drop the all vlan tagged packets sent by
> the drivers that use AF_PACKET (ETH_P_ALL) such as DPDK.
> 
> Here is an overview of the changes in the vlan header in the packet path:
> 
> The RX path (userspace handles everything):
>   1. RX VLAN packet is stripped by HOST OS and placed in NDIS header
>   2. Guest Kernel RX hv_netvsc packets and moves VLAN info from NDIS
>  header into kernel SKB
>   3. Kernel shares packets with user space application with PACKET_MMAP.
>  The SKB VLAN info is copied to tpacket layer and indication set
>  TP_STATUS_VLAN_VALID.
>   4. The user space application will re-insert the VLAN info into the frame.
> 
> The TX path:
>   1. The user space application has the VLAN info in the frame.
>   2. Guest kernel gets packets from the application with PACKET_MMAP.
>   3. The kernel later sends the frame to the hv_netvsc driver. The only way
>  to send VLANs is when the SKB is set up & the VLAN is stripped from the
>  frame.
>   4. TX VLAN is re-inserted by HOST OS based on the NDIS header. If it sees
>  a VLAN in the frame the packet is dropped.
> 
> Cc: xe-linux-exter...@cisco.com
> Cc: Sriram Krishnan 
> Signed-off-by: Sriram Krishnan 
> ---
>  drivers/net/hyperv/netvsc_drv.c | 23 +++
>  1 file changed, 23 insertions(+)
> 
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index 6267f706e8ee..2a25b4352369 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -605,6 +605,29 @@ static int netvsc_xmit(struct sk_buff *skb, struct
> net_device *net, bool xdp_tx)
>   *hash_info = hash;
>   }
> 
> + /* When using AF_PACKET we need to remove VLAN from frame
> +  * and indicate VLAN information in SKB so HOST OS will
> +  * transmit the VLAN frame
> +  */
> + if (skb->protocol == htons(ETH_P_8021Q)) {
> + u16 vlan_tci = 0;
> + skb_reset_mac_header(skb);
> + if (eth_type_vlan(eth_hdr(skb)->h_proto)) {
> + int pop_err;
> + pop_err = __skb_vlan_pop(skb, _tci);
> + if (likely(pop_err == 0)) {
> + __vlan_hwaccel_put_tag(skb,
> htons(ETH_P_8021Q), vlan_tci);
> +
> + /* Update the NDIS header pkt lengths */
> + packet->total_data_buflen -= VLAN_HLEN;

packet->total_bytes should be updated too.

Thanks,
- Haiyang



RE: [PATCH v3 net-next] net: hyperv: Add attributes to show TX indirection table

2020-07-20 Thread Haiyang Zhang



> -Original Message-
> From: Chi Song 
> Sent: Monday, July 20, 2020 3:17 AM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> Wei Liu ; David S. Miller ; Jakub
> Kicinski 
> Cc: linux-hyp...@vger.kernel.org; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org
> Subject: [PATCH v3 net-next] net: hyperv: Add attributes to show TX 
> indirection
> table
> 
> An imbalanced TX indirection table causes netvsc to have low
> performance. This table is created and managed during runtime. To help
> better diagnose performance issues caused by imbalanced tables, add
> device attributes to show the content of TX indirection tables.
> 
> Signed-off-by: Chi Song 
> ---
> v2: remove RX as it's in ethtool already, show single value in each file,
>  and update description.
> v3: fix broken format by alpine.
> 
> Thank you for comments. Let me know, if I miss something.
> 
> ---
>  drivers/net/hyperv/netvsc_drv.c | 53 +
>  1 file changed, 53 insertions(+)
> 
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index 6267f706e8ee..222c2fad9300 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -2370,6 +2370,55 @@ static int netvsc_unregister_vf(struct net_device
> *vf_netdev)
>   return NOTIFY_OK;
>  }
> 
> +static struct device_attribute
> dev_attr_netvsc_dev_attrs[VRSS_SEND_TAB_SIZE];
> +static struct attribute *netvsc_dev_attrs[VRSS_SEND_TAB_SIZE + 1];
> +
> +const struct attribute_group netvsc_dev_group = {
> + .name = NULL,
> + .attrs = netvsc_dev_attrs,
> +};
> +
> +static ssize_t tx_indirection_table_show(struct device *dev,
> +  struct device_attribute *dev_attr,
> +  char *buf)
> +{
> + struct net_device *ndev = to_net_dev(dev);
> + struct net_device_context *ndc = netdev_priv(ndev);
> + ssize_t offset = 0;
> + int index = dev_attr - dev_attr_netvsc_dev_attrs;
> +
> + offset = sprintf(buf, "%u\n", ndc->tx_table[index]);
> +
> + return offset;
> +}
> +
> +static void netvsc_attrs_init(void)
> +{
> + int i;
> + char buffer[32];
> +
> + for (i = 0; i < VRSS_SEND_TAB_SIZE; i++) {
> + sprintf(buffer, "tx_indirection_table_%02u", i);
> + dev_attr_netvsc_dev_attrs[i].attr.name =
> + kstrdup(buffer, GFP_KERNEL);
> + dev_attr_netvsc_dev_attrs[i].attr.mode = 0444;
> + sysfs_attr_init(_attr_netvsc_dev_attrs[i].attr);
> +
> + dev_attr_netvsc_dev_attrs[i].show = tx_indirection_table_show;
> + dev_attr_netvsc_dev_attrs[i].store = NULL;
> + netvsc_dev_attrs[i] = _attr_netvsc_dev_attrs[i].attr;
> + }
> + netvsc_dev_attrs[VRSS_SEND_TAB_SIZE] = NULL;
> +}
> +
> +static void netvsc_attrs_exit(void)
> +{
> + int i;
> +
> + for (i = 0; i < VRSS_SEND_TAB_SIZE; i++)
> + kfree(dev_attr_netvsc_dev_attrs[i].attr.name);
> +}
> +
>  static int netvsc_probe(struct hv_device *dev,
>   const struct hv_vmbus_device_id *dev_id)
>  {
> @@ -2396,6 +2445,7 @@ static int netvsc_probe(struct hv_device *dev,
>  net_device_ctx->msg_enable);
> 
>   hv_set_drvdata(dev, net);
> + netvsc_attrs_init();

dev_attr_netvsc_dev_attrs is a global array, so it should be initialized
and cleaned up in netvsc_drv_init/exit().
If there are multiple NICs and possible hot add/remove, initializing and
cleaning it in netvsc_probe/remove() will cause a memory leak and/or use
of freed memory.

Thanks,
- Haiyang


RE: [PATCH v2 3/3] hv_netvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening

2020-06-29 Thread Haiyang Zhang



> -Original Message-
> From: Andres Beltran 
> Sent: Monday, June 29, 2020 4:02 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> wei@kernel.org
> Cc: linux-hyp...@vger.kernel.org; linux-kernel@vger.kernel.org; Michael
> Kelley ; parri.and...@gmail.com; Andres Beltran
> ; David S . Miller ; Jakub
> Kicinski ; net...@vger.kernel.org
> Subject: [PATCH v2 3/3] hv_netvsc: Use vmbus_requestor to generate
> transaction IDs for VMBus hardening
> 
> Currently, pointers to guest memory are passed to Hyper-V as
> transaction IDs in netvsc. In the face of errors or malicious
> behavior in Hyper-V, netvsc should not expose or trust the transaction
> IDs returned by Hyper-V to be valid guest memory addresses. Instead,
> use small integers generated by vmbus_requestor as requests
> (transaction) IDs.
> 
> Cc: David S. Miller 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org
> Signed-off-by: Andres Beltran 

Reviewed-by: Haiyang Zhang 




RE: [PATCH 3/3] hv_netvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening

2020-06-25 Thread Haiyang Zhang



> -Original Message-
> From: Andres Beltran 
> Sent: Thursday, June 25, 2020 11:37 AM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger ;
> wei@kernel.org
> Cc: linux-hyp...@vger.kernel.org; linux-kernel@vger.kernel.org; Michael
> Kelley ; parri.and...@gmail.com; Andres Beltran
> ; David S. Miller ; Jakub
> Kicinski ; net...@vger.kernel.org
> Subject: [PATCH 3/3] hv_netvsc: Use vmbus_requestor to generate transaction
> IDs for VMBus hardening
> 
> Currently, pointers to guest memory are passed to Hyper-V as
> transaction IDs in netvsc. In the face of errors or malicious
> behavior in Hyper-V, netvsc should not expose or trust the transaction
> IDs returned by Hyper-V to be valid guest memory addresses. Instead,
> use small integers generated by vmbus_requestor as requests
> (transaction) IDs.
> 
> Cc: "David S. Miller" 
> Cc: Jakub Kicinski 
> Cc: net...@vger.kernel.org
> Signed-off-by: Andres Beltran 
> ---
>  drivers/net/hyperv/hyperv_net.h   | 10 +
>  drivers/net/hyperv/netvsc.c   | 75 +--
>  drivers/net/hyperv/rndis_filter.c |  1 +
>  include/linux/hyperv.h|  1 +
>  4 files changed, 73 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index abda736e7c7d..14735c98e798 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -847,6 +847,16 @@ struct nvsp_message {
> 
>  #define NETVSC_XDP_HDRM 256
> 
> +#define NETVSC_MIN_OUT_MSG_SIZE (sizeof(struct vmpacket_descriptor) + \
> +  sizeof(struct nvsp_message))
> +#define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor)
> +
> +/* Estimated requestor size:
> + * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size
> + */
> +#define NETVSC_RQSTOR_SIZE (netvsc_ring_bytes /
> NETVSC_MIN_OUT_MSG_SIZE + \
> + netvsc_ring_bytes / NETVSC_MIN_IN_MSG_SIZE)

Please make the variable as the macro parameter for readability:
#define NETVSC_RQSTOR_SIZE(ringbytes) (ringbytes / NETVSC_MIN_OUT_MSG_SIZE ...

Then put the actual variable name when you use the macro.

Thanks,
- Haiyang



RE: [PATCH v2] hv_netvsc: Fix netvsc_start_xmit's return type

2020-04-30 Thread Haiyang Zhang



> -Original Message-
> From: Nathan Chancellor 
> Sent: Thursday, April 30, 2020 2:02 AM
> To: Michael Kelley 
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Wei Liu ; linux-
> hyp...@vger.kernel.org; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org; clang-built-li...@googlegroups.com; Sami
> Tolvanen 
> Subject: Re: [PATCH v2] hv_netvsc: Fix netvsc_start_xmit's return type
> 
> Hi Michael,
> 
> On Thu, Apr 30, 2020 at 12:06:09AM +, Michael Kelley wrote:
> > From: Nathan Chancellor  Sent: Tuesday,
> > April 28, 2020 10:55 AM
> > >
> > > Do note that netvsc_xmit still returns int because netvsc_xmit has a
> > > potential return from netvsc_vf_xmit, which does not return
> > > netdev_tx_t because of the call to dev_queue_xmit.
> > >
> > > I am not sure if that is an oversight that was introduced by commit
> > > 0c195567a8f6e ("netvsc: transparent VF management") or if everything
> > > works properly as it is now.
> > >
> > > My patch is purely concerned with making the definition match the
> > > prototype so it should be NFC aside from avoiding the CFI panic.
> > >
> >
> > While it probably works correctly now, I'm not too keen on just
> > changing the return type for netvsc_start_xmit() and assuming the
> > 'int' that is returned from netvsc_xmit() will be correctly mapped to
> > the netdev_tx_t enum type.  While that mapping probably happens
> > correctly at the moment, this really should have a more holistic fix.
> 
> While it might work correctly, I am not sure that the mapping is correct,
> hence that comment.
> 
> netdev_tx_t is an enum with two acceptable types, 0x00 and 0x10. Up until
> commit 0c195567a8f6e ("netvsc: transparent VF management"), netvsc_xmit
> was guaranteed to return something of type netdev_tx_t.
> 
> However, after said commit, we could return anything from netvsc_vf_xmit,
> which in turn calls dev_queue_xmit then __dev_queue_xmit which will
> return either an error code (-ENOMEM or
> -ENETDOWN) or something from __dev_xmit_skb, which appears to be
> NET_XMIT_SUCCESS, NET_XMIT_DROP, or NET_XMIT_CN.
> 
> It does not look like netvsc_xmit or netvsc_vf_xmit try to convert those
> returns to netdev_tx_t in some way; netvsc_vf_xmit just passes the return
> value up to netvsc_xmit, which is the part that I am unsure about...
> 
> > Nathan -- are you willing to look at doing the more holistic fix?  Or
> > should we see about asking Haiyang Zhang to do it?
> 
> I would be fine trying to look at a more holistic fix but I know basically 
> nothing
> about this subsystem. I am unsure if something like this would be acceptable
> or if something else needs to happen.
> Otherwise, I'd be fine with you guys taking a look and just giving me
> reported-by credit.

Here is more info regarding Linux network subsystem:

As said in "include/linux/netdevice.h", drivers are allowed to return any codes 
from the three different namespaces.
And hv_netvsc needs to support "transparent VF", and calls netvsc_vf_xmit >> 
dev_queue_xmit which returns qdisc return codes, and errnos like -ENOMEM, 
etc. These are compliant with the guideline below:

  79 /*
  80  * Transmit return codes: transmit return codes originate from three 
different
  81  * namespaces:
  82  *
  83  * - qdisc return codes
  84  * - driver transmit return codes
  85  * - errno values
  86  *
  87  * Drivers are allowed to return any one of those in their 
hard_start_xmit()

Also, the ndo_start_xmit function pointer is used by upper-layer functions
which can handle all three types of return codes.
For example, in the calling stack: ndo_start_xmit << netdev_start_xmit << 
xmit_one << dev_hard_start_xmit():
The function dev_hard_start_xmit() uses dev_xmit_complete() to handle the 
return codes. It handles three types of the return codes correctly.

 3483 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct 
net_device *dev,
 3484 struct netdev_queue *txq, int *ret)
 3485 {
 3486 struct sk_buff *skb = first;
 3487 int rc = NETDEV_TX_OK;
 3488
 3489 while (skb) {
 3490 struct sk_buff *next = skb->next;
 3491
 3492 skb_mark_not_on_list(skb);
 3493 rc = xmit_one(skb, dev, txq, next != NULL);
 3494 if (unlikely(!dev_xmit_complete(rc))) {
 3495 skb->next = next;
 3496 goto out;
 3497 }
 3498
 3499 skb = next;
 3500 if (netif_tx_queue_stopped(txq) && skb) {
 3501 rc = NETDEV_TX_BUSY;
 3502  

RE: [PATCH v2][PATCH net] hv_netvsc: Add the support of hibernation

2019-09-25 Thread Haiyang Zhang



> -Original Message-
> From: Dexuan Cui 
> Sent: Wednesday, September 25, 2019 6:04 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; sas...@kernel.org; da...@davemloft.net;
> linux-hyp...@vger.kernel.org; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org; Michael Kelley 
> Cc: Dexuan Cui 
> Subject: [PATCH v2][PATCH net] hv_netvsc: Add the support of hibernation
> 
> The existing netvsc_detach() and netvsc_attach() APIs make it easy to
> implement the suspend/resume callbacks.
> 
> Signed-off-by: Dexuan Cui 

Reviewed-by: Haiyang Zhang 


RE: [PATCH v3 02/26] PCI: hv: Use PCI_STD_NUM_BARS

2019-09-16 Thread Haiyang Zhang



> -Original Message-
> From: Denis Efremov 
> Sent: Monday, September 16, 2019 4:42 PM
> To: Bjorn Helgaas 
> Cc: Denis Efremov ; linux-kernel@vger.kernel.org;
> linux-...@vger.kernel.org; Andrew Murray ;
> linux-hyp...@vger.kernel.org; KY Srinivasan ; Haiyang
> Zhang ; Stephen Hemminger
> ; Sasha Levin 
> Subject: [PATCH v3 02/26] PCI: hv: Use PCI_STD_NUM_BARS
> 
> Replace the magic constant (6) with define PCI_STD_NUM_BARS
> representing the number of PCI BARs.
> 
> Cc: "K. Y. Srinivasan" 
> Cc: Haiyang Zhang 
> Cc: Stephen Hemminger 
> Cc: Sasha Levin 
> Signed-off-by: Denis Efremov 
> ---
>  drivers/pci/controller/pci-hyperv.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-
> hyperv.c
> index 40b625458afa..1665c23b649f 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -307,7 +307,7 @@ struct pci_bus_relations {  struct
> pci_q_res_req_response {
>   struct vmpacket_descriptor hdr;
>   s32 status; /* negative values are failures */
> - u32 probed_bar[6];
> + u32 probed_bar[PCI_STD_NUM_BARS];
>  } __packed;
> 
>  struct pci_set_power {
> @@ -503,7 +503,7 @@ struct hv_pci_dev {
>* What would be observed if one wrote 0xFFFFFFFF to a BAR and
> then
>* read it back, for each of the BAR offsets within config space.
>*/
> - u32 probed_bar[6];
> + u32 probed_bar[PCI_STD_NUM_BARS];
>  };
> 
>  struct hv_pci_compl {
> @@ -1327,7 +1327,7 @@ static void survey_child_resources(struct
> hv_pcibus_device *hbus)
>* so it's sufficient to just add them up without tracking alignment.
>*/
> - list_for_each_entry(hpdev, &hbus->children, list_entry) {
> - for (i = 0; i < 6; i++) {
> + for (i = 0; i < PCI_STD_NUM_BARS; i++) {
>   if (hpdev->probed_bar[i] &
> PCI_BASE_ADDRESS_SPACE_IO)
>   dev_err(>hdev->device,
>   "There's an I/O BAR in this list!\n");
> @@ -1401,7 +1401,7 @@ static void prepopulate_bars(struct
> hv_pcibus_device *hbus)
>   /* Pick addresses for the BARs. */
>   do {
> list_for_each_entry(hpdev, &hbus->children, list_entry) {
> - for (i = 0; i < 6; i++) {
> + for (i = 0; i < PCI_STD_NUM_BARS; i++) {
>   bar_val = hpdev->probed_bar[i];
>   if (bar_val == 0)
>   continue;
> @@ -1558,7 +1558,7 @@ static void q_resource_requirements(void
> *context, struct pci_response *resp,
>   "query resource requirements failed: %x\n",
>   resp->status);
>   } else {
> - for (i = 0; i < 6; i++) {
> + for (i = 0; i < PCI_STD_NUM_BARS; i++) {
>   completion->hpdev->probed_bar[i] =
>   q_res_req->probed_bar[i];
>   }

Reviewed-by: Haiyang Zhang 
Thanks.


RE: [PATCH][PATCH net-next] hv_netvsc: Add the support of hibernation

2019-09-12 Thread Haiyang Zhang



> -Original Message-
> From: Dexuan Cui 
> Sent: Wednesday, September 11, 2019 7:38 PM
> To: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; sas...@kernel.org; da...@davemloft.net;
> linux-hyp...@vger.kernel.org; net...@vger.kernel.org; linux-
> ker...@vger.kernel.org; Michael Kelley 
> Cc: Dexuan Cui 
> Subject: [PATCH][PATCH net-next] hv_netvsc: Add the support of hibernation
> 
> The existing netvsc_detach() and netvsc_attach() APIs make it easy to
> implement the suspend/resume callbacks.
> 
> Signed-off-by: Dexuan Cui 
> ---
> 
> This patch is basically a pure Hyper-V specific change and it has a
> build dependency on the commit 271b2224d42f ("Drivers: hv: vmbus:
> Implement
> suspend/resume for VSC drivers for hibernation"), which is on Sasha Levin's
> Hyper-V tree's hyperv-next branch:
> https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.k
> ernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Fhyperv%2Flinux.git%2
> Flog%2F%3Fh%3Dhyperv-
> nextdata=02%7C01%7Chaiyangz%40microsoft.com%7C0b84e40c446
> 648cc35fb08d737110e28%7C72f988bf86f141af91ab2d7cd011db47%7C1%7
> C0%7C637038418703112019sdata=qd7DGFCJZ%2BDTix0VGcCe1JucV
> O97E0gILpVpcxlA6EE%3Dreserved=0
> 
> I request this patch should go through Sasha's tree rather than the
> net-next tree.
> 
>  drivers/net/hyperv/hyperv_net.h |  3 +++
>  drivers/net/hyperv/netvsc_drv.c | 59
> +
>  2 files changed, 62 insertions(+)
> 
> diff --git a/drivers/net/hyperv/hyperv_net.h
> b/drivers/net/hyperv/hyperv_net.h
> index ecc9af0..b8763ee 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -952,6 +952,9 @@ struct net_device_context {
>   u32 vf_alloc;
>   /* Serial number of the VF to team with */
>   u32 vf_serial;
> +
> + /* Used to temporarily save the config info across hibernation */
> + struct netvsc_device_info *saved_netvsc_dev_info;
>  };
> 
>  /* Per channel data */
> diff --git a/drivers/net/hyperv/netvsc_drv.c
> b/drivers/net/hyperv/netvsc_drv.c
> index afdcc56..f920959 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -2392,6 +2392,63 @@ static int netvsc_remove(struct hv_device *dev)
>   return 0;
>  }
> 
> +static int netvsc_suspend(struct hv_device *dev)
> +{
> + struct net_device_context *ndev_ctx;
> + struct net_device *vf_netdev, *net;
> + struct netvsc_device *nvdev;
> + int ret;
> +
> + net = hv_get_drvdata(dev);
> +
> + ndev_ctx = netdev_priv(net);
> + cancel_delayed_work_sync(&ndev_ctx->dwork);
> +
> + rtnl_lock();
> +
> + nvdev = rtnl_dereference(ndev_ctx->nvdev);
> + if (nvdev == NULL) {
> + ret = -ENODEV;
> + goto out;
> + }
> +
> + cancel_work_sync(&nvdev->subchan_work);

This looks redundant because netvsc_detach() cancels subchan_work.

Thanks,
- Haiyang



RE: linux-next: Signed-off-by missing for commit in the pci tree

2019-09-10 Thread Haiyang Zhang



> -Original Message-
> From: Lorenzo Pieralisi 
> Sent: Tuesday, September 10, 2019 7:18 AM
> To: Haiyang Zhang 
> Cc: Stephen Rothwell ; Bjorn Helgaas
> ; Linux Next Mailing List  n...@vger.kernel.org>; Linux Kernel Mailing List  ker...@vger.kernel.org>; Sasha Levin 
> Subject: Re: linux-next: Signed-off-by missing for commit in the pci tree
> 
> On Mon, Sep 09, 2019 at 11:10:06PM +, Haiyang Zhang wrote:
> >
> >
> > > -Original Message-
> > > From: Lorenzo Pieralisi 
> > > Sent: Wednesday, August 21, 2019 5:03 AM
> > > To: Stephen Rothwell 
> > > Cc: Bjorn Helgaas ; Linux Next Mailing List
> > > ; Linux Kernel Mailing List  > > ker...@vger.kernel.org>; Sasha Levin ; Haiyang
> > > Zhang 
> > > Subject: Re: linux-next: Signed-off-by missing for commit in the pci
> > > tree
> > >
> > > On Wed, Aug 21, 2019 at 07:19:39AM +1000, Stephen Rothwell wrote:
> > > > Hi all,
> > > >
> > > > Commit
> > > >
> > > >   c4a29fbba415 ("PCI: hv: Use bytes 4 and 5 from instance ID as
> > > > the PCI
> > > domain numbers")
> > > >
> > > > is missing a Signed-off-by from its committer.
> > > >
> > > > Also, all the tags should be kept together, please.
> > >
> > > Fixed it up in my pci/hv branch, apologies.
> > >
> > > Lorenzo
> >
> > Hi thanks for fixing this, but I found the following commit now has
> > the Subject line and commit message same as a previous patch.
> >
> > https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.
> > kernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Fnext%2Flinux-
> next.git%
> >
> 2Fcommit%2Fdrivers%3Fid%3D87b20a08dcc265959f5c59f18603ea0487fe60
> 9b
> > ;data=02%7C01%7Chaiyangz%40microsoft.com%7Cb4e2f03b09da4454342
> 108d735e
> >
> 081d0%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C6370371106
> 95959451&
> >
> amp;sdata=17KZvmvtuHz3f0xKh6MwSOja41VerGTpyAO0f7BF6gY%3Dr
> eserved=
> > 0
> >
> > The correct message for the patch should be the msg below. Any
> > possibility to fix it again?  Thanks.
> 
> I updated my pci/hv branch and rewrote the commit log, I will ask Bjorn to
> update it so that it shows up in -next.
> 
> Lorenzo


Thank you!


RE: linux-next: Signed-off-by missing for commit in the pci tree

2019-09-09 Thread Haiyang Zhang



> -Original Message-
> From: Lorenzo Pieralisi 
> Sent: Wednesday, August 21, 2019 5:03 AM
> To: Stephen Rothwell 
> Cc: Bjorn Helgaas ; Linux Next Mailing List  n...@vger.kernel.org>; Linux Kernel Mailing List  ker...@vger.kernel.org>; Sasha Levin ; Haiyang Zhang
> 
> Subject: Re: linux-next: Signed-off-by missing for commit in the pci tree
> 
> On Wed, Aug 21, 2019 at 07:19:39AM +1000, Stephen Rothwell wrote:
> > Hi all,
> >
> > Commit
> >
> >   c4a29fbba415 ("PCI: hv: Use bytes 4 and 5 from instance ID as the PCI
> domain numbers")
> >
> > is missing a Signed-off-by from its committer.
> >
> > Also, all the tags should be kept together, please.
> 
> Fixed it up in my pci/hv branch, apologies.
> 
> Lorenzo

Hi thanks for fixing this, but I found the following commit now has the Subject 
line
and commit message same as a previous patch.

https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/drivers?id=87b20a08dcc265959f5c59f18603ea0487fe609b

The correct message for the patch should be the msg below. Any possibility to 
fix it again?  Thanks.
- Haiyang

Subject:  PCI: hv: Use bytes 4 and 5 from instance ID as the PCI domain numbers

As recommended by Azure host team, the bytes 4, 5 have more uniqueness
(info entropy) than bytes 8, 9. So now we use bytes 4, 5 as the PCI domain
numbers. On older hosts, bytes 4, 5 can also be used -- no backward
compatibility issues here. The chance of collision is greatly reduced.

In the rare cases of collision, the driver code detects and finds another
number that is not in use.

Suggested-by: Michael Kelley 
Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
Signed-off-by: Lorenzo Pieralisi 



[PATCH net-next,v2, 1/2] hv_netvsc: Allow scatter-gather feature to be tunable

2019-09-05 Thread Haiyang Zhang
In a previous patch, the NETIF_F_SG was missing after the code changes.
That caused the SG feature to be "fixed". This patch includes it into
hw_features, so it is tunable again.

Fixes: 23312a3be999 ("netvsc: negotiate checksum and segmentation parameters")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h   | 2 +-
 drivers/net/hyperv/netvsc_drv.c   | 4 ++--
 drivers/net/hyperv/rndis_filter.c | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index ecc9af0..670ef68 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -822,7 +822,7 @@ struct nvsp_message {
 
 #define NETVSC_SUPPORTED_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | \
  NETIF_F_TSO | NETIF_F_IPV6_CSUM | \
- NETIF_F_TSO6 | NETIF_F_LRO)
+ NETIF_F_TSO6 | NETIF_F_LRO | NETIF_F_SG)
 
 #define VRSS_SEND_TAB_SIZE 16  /* must be power of 2 */
 #define VRSS_CHANNEL_MAX 64
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 0a6cd2f..1f1192e 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2313,8 +2313,8 @@ static int netvsc_probe(struct hv_device *dev,
 
/* hw_features computed in rndis_netdev_set_hwcaps() */
net->features = net->hw_features |
-   NETIF_F_HIGHDMA | NETIF_F_SG |
-   NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
+   NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX |
+   NETIF_F_HW_VLAN_CTAG_RX;
net->vlan_features = net->features;
 
netdev_lockdep_set_classes(net);
diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 317dbe9..abaf815 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1207,6 +1207,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device 
*rndis_device,
 
/* Compute tx offload settings based on hw capabilities */
net->hw_features |= NETIF_F_RXCSUM;
+   net->hw_features |= NETIF_F_SG;
 
if ((hwcaps.csum.ip4_txcsum & NDIS_TXCSUM_ALL_TCP4) == 
NDIS_TXCSUM_ALL_TCP4) {
/* Can checksum TCP */
-- 
1.8.3.1



[PATCH net-next,v2, 0/2] Enable sg as tunable, sync offload settings to VF NIC

2019-09-05 Thread Haiyang Zhang
This patch set fixes an issue in SG tuning, and sync
offload settings from synthetic NIC to VF NIC.

Haiyang Zhang (2):
  hv_netvsc: Allow scatter-gather feature to be tunable
  hv_netvsc: Sync offloading features to VF NIC

 drivers/net/hyperv/hyperv_net.h   |  2 +-
 drivers/net/hyperv/netvsc_drv.c   | 26 ++
 drivers/net/hyperv/rndis_filter.c |  1 +
 3 files changed, 24 insertions(+), 5 deletions(-)

-- 
1.8.3.1



[PATCH net-next,v2, 2/2] hv_netvsc: Sync offloading features to VF NIC

2019-09-05 Thread Haiyang Zhang
VF NIC may go down then come up during host servicing events. This
causes the VF NIC offloading feature settings to roll back to the
defaults. This patch can synchronize features from synthetic NIC to
the VF NIC during ndo_set_features (ethtool -K),
and netvsc_register_vf when VF comes back after host events.

Signed-off-by: Haiyang Zhang 
Cc: Mark Bloch 
---
 drivers/net/hyperv/netvsc_drv.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 1f1192e..39dddcd 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1785,13 +1785,15 @@ static int netvsc_set_features(struct net_device *ndev,
netdev_features_t change = features ^ ndev->features;
struct net_device_context *ndevctx = netdev_priv(ndev);
struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
+   struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
struct ndis_offload_params offloads;
+   int ret = 0;
 
if (!nvdev || nvdev->destroy)
return -ENODEV;
 
if (!(change & NETIF_F_LRO))
-   return 0;
+   goto syncvf;
 
memset(&offloads, 0, sizeof(struct ndis_offload_params));
 
@@ -1803,7 +1805,19 @@ static int netvsc_set_features(struct net_device *ndev,
offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
}
 
-   return rndis_filter_set_offload_params(ndev, nvdev, &offloads);
+   ret = rndis_filter_set_offload_params(ndev, nvdev, &offloads);
+
+   if (ret)
+   features ^= NETIF_F_LRO;
+
+syncvf:
+   if (!vf_netdev)
+   return ret;
+
+   vf_netdev->wanted_features = features;
+   netdev_update_features(vf_netdev);
+
+   return ret;
 }
 
 static u32 netvsc_get_msglevel(struct net_device *ndev)
@@ -2181,6 +2195,10 @@ static int netvsc_register_vf(struct net_device 
*vf_netdev)
 
dev_hold(vf_netdev);
rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
+
+   vf_netdev->wanted_features = ndev->features;
+   netdev_update_features(vf_netdev);
+
return NOTIFY_OK;
 }
 
-- 
1.8.3.1



RE: [PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF NIC

2019-09-05 Thread Haiyang Zhang



> -Original Message-
> From: Jakub Kicinski 
> Sent: Friday, August 30, 2019 7:05 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; o...@aepfle.de; vkuznets
> ; da...@davemloft.net; linux-
> ker...@vger.kernel.org; Mark Bloch 
> Subject: Re: [PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF
> NIC
> 
> On Fri, 30 Aug 2019 03:45:38 +, Haiyang Zhang wrote:
> > VF NIC may go down then come up during host servicing events. This
> > causes the VF NIC offloading feature settings to roll back to the
> > defaults. This patch can synchronize features from synthetic NIC to
> > the VF NIC during ndo_set_features (ethtool -K), and
> > netvsc_register_vf when VF comes back after host events.
> >
> > Signed-off-by: Haiyang Zhang 
> > Cc: Mark Bloch 
> 
> If we want to make this change in behaviour we should change net_failover
> at the same time.

After checking the net_failover, I found it's for virtio based SRIOV, and very 
different from what we did for Hyper-V based SRIOV.

We let the netvsc driver act as both the synthetic (PV) driver and the
transparent bonding master for the VF NIC. But net_failover acts as a master
device on top of both the virtio PV NIC and the VF NIC. And net_failover
doesn't implement operations like ndo_set_features.
So the code change for our netvsc driver cannot be applied to net_failover 
driver.

I will re-submit my two patches (fixing the extra tab in the 1st one as you 
pointed 
out). Thanks!

- Haiyang



RE: [PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF NIC

2019-08-30 Thread Haiyang Zhang



> -Original Message-
> From: Jakub Kicinski 
> Sent: Friday, August 30, 2019 4:05 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; o...@aepfle.de; vkuznets
> ; da...@davemloft.net; linux-
> ker...@vger.kernel.org; Mark Bloch 
> Subject: Re: [PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF
> NIC
> 
> On Fri, 30 Aug 2019 03:45:38 +, Haiyang Zhang wrote:
> > VF NIC may go down then come up during host servicing events. This
> > causes the VF NIC offloading feature settings to roll back to the
> > defaults. This patch can synchronize features from synthetic NIC to
> > the VF NIC during ndo_set_features (ethtool -K), and
> > netvsc_register_vf when VF comes back after host events.
> >
> > Signed-off-by: Haiyang Zhang 
> > Cc: Mark Bloch 
> 
> If we want to make this change in behaviour we should change net_failover
> at the same time.

I will check net_failover. Thanks.


[PATCH net-next, 1/2] hv_netvsc: Allow scatter-gather feature to be tunable

2019-08-29 Thread Haiyang Zhang
In a previous patch, the NETIF_F_SG was missing after the code changes.
That caused the SG feature to be "fixed". This patch includes it into
hw_features, so it is tunable again.

Fixes:  23312a3be999 ("netvsc: negotiate checksum and segmentation parameters")
Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h   | 2 +-
 drivers/net/hyperv/netvsc_drv.c   | 4 ++--
 drivers/net/hyperv/rndis_filter.c | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index ecc9af0..670ef68 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -822,7 +822,7 @@ struct nvsp_message {
 
 #define NETVSC_SUPPORTED_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | \
  NETIF_F_TSO | NETIF_F_IPV6_CSUM | \
- NETIF_F_TSO6 | NETIF_F_LRO)
+ NETIF_F_TSO6 | NETIF_F_LRO | NETIF_F_SG)
 
 #define VRSS_SEND_TAB_SIZE 16  /* must be power of 2 */
 #define VRSS_CHANNEL_MAX 64
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 0a6cd2f..1f1192e 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2313,8 +2313,8 @@ static int netvsc_probe(struct hv_device *dev,
 
/* hw_features computed in rndis_netdev_set_hwcaps() */
net->features = net->hw_features |
-   NETIF_F_HIGHDMA | NETIF_F_SG |
-   NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
+   NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX |
+   NETIF_F_HW_VLAN_CTAG_RX;
net->vlan_features = net->features;
 
netdev_lockdep_set_classes(net);
diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c
index 317dbe9..abaf815 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1207,6 +1207,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device 
*rndis_device,
 
/* Compute tx offload settings based on hw capabilities */
net->hw_features |= NETIF_F_RXCSUM;
+   net->hw_features |= NETIF_F_SG;
 
if ((hwcaps.csum.ip4_txcsum & NDIS_TXCSUM_ALL_TCP4) == 
NDIS_TXCSUM_ALL_TCP4) {
/* Can checksum TCP */
-- 
1.8.3.1



[PATCH net-next, 0/2] Enable sg as tunable, sync offload settings to VF NIC

2019-08-29 Thread Haiyang Zhang
This patch set fixes an issue in SG tuning, and sync
offload settings from synthetic NIC to VF NIC.

Haiyang Zhang (2):
  hv_netvsc: Allow scatter-gather feature to be tunable
  hv_netvsc: Sync offloading features to VF NIC

 drivers/net/hyperv/hyperv_net.h   |  2 +-
 drivers/net/hyperv/netvsc_drv.c   | 26 ++
 drivers/net/hyperv/rndis_filter.c |  1 +
 3 files changed, 24 insertions(+), 5 deletions(-)

-- 
1.8.3.1



[PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF NIC

2019-08-29 Thread Haiyang Zhang
VF NIC may go down then come up during host servicing events. This
causes the VF NIC offloading feature settings to roll back to the
defaults. This patch can synchronize features from synthetic NIC to
the VF NIC during ndo_set_features (ethtool -K),
and netvsc_register_vf when VF comes back after host events.

Signed-off-by: Haiyang Zhang 
Cc: Mark Bloch 
---
 drivers/net/hyperv/netvsc_drv.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 1f1192e..39dddcd 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1785,13 +1785,15 @@ static int netvsc_set_features(struct net_device *ndev,
netdev_features_t change = features ^ ndev->features;
struct net_device_context *ndevctx = netdev_priv(ndev);
struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
+   struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
struct ndis_offload_params offloads;
+   int ret = 0;
 
if (!nvdev || nvdev->destroy)
return -ENODEV;
 
if (!(change & NETIF_F_LRO))
-   return 0;
+   goto syncvf;
 
memset(&offloads, 0, sizeof(struct ndis_offload_params));
 
@@ -1803,7 +1805,19 @@ static int netvsc_set_features(struct net_device *ndev,
offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
}
 
-   return rndis_filter_set_offload_params(ndev, nvdev, &offloads);
+   ret = rndis_filter_set_offload_params(ndev, nvdev, &offloads);
+
+   if (ret)
+   features ^= NETIF_F_LRO;
+
+syncvf:
+   if (!vf_netdev)
+   return ret;
+
+   vf_netdev->wanted_features = features;
+   netdev_update_features(vf_netdev);
+
+   return ret;
 }
 
 static u32 netvsc_get_msglevel(struct net_device *ndev)
@@ -2181,6 +2195,10 @@ static int netvsc_register_vf(struct net_device 
*vf_netdev)
 
dev_hold(vf_netdev);
rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
+
+   vf_netdev->wanted_features = ndev->features;
+   netdev_update_features(vf_netdev);
+
return NOTIFY_OK;
 }
 
-- 
1.8.3.1



RE: linux-next: Tree for Aug 29 (mlx5)

2019-08-29 Thread Haiyang Zhang


> -Original Message-
> From: Saeed Mahameed 
> Sent: Thursday, August 29, 2019 4:04 PM
> To: s...@canb.auug.org.au; Eran Ben Elisha ; linux-
> n...@vger.kernel.org; rdun...@infradead.org; Haiyang Zhang
> 
> Cc: linux-kernel@vger.kernel.org; net...@vger.kernel.org; Leon
> Romanovsky 
> Subject: Re: linux-next: Tree for Aug 29 (mlx5)
> 
> On Thu, 2019-08-29 at 21:48 +, Haiyang Zhang wrote:
> > > -Original Message-
> > > From: Saeed Mahameed 
> > > Sent: Thursday, August 29, 2019 2:32 PM
> > > To: s...@canb.auug.org.au; Eran Ben Elisha ;
> > > linux-
> > > n...@vger.kernel.org; rdun...@infradead.org; Haiyang Zhang
> > > 
> > > Cc: linux-kernel@vger.kernel.org; net...@vger.kernel.org; Leon
> > > Romanovsky 
> > > Subject: Re: linux-next: Tree for Aug 29 (mlx5)
> > >
> > > On Thu, 2019-08-29 at 12:55 -0700, Randy Dunlap wrote:
> > > > On 8/29/19 12:54 PM, Randy Dunlap wrote:
> > > > > On 8/29/19 4:08 AM, Stephen Rothwell wrote:
> > > > > > Hi all,
> > > > > >
> > > > > > Changes since 20190828:
> > > > > >
> > > > >
> > > > > on x86_64:
> > > > > when CONFIG_PCI_HYPERV=m
> > > >
> > > > and CONFIG_PCI_HYPERV_INTERFACE=m
> > > >
> > >
> > > Haiyang and Eran, I think CONFIG_PCI_HYPERV_INTERFACE was never
> > > supposed to be a module ? it supposed to provide an always available
> > > interface to drivers ..
> > >
> > > Anyway, maybe we need to imply CONFIG_PCI_HYPERV_INTERFACE in
> mlx5.
> >
> > The symbolic dependency by driver mlx5e,  automatically triggers
> > loading of pci_hyperv_interface module. And this module can be loaded
> > in any platforms.
> >
> 
> This only works when both are modules.
> 
> 
> > Currently, mlx5e driver has #if
> > IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
> > around the code using the interface.
> >
> > I agree --
> > Adding "select PCI_HYPERV_INTERFACE" for mlx5e will clean up these
> > #if's.
> >
> 
> No, not "select", "imply".
> 
> if one wants PCI_HYPERV_INTERFACE off, imply will keep it off for you.

This looks good.

- Haiyang


RE: linux-next: Tree for Aug 29 (mlx5)

2019-08-29 Thread Haiyang Zhang


> -Original Message-
> From: Saeed Mahameed 
> Sent: Thursday, August 29, 2019 2:32 PM
> To: s...@canb.auug.org.au; Eran Ben Elisha ; linux-
> n...@vger.kernel.org; rdun...@infradead.org; Haiyang Zhang
> 
> Cc: linux-kernel@vger.kernel.org; net...@vger.kernel.org; Leon
> Romanovsky 
> Subject: Re: linux-next: Tree for Aug 29 (mlx5)
> 
> On Thu, 2019-08-29 at 12:55 -0700, Randy Dunlap wrote:
> > On 8/29/19 12:54 PM, Randy Dunlap wrote:
> > > On 8/29/19 4:08 AM, Stephen Rothwell wrote:
> > > > Hi all,
> > > >
> > > > Changes since 20190828:
> > > >
> > >
> > > on x86_64:
> > > when CONFIG_PCI_HYPERV=m
> >
> > and CONFIG_PCI_HYPERV_INTERFACE=m
> >
> 
> Haiyang and Eran, I think CONFIG_PCI_HYPERV_INTERFACE was never
> supposed to be a module ? it supposed to provide an always available
> interface to drivers ..
> 
> Anyway, maybe we need to imply CONFIG_PCI_HYPERV_INTERFACE in mlx5.

The symbolic dependency by the mlx5e driver automatically triggers loading of
the pci_hyperv_interface module. And this module can be loaded on any platform.

Currently, mlx5e driver has #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
around the code using the interface.

I agree --
Adding "select PCI_HYPERV_INTERFACE" for mlx5e will clean up these #if's.

Thanks,
- Haiyang


RE: [PATCH v3] PCI: hv: Make functions static

2019-08-29 Thread Haiyang Zhang


> -Original Message-
> From: Krzysztof Wilczynski  On Behalf Of Krzysztof
> Wilczynski
> Sent: Thursday, August 29, 2019 2:17 AM
> To: Bjorn Helgaas 
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Sasha Levin ; Lorenzo
> Pieralisi ; linux-...@vger.kernel.org; linux-
> ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> Subject: [PATCH v3] PCI: hv: Make functions static
> 
> Functions hv_read_config_block(), hv_write_config_block() and
> hv_register_block_invalidate() are not used anywhere else and are local to
> drivers/pci/controller/pci-hyperv.c,
> and do not need to be in global scope, so make these static.
> 
> Resolve following compiler warning that can be seen when building with
> warnings enabled (W=1):
> 
> drivers/pci/controller/pci-hyperv.c:933:5: warning:
>  no previous prototype for ‘hv_read_config_block’
>   [-Wmissing-prototypes]
> 
> drivers/pci/controller/pci-hyperv.c:1013:5: warning:
>  no previous prototype for ‘hv_write_config_block’
>   [-Wmissing-prototypes]
> 
> drivers/pci/controller/pci-hyperv.c:1082:5: warning:
>  no previous prototype for ‘hv_register_block_invalidate’
>   [-Wmissing-prototypes]
> 
> Signed-off-by: Krzysztof Wilczynski 

Reviewed-by: Haiyang Zhang 

Thanks!


RE: [PATCH v2] PCI: hv: Make functions only used locally static in pci-hyperv.c

2019-08-28 Thread Haiyang Zhang


> -Original Message-
> From: Krzysztof Wilczynski  On Behalf Of Krzysztof
> Wilczynski
> Sent: Wednesday, August 28, 2019 3:19 PM
> To: Bjorn Helgaas 
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Sasha Levin ; Lorenzo
> Pieralisi ; linux-...@vger.kernel.org; linux-
> ker...@vger.kernel.org; linux-hyp...@vger.kernel.org
> Subject: [PATCH v2] PCI: hv: Make functions only used locally static in pci-
> hyperv.c
> 
> Functions hv_read_config_block(), hv_write_config_block()
> and hv_register_block_invalidate() are not used anywhere
> else and are local to drivers/pci/controller/pci-hyperv.c,
> and do not need to be in global scope, so make these static.
> 
> Resolve following compiler warning that can be seen when
> building with warnings enabled (W=1):
> 
> drivers/pci/controller/pci-hyperv.c:933:5: warning: no previous prototype for
> ‘hv_read_config_block’ [-Wmissing-prototypes]
>  int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
>  ^
> drivers/pci/controller/pci-hyperv.c:1013:5: warning: no previous prototype
> for ‘hv_write_config_block’ [-Wmissing-prototypes]
>  int hv_write_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
>  ^
> drivers/pci/controller/pci-hyperv.c:1082:5: warning: no previous prototype
> for ‘hv_register_block_invalidate’ [-Wmissing-prototypes]
>  int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
>  ^
> Signed-off-by: Krzysztof Wilczynski 
> ---
> Changes in v2:
>   Update commit message to include compiler warning.
> 
>  drivers/pci/controller/pci-hyperv.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-
> hyperv.c
> index f1f300218fab..c9642e429c2d 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -930,7 +930,7 @@ static void hv_pci_read_config_compl(void *context,
> struct pci_response *resp,
>   *
>   * Return: 0 on success, -errno on failure
>   */
> -int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
> +static int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned 
> int len,
>unsigned int block_id, unsigned int *bytes_returned)

The second line should be aligned next to the "(" on the first line.
Also the first line is now over 80 chars.

Thanks,
- Haiyang



RE: [PATCH net-next,v5, 0/6] Add software backchannel and mlx5e HV VHCA stats

2019-08-22 Thread Haiyang Zhang



> -Original Message-
> From: David Miller 
> Sent: Thursday, August 22, 2019 3:39 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; sae...@mellanox.com; l...@kernel.org;
> era...@mellanox.com; lorenzo.pieral...@arm.com; bhelg...@google.com;
> linux-...@vger.kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next,v5, 0/6] Add software backchannel and mlx5e
> HV VHCA stats
> 
> From: Haiyang Zhang 
> Date: Thu, 22 Aug 2019 22:37:13 +
> 
> > The v5 is pretty much the same as v4, except Eran had a fix to patch #3 in
> response to
> > Leon Romanovsky .
> 
> Well you now have to send me a patch relative to v4 in order to fix that.
> 
> When I say "applied", the series is in my tree and is therefore permanent.
> It is therefore never appropriate to then post a new version of the series.

Thanks.

Eran, could you submit another patch for the fix to patch #3? 

- Haiyang



RE: [PATCH net-next,v5, 0/6] Add software backchannel and mlx5e HV VHCA stats

2019-08-22 Thread Haiyang Zhang



> -Original Message-
> From: David Miller 
> Sent: Thursday, August 22, 2019 3:33 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; sae...@mellanox.com; l...@kernel.org;
> era...@mellanox.com; lorenzo.pieral...@arm.com; bhelg...@google.com;
> linux-...@vger.kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next,v5, 0/6] Add software backchannel and mlx5e
> HV VHCA stats
> 
> 
> I applied this patch series already to net-next, what are you doing?

The v5 is pretty much the same as v4, except Eran had a fix to patch #3 in 
response to
Leon Romanovsky .

Thanks,
- Haiyang



[PATCH net-next,v5, 6/6] net/mlx5e: Add mlx5e HV VHCA stats agent

2019-08-22 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA stats agent is responsible for running a periodic rx/tx
packets/bytes stats update. Currently the supported format is version
MLX5_HV_VHCA_STATS_VERSION. Block ID 1 is dedicated for statistics data
transfer from the VF to the PF.

The reporter fetches the statistics data from all opened channels, fills it
in a buffer and sends it to mlx5_hv_vhca_write_agent.

As the stats layer should include some metadata per block (sequence and
offset), the HV VHCA layer shall modify the buffer before actually sending it
over block 1.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 ++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 6 files changed, 205 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8d443fc..f4de9cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -36,6 +36,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o 
en/port_buffer.o
 mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o 
lib/port_tun.o lag_mp.o \
lib/geneve.o en/tc_tun_vxlan.o 
en/tc_tun_gre.o \
en/tc_tun_geneve.o 
diag/en_tc_tracepoint.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 7316571..4467927 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -54,6 +54,7 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 #include "en/fs.h"
+#include "lib/hv_vhca.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
 struct page_pool;
@@ -782,6 +783,15 @@ struct mlx5e_modify_sq_param {
int rl_index;
 };
 
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+struct mlx5e_hv_vhca_stats_agent {
+   struct mlx5_hv_vhca_agent *agent;
+   struct delayed_workwork;
+   u16delay;
+   void  *buf;
+};
+#endif
+
 struct mlx5e_xsk {
/* UMEMs are stored separately from channels, because we don't want to
 * lose them when channels are recreated. The kernel also stores UMEMs,
@@ -853,6 +863,9 @@ struct mlx5e_priv {
struct devlink_health_reporter *tx_reporter;
struct devlink_health_reporter *rx_reporter;
struct mlx5e_xsk   xsk;
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+   struct mlx5e_hv_vhca_stats_agent stats_agent;
+#endif
 };
 
 struct mlx5e_profile {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
new file mode 100644
index 000..c37b4ac
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include "en.h"
+#include "en/hv_vhca_stats.h"
+#include "lib/hv_vhca.h"
+#include "lib/hv.h"
+
+struct mlx5e_hv_vhca_per_ring_stats {
+   u64 rx_packets;
+   u64 rx_bytes;
+   u64 tx_packets;
+   u64 tx_bytes;
+};
+
+static void
+mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch,
+ struct mlx5e_hv_vhca_per_ring_stats *data)
+{
+   struct mlx5e_channel_stats *stats;
+   int tc;
+
+   stats = &priv->channel_stats[ch];
+   data->rx_packets = stats->rq.packets;
+   data->rx_bytes   = stats->rq.bytes;
+
+   for (tc = 0; tc < priv->max_opened_tc; tc++) {
+   data->tx_packets += stats->sq[tc].packets;
+   data->tx_bytes   += stats->sq[tc].bytes;
+   }
+}
+
+static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, u64 *data,
+int buf_len)
+{
+   int ch, i = 0;
+
+   for (ch = 0; ch < priv->max_nch; ch++) {
+   u64 *buf = data + i;
+
+   if (WARN_ON_ONCE(buf +
+sizeof(struct mlx5e_hv_vhca_per_ring_stats) >
+data + buf_len))
+   return;
+
+   mlx5e_hv_vhca_fill_ring_stats(priv, ch,
+

[PATCH net-next,v5, 4/6] net/mlx5: Add HV VHCA infrastructure

2019-08-22 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA is a layer which provides PF to VF communication channel based on
HyperV PCI config channel. It implements Mellanox's Inter VHCA control
communication protocol. The protocol contains control block in order to
pass messages between the PF and VF drivers, and data blocks in order to
pass actual data.

The infrastructure is agent based. Each agent will be responsible of
contiguous buffer blocks in the VHCA config space. This infrastructure will
bind agents to their blocks, and those agents can only access read/write
the buffer blocks assigned to them. Each agent will provide three
callbacks (control, invalidate, cleanup). Control will be invoked when
block-0 is invalidated with a command that concerns this agent. Invalidate
callback will be invoked if one of the blocks assigned to this agent was
invalidated. Cleanup will be invoked before the agent is being freed in
order to clean all of its open resources or deferred works.

Block-0 serves as the control block. All execution commands from the PF
will be written by the PF over this block. VF will ack on those by
writing on block-0 as well. Its format is described by struct
mlx5_hv_vhca_control_block layout.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 253 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 include/linux/mlx5/driver.h|   2 +
 5 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index fd32a5b..8d443fc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
-mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
new file mode 100644
index 000..84d1d75
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include 
+#include "mlx5_core.h"
+#include "lib/hv.h"
+#include "lib/hv_vhca.h"
+
+struct mlx5_hv_vhca {
+   struct mlx5_core_dev   *dev;
+   struct workqueue_struct*work_queue;
+   struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
+   struct mutexagents_lock; /* Protect agents array */
+};
+
+struct mlx5_hv_vhca_work {
+   struct work_struct invalidate_work;
+   struct mlx5_hv_vhca   *hv_vhca;
+   u64block_mask;
+};
+
+struct mlx5_hv_vhca_data_block {
+   u16 sequence;
+   u16 offset;
+   u8  reserved[4];
+   u64 data[15];
+};
+
+struct mlx5_hv_vhca_agent {
+   enum mlx5_hv_vhca_agent_type type;
+   struct mlx5_hv_vhca *hv_vhca;
+   void*priv;
+   u16  seq;
+   void (*control)(struct mlx5_hv_vhca_agent *agent,
+   struct mlx5_hv_vhca_control_block *block);
+   void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
+  u64 block_mask);
+   void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
+};
+
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+   struct mlx5_hv_vhca *hv_vhca = NULL;
+
+   hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL);
+   if (!hv_vhca)
+   return ERR_PTR(-ENOMEM);
+
+   hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");
+   if (!hv_vhca->work_queue) {
+   kfree(hv_vhca);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   hv_vhca->dev = dev;
+   mutex_init(&hv_vhca->agents_lock);
+
+   return hv_vhca;
+}
+
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+   if (IS_ERR_OR_NULL(hv_vhca))
+   return;
+
+   destroy_workqueue(hv_vhca->work_queue);
+   kfree(hv_vhca);
+}
+
+static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
+{
+   struct mlx5_hv_vhca_work *hwork;
+   struct mlx5_hv_vhca *hv_vhca;
+   int i;
+
+   hwork = conta

[PATCH net-next,v5, 5/6] net/mlx5: Add HV VHCA control agent

2019-08-22 Thread Haiyang Zhang
From: Eran Ben Elisha 

The control agent is responsible for the control block (ID 0). It should
update the PF via this block about every capability change. In addition,
upon block 0 invalidate, it should activate all other supported agents
with data requests from the PF.

Upon agent create/destroy, the invalidate callback of the control agent
is being called in order to update the PF driver about this change.

The control agent is an integral part of HV VHCA and will be created
and destroy as part of the HV VHCA init/cleanup flow.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 122 -
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index 84d1d75..4047629 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -109,22 +109,131 @@ void mlx5_hv_vhca_invalidate(void *context, u64 
block_mask)
queue_work(hv_vhca->work_queue, &work->invalidate_work);
 }
 
+#define AGENT_MASK(type) (type ? BIT(type - 1) : 0 /* control */)
+
+static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca,
+   struct mlx5_hv_vhca_control_block 
*block)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (!agent || !agent->control)
+   continue;
+
+   if (!(AGENT_MASK(agent->type) & block->control))
+   continue;
+
+   agent->control(agent, block);
+   }
+}
+
+static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca,
+ u32 *capabilities)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (agent)
+   *capabilities |= AGENT_MASK(agent->type);
+   }
+}
+
+static void
+mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent,
+ u64 block_mask)
+{
+   struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+   struct mlx5_core_dev *dev = hv_vhca->dev;
+   struct mlx5_hv_vhca_control_block *block;
+   u32 capabilities = 0;
+   int err;
+
+   block = kzalloc(sizeof(*block), GFP_KERNEL);
+   if (!block)
+   return;
+
+   err = mlx5_hv_read_config(dev, block, sizeof(*block), 0);
+   if (err)
+   goto free_block;
+
+   mlx5_hv_vhca_capabilities(hv_vhca, &capabilities);
+
+   /* In case no capabilities, send empty block in return */
+   if (!capabilities) {
+   memset(block, 0, sizeof(*block));
+   goto write;
+   }
+
+   if (block->capabilities != capabilities)
+   block->capabilities = capabilities;
+
+   if (block->control & ~capabilities)
+   goto free_block;
+
+   mlx5_hv_vhca_agents_control(hv_vhca, block);
+   block->command_ack = block->command;
+
+write:
+   mlx5_hv_write_config(dev, block, sizeof(*block), 0);
+
+free_block:
+   kfree(block);
+}
+
+static struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
+{
+   return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
+NULL,
+mlx5_hv_vhca_control_agent_invalidate,
+NULL, NULL);
+}
+
+static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent 
*agent)
+{
+   mlx5_hv_vhca_agent_destroy(agent);
+}
+
 int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
+   int err;
+
if (IS_ERR_OR_NULL(hv_vhca))
return IS_ERR_OR_NULL(hv_vhca);
 
-   return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
-  mlx5_hv_vhca_invalidate);
+   err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+ mlx5_hv_vhca_invalidate);
+   if (err)
+   return err;
+
+   agent = mlx5_hv_vhca_control_agent_create(hv_vhca);
+   if (IS_ERR_OR_NULL(agent)) {
+   mlx5_hv_unregister_invalidate(hv_vhca->dev);
+   return IS_ERR_OR_NULL(agent);
+   }
+
+   hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent;
+
+   return 0;
 }
 
 void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
int i;
 
if (IS_ERR_O

[PATCH net-next,v5, 3/6] net/mlx5: Add wrappers for HyperV PCIe operations

2019-08-22 Thread Haiyang Zhang
From: Eran Ben Elisha 

Add wrapper functions for HyperV PCIe read / write /
block_invalidate_register operations.  This will be used as an
infrastructure in the downstream patch for software communication.

This will be enabled by default if CONFIG_PCI_HYPERV_INTERFACE is set.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 
 3 files changed, 87 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index bcf3655..fd32a5b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,6 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
new file mode 100644
index 000..fb19008
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include 
+#include "mlx5_core.h"
+#include "lib/hv.h"
+
+static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void *buf, int len,
+int offset, bool read)
+{
+   int rc = -EOPNOTSUPP;
+   int bytes_returned;
+   int block_id;
+
+   if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len % HV_CONFIG_BLOCK_SIZE_MAX)
+   return -EINVAL;
+
+   block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX;
+
+   rc = read ?
+hyperv_read_cfg_blk(dev->pdev, buf,
+HV_CONFIG_BLOCK_SIZE_MAX, block_id,
+&bytes_returned) :
+hyperv_write_cfg_blk(dev->pdev, buf,
+ HV_CONFIG_BLOCK_SIZE_MAX, block_id);
+
+   /* Make sure len bytes were read successfully  */
+   if (read && !rc && len != bytes_returned)
+   rc = -EIO;
+
+   if (rc) {
+   mlx5_core_err(dev, "Failed to %s hv config, err = %d, len = %d, 
offset = %d\n",
+ read ? "read" : "write", rc, len,
+ offset);
+   return rc;
+   }
+
+   return 0;
+}
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, true);
+}
+
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, false);
+}
+
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   return hyperv_reg_block_invalidate(dev->pdev, context,
+  block_invalidate);
+}
+
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev)
+{
+   hyperv_reg_block_invalidate(dev->pdev, NULL, NULL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
new file mode 100644
index 000..f9a4557
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_H__
+#define __LIB_HV_H__
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+#include 
+#include 
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset);
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset);
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask));
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev);
+#endif
+
+#endif /* __LIB_HV_H__ */
-- 
1.8.3.1



[PATCH net-next,v5, 2/6] PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface

2019-08-22 Thread Haiyang Zhang
This interface driver is a helper driver that allows other drivers to
have a common interface with the Hyper-V PCI frontend driver.

Signed-off-by: Haiyang Zhang 
Signed-off-by: Saeed Mahameed 
---
 MAINTAINERS  |  1 +
 drivers/pci/Kconfig  |  1 +
 drivers/pci/controller/Kconfig   |  7 
 drivers/pci/controller/Makefile  |  1 +
 drivers/pci/controller/pci-hyperv-intf.c | 67 
 drivers/pci/controller/pci-hyperv.c  | 12 --
 include/linux/hyperv.h   | 30 ++
 7 files changed, 108 insertions(+), 11 deletions(-)
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a406947..9860853 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7469,6 +7469,7 @@ F:drivers/hid/hid-hyperv.c
 F: drivers/hv/
 F: drivers/input/serio/hyperv-keyboard.c
 F: drivers/pci/controller/pci-hyperv.c
+F: drivers/pci/controller/pci-hyperv-intf.c
 F: drivers/net/hyperv/
 F: drivers/scsi/storvsc_drv.c
 F: drivers/uio/uio_hv_generic.c
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2ab9240..c313de9 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -182,6 +182,7 @@ config PCI_LABEL
 config PCI_HYPERV
 tristate "Hyper-V PCI Frontend"
 depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   select PCI_HYPERV_INTERFACE
 help
   The PCI device frontend driver allows the kernel to import arbitrary
   PCI devices from a PCI backend to support PCI driver domains.
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index fe9f9f1..70e0782 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -281,5 +281,12 @@ config VMD
  To compile this driver as a module, choose M here: the
  module will be called vmd.
 
+config PCI_HYPERV_INTERFACE
+   tristate "Hyper-V PCI Interface"
+   depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   help
+ The Hyper-V PCI Interface is a helper driver allows other drivers to
+ have a common interface with the Hyper-V PCI frontend driver.
+
 source "drivers/pci/controller/dwc/Kconfig"
 endmenu
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index d56a507..a2a22c9 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
 obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
+obj-$(CONFIG_PCI_HYPERV_INTERFACE) += pci-hyperv-intf.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
 obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
diff --git a/drivers/pci/controller/pci-hyperv-intf.c 
b/drivers/pci/controller/pci-hyperv-intf.c
new file mode 100644
index 000..cc96be4
--- /dev/null
+++ b/drivers/pci/controller/pci-hyperv-intf.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Author:
+ *   Haiyang Zhang 
+ *
+ * This small module is a helper driver allows other drivers to
+ * have a common interface with the Hyper-V PCI frontend driver.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+
+struct hyperv_pci_block_ops hvpci_block_ops;
+EXPORT_SYMBOL_GPL(hvpci_block_ops);
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+   unsigned int block_id, unsigned int *bytes_returned)
+{
+   if (!hvpci_block_ops.read_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.read_block(dev, buf, buf_len, block_id,
+ bytes_returned);
+}
+EXPORT_SYMBOL_GPL(hyperv_read_cfg_blk);
+
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+unsigned int block_id)
+{
+   if (!hvpci_block_ops.write_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.write_block(dev, buf, len, block_id);
+}
+EXPORT_SYMBOL_GPL(hyperv_write_cfg_blk);
+
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   if (!hvpci_block_ops.reg_blk_invalidate)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.reg_blk_invalidate(dev, context,
+ block_invalidate);
+}
+EXPORT_SYMBOL_GPL(hyperv_reg_block_invalidate);
+
+static void __exit exit_hv_pci_intf(void)
+{
+}
+
+static int __init init_hv_pci_intf(void)
+{
+   return 0;

[PATCH net-next,v5, 1/6] PCI: hv: Add a paravirtual backchannel in software

2019-08-22 Thread Haiyang Zhang
From: Dexuan Cui 

Windows SR-IOV provides a backchannel mechanism in software for communication
between a VF driver and a PF driver.  These "configuration blocks" are
similar in concept to PCI configuration space, but instead of doing reads and
writes in 32-bit chunks through a very slow path, packets of up to 128 bytes
can be sent or received asynchronously.

Nearly every SR-IOV device contains just such a communications channel in
hardware, so using this one in software is usually optional.  Using the
software channel, however, allows driver implementers to leverage software
tools that fuzz the communications channel looking for vulnerabilities.

The usage model for these packets puts the responsibility for reading or
writing on the VF driver.  The VF driver sends a read or a write packet,
indicating which "block" is being referred to by number.

If the PF driver wishes to initiate communication, it can "invalidate" one or
more of the first 64 blocks.  This invalidation is delivered via a callback
supplied by the VF driver by this driver.

No protocol is implied, except that supplied by the PF and VF drivers.

Signed-off-by: Jake Oshins 
Signed-off-by: Dexuan Cui 
Cc: Haiyang Zhang 
Cc: K. Y. Srinivasan 
Cc: Stephen Hemminger 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/pci/controller/pci-hyperv.c | 302 
 include/linux/hyperv.h  |  15 ++
 2 files changed, 317 insertions(+)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..57adeca 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -365,6 +365,39 @@ struct pci_delete_interrupt {
struct tran_int_desc int_desc;
 } __packed;
 
+/*
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
+ */
+struct pci_read_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 bytes_requested;
+} __packed;
+
+struct pci_read_block_response {
+   struct vmpacket_descriptor hdr;
+   u32 status;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+/*
+ * Note: the VM must pass a valid block id, wslot and byte_count.
+ */
+struct pci_write_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 byte_count;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+struct pci_dev_inval_block {
+   struct pci_incoming_message incoming;
+   union win_slot_encoding wslot;
+   u64 block_mask;
+} __packed;
+
 struct pci_dev_incoming {
struct pci_incoming_message incoming;
union win_slot_encoding wslot;
@@ -499,6 +532,9 @@ struct hv_pci_dev {
struct hv_pcibus_device *hbus;
struct work_struct wrk;
 
+   void (*block_invalidate)(void *context, u64 block_mask);
+   void *invalidate_context;
+
/*
 * What would be observed if one wrote 0x to a BAR and then
 * read it back, for each of the BAR offsets within config space.
@@ -817,6 +853,256 @@ static int hv_pcifront_write_config(struct pci_bus *bus, 
unsigned int devfn,
.write = hv_pcifront_write_config,
 };
 
+/*
+ * Paravirtual backchannel
+ *
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
+ * communication between a VF driver and a PF driver.  These
+ * "configuration blocks" are similar in concept to PCI configuration space,
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
+ *
+ * Nearly every SR-IOV device contains just such a communications channel in
+ * hardware, so using this one in software is usually optional.  Using the
+ * software channel, however, allows driver implementers to leverage software
+ * tools that fuzz the communications channel looking for vulnerabilities.
+ *
+ * The usage model for these packets puts the responsibility for reading or
+ * writing on the VF driver.  The VF driver sends a read or a write packet,
+ * indicating which "block" is being referred to by number.
+ *
+ * If the PF driver wishes to initiate communication, it can "invalidate" one 
or
+ * more of the first 64 blocks.  This invalidation is delivered via a callback
+ * supplied by the VF driver by this driver.
+ *
+ * No protocol is implied, except that supplied by the PF and VF drivers.
+ */
+
+struct hv_read_config_compl {
+   struct hv_pci_compl comp_pkt;
+   void *buf;
+   unsigned int len;
+   unsigned int bytes_returned;
+};
+
+/**
+ * hv_pci_read_config_compl() - Invoked when a response packet
+ * for a read config block operation arrives.
+ * @context:   Identifies the read config operation
+ * @resp:  The response packet itself
+ * @resp_packet_size:  Size in bytes of the res

[PATCH net-next,v5, 0/6] Add software backchannel and mlx5e HV VHCA stats

2019-08-22 Thread Haiyang Zhang
This patch set adds paravirtual backchannel in software in pci_hyperv,
which is required by the mlx5e driver HV VHCA stats agent.

The stats agent is responsible on running a periodic rx/tx packets/bytes
stats update.

Dexuan Cui (1):
  PCI: hv: Add a paravirtual backchannel in software

Eran Ben Elisha (4):
  net/mlx5: Add wrappers for HyperV PCIe operations
  net/mlx5: Add HV VHCA infrastructure
  net/mlx5: Add HV VHCA control agent
  net/mlx5e: Add mlx5e HV VHCA stats agent

Haiyang Zhang (1):
  PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface

 MAINTAINERS|   1 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c   |  64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h   |  22 ++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 371 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 104 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 drivers/pci/Kconfig|   1 +
 drivers/pci/controller/Kconfig |   7 +
 drivers/pci/controller/Makefile|   1 +
 drivers/pci/controller/pci-hyperv-intf.c   |  67 
 drivers/pci/controller/pci-hyperv.c| 308 +
 include/linux/hyperv.h |  29 ++
 include/linux/mlx5/driver.h|   2 +
 18 files changed, 1189 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

-- 
1.8.3.1



[PATCH net-next,v4, 4/6] net/mlx5: Add HV VHCA infrastructure

2019-08-21 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA is a layer which provides PF to VF communication channel based on
HyperV PCI config channel. It implements Mellanox's Inter VHCA control
communication protocol. The protocol contains control block in order to
pass messages between the PF and VF drivers, and data blocks in order to
pass actual data.

The infrastructure is agent based. Each agent will be responsible of
contiguous buffer blocks in the VHCA config space. This infrastructure will
bind agents to their blocks, and those agents can only access read/write
the buffer blocks assigned to them. Each agent will provide three
callbacks (control, invalidate, cleanup). Control will be invoked when
block-0 is invalidated with a command that concerns this agent. Invalidate
callback will be invoked if one of the blocks assigned to this agent was
invalidated. Cleanup will be invoked before the agent is being freed in
order to clean all of its open resources or deferred works.

Block-0 serves as the control block. All execution commands from the PF
will be written by the PF over this block. VF will ack on those by
writing on block-0 as well. Its format is described by struct
mlx5_hv_vhca_control_block layout.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 253 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 include/linux/mlx5/driver.h|   2 +
 5 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index fd32a5b..8d443fc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
-mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
new file mode 100644
index 000..84d1d75
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include 
+#include "mlx5_core.h"
+#include "lib/hv.h"
+#include "lib/hv_vhca.h"
+
+struct mlx5_hv_vhca {
+   struct mlx5_core_dev   *dev;
+   struct workqueue_struct*work_queue;
+   struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
+   struct mutexagents_lock; /* Protect agents array */
+};
+
+struct mlx5_hv_vhca_work {
+   struct work_struct invalidate_work;
+   struct mlx5_hv_vhca   *hv_vhca;
+   u64block_mask;
+};
+
+struct mlx5_hv_vhca_data_block {
+   u16 sequence;
+   u16 offset;
+   u8  reserved[4];
+   u64 data[15];
+};
+
+struct mlx5_hv_vhca_agent {
+   enum mlx5_hv_vhca_agent_type type;
+   struct mlx5_hv_vhca *hv_vhca;
+   void*priv;
+   u16  seq;
+   void (*control)(struct mlx5_hv_vhca_agent *agent,
+   struct mlx5_hv_vhca_control_block *block);
+   void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
+  u64 block_mask);
+   void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
+};
+
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+   struct mlx5_hv_vhca *hv_vhca = NULL;
+
+   hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL);
+   if (!hv_vhca)
+   return ERR_PTR(-ENOMEM);
+
+   hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");
+   if (!hv_vhca->work_queue) {
+   kfree(hv_vhca);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   hv_vhca->dev = dev;
+   mutex_init(&hv_vhca->agents_lock);
+
+   return hv_vhca;
+}
+
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+   if (IS_ERR_OR_NULL(hv_vhca))
+   return;
+
+   destroy_workqueue(hv_vhca->work_queue);
+   kfree(hv_vhca);
+}
+
+static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
+{
+   struct mlx5_hv_vhca_work *hwork;
+   struct mlx5_hv_vhca *hv_vhca;
+   int i;
+
+   hwork = conta

[PATCH net-next,v4, 5/6] net/mlx5: Add HV VHCA control agent

2019-08-21 Thread Haiyang Zhang
From: Eran Ben Elisha 

The control agent is responsible for the control block (ID 0). It should
update the PF via this block about every capability change. In addition,
upon block 0 invalidate, it should activate all other supported agents
with data requests from the PF.

Upon agent create/destroy, the invalidate callback of the control agent
is being called in order to update the PF driver about this change.

The control agent is an integral part of HV VHCA and will be created
and destroy as part of the HV VHCA init/cleanup flow.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 122 -
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index 84d1d75..4047629 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -109,22 +109,131 @@ void mlx5_hv_vhca_invalidate(void *context, u64 
block_mask)
queue_work(hv_vhca->work_queue, &hwork->invalidate_work);
 }
 
+#define AGENT_MASK(type) (type ? BIT(type - 1) : 0 /* control */)
+
+static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca,
+   struct mlx5_hv_vhca_control_block 
*block)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (!agent || !agent->control)
+   continue;
+
+   if (!(AGENT_MASK(agent->type) & block->control))
+   continue;
+
+   agent->control(agent, block);
+   }
+}
+
+static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca,
+ u32 *capabilities)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (agent)
+   *capabilities |= AGENT_MASK(agent->type);
+   }
+}
+
+static void
+mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent,
+ u64 block_mask)
+{
+   struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+   struct mlx5_core_dev *dev = hv_vhca->dev;
+   struct mlx5_hv_vhca_control_block *block;
+   u32 capabilities = 0;
+   int err;
+
+   block = kzalloc(sizeof(*block), GFP_KERNEL);
+   if (!block)
+   return;
+
+   err = mlx5_hv_read_config(dev, block, sizeof(*block), 0);
+   if (err)
+   goto free_block;
+
+   mlx5_hv_vhca_capabilities(hv_vhca, &capabilities);
+
+   /* In case no capabilities, send empty block in return */
+   if (!capabilities) {
+   memset(block, 0, sizeof(*block));
+   goto write;
+   }
+
+   if (block->capabilities != capabilities)
+   block->capabilities = capabilities;
+
+   if (block->control & ~capabilities)
+   goto free_block;
+
+   mlx5_hv_vhca_agents_control(hv_vhca, block);
+   block->command_ack = block->command;
+
+write:
+   mlx5_hv_write_config(dev, block, sizeof(*block), 0);
+
+free_block:
+   kfree(block);
+}
+
+static struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
+{
+   return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
+NULL,
+mlx5_hv_vhca_control_agent_invalidate,
+NULL, NULL);
+}
+
+static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent 
*agent)
+{
+   mlx5_hv_vhca_agent_destroy(agent);
+}
+
 int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
+   int err;
+
if (IS_ERR_OR_NULL(hv_vhca))
return IS_ERR_OR_NULL(hv_vhca);
 
-   return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
-  mlx5_hv_vhca_invalidate);
+   err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+ mlx5_hv_vhca_invalidate);
+   if (err)
+   return err;
+
+   agent = mlx5_hv_vhca_control_agent_create(hv_vhca);
+   if (IS_ERR_OR_NULL(agent)) {
+   mlx5_hv_unregister_invalidate(hv_vhca->dev);
+   return IS_ERR_OR_NULL(agent);
+   }
+
+   hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent;
+
+   return 0;
 }
 
 void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
int i;
 
if (IS_ERR_O

[PATCH net-next,v4, 2/6] PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface

2019-08-21 Thread Haiyang Zhang
This interface driver is a helper driver that allows other drivers to
have a common interface with the Hyper-V PCI frontend driver.

Signed-off-by: Haiyang Zhang 
Signed-off-by: Saeed Mahameed 
---
 MAINTAINERS  |  1 +
 drivers/pci/Kconfig  |  1 +
 drivers/pci/controller/Kconfig   |  7 
 drivers/pci/controller/Makefile  |  1 +
 drivers/pci/controller/pci-hyperv-intf.c | 67 
 drivers/pci/controller/pci-hyperv.c  | 12 --
 include/linux/hyperv.h   | 30 ++
 7 files changed, 108 insertions(+), 11 deletions(-)
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a406947..9860853 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7469,6 +7469,7 @@ F:drivers/hid/hid-hyperv.c
 F: drivers/hv/
 F: drivers/input/serio/hyperv-keyboard.c
 F: drivers/pci/controller/pci-hyperv.c
+F: drivers/pci/controller/pci-hyperv-intf.c
 F: drivers/net/hyperv/
 F: drivers/scsi/storvsc_drv.c
 F: drivers/uio/uio_hv_generic.c
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2ab9240..c313de9 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -182,6 +182,7 @@ config PCI_LABEL
 config PCI_HYPERV
 tristate "Hyper-V PCI Frontend"
 depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   select PCI_HYPERV_INTERFACE
 help
   The PCI device frontend driver allows the kernel to import arbitrary
   PCI devices from a PCI backend to support PCI driver domains.
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index fe9f9f1..70e0782 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -281,5 +281,12 @@ config VMD
  To compile this driver as a module, choose M here: the
  module will be called vmd.
 
+config PCI_HYPERV_INTERFACE
+   tristate "Hyper-V PCI Interface"
+   depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   help
+ The Hyper-V PCI Interface is a helper driver allows other drivers to
+ have a common interface with the Hyper-V PCI frontend driver.
+
 source "drivers/pci/controller/dwc/Kconfig"
 endmenu
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index d56a507..a2a22c9 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
 obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
+obj-$(CONFIG_PCI_HYPERV_INTERFACE) += pci-hyperv-intf.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
 obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
diff --git a/drivers/pci/controller/pci-hyperv-intf.c 
b/drivers/pci/controller/pci-hyperv-intf.c
new file mode 100644
index 000..cc96be4
--- /dev/null
+++ b/drivers/pci/controller/pci-hyperv-intf.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Author:
+ *   Haiyang Zhang 
+ *
+ * This small module is a helper driver allows other drivers to
+ * have a common interface with the Hyper-V PCI frontend driver.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+
+struct hyperv_pci_block_ops hvpci_block_ops;
+EXPORT_SYMBOL_GPL(hvpci_block_ops);
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+   unsigned int block_id, unsigned int *bytes_returned)
+{
+   if (!hvpci_block_ops.read_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.read_block(dev, buf, buf_len, block_id,
+ bytes_returned);
+}
+EXPORT_SYMBOL_GPL(hyperv_read_cfg_blk);
+
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+unsigned int block_id)
+{
+   if (!hvpci_block_ops.write_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.write_block(dev, buf, len, block_id);
+}
+EXPORT_SYMBOL_GPL(hyperv_write_cfg_blk);
+
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   if (!hvpci_block_ops.reg_blk_invalidate)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.reg_blk_invalidate(dev, context,
+ block_invalidate);
+}
+EXPORT_SYMBOL_GPL(hyperv_reg_block_invalidate);
+
+static void __exit exit_hv_pci_intf(void)
+{
+}
+
+static int __init init_hv_pci_intf(void)
+{
+   return 0;

[PATCH net-next,v4, 6/6] net/mlx5e: Add mlx5e HV VHCA stats agent

2019-08-21 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA stats agent is responsible for running a periodic rx/tx
packets/bytes stats update. Currently the supported format is version
MLX5_HV_VHCA_STATS_VERSION. Block ID 1 is dedicated for statistics data
transfer from the VF to the PF.

The reporter fetches the statistics data from all opened channels, fills it
in a buffer and send it to mlx5_hv_vhca_write_agent.

As the stats layer should include some metadata per block (sequence and
offset), the HV VHCA layer shall modify the buffer before actually send it
over block 1.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 ++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 6 files changed, 205 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8d443fc..f4de9cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -36,6 +36,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o 
en/port_buffer.o
 mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o 
lib/port_tun.o lag_mp.o \
lib/geneve.o en/tc_tun_vxlan.o 
en/tc_tun_gre.o \
en/tc_tun_geneve.o 
diag/en_tc_tracepoint.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 7316571..4467927 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -54,6 +54,7 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 #include "en/fs.h"
+#include "lib/hv_vhca.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
 struct page_pool;
@@ -782,6 +783,15 @@ struct mlx5e_modify_sq_param {
int rl_index;
 };
 
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+struct mlx5e_hv_vhca_stats_agent {
+   struct mlx5_hv_vhca_agent *agent;
+   struct delayed_workwork;
+   u16delay;
+   void  *buf;
+};
+#endif
+
 struct mlx5e_xsk {
/* UMEMs are stored separately from channels, because we don't want to
 * lose them when channels are recreated. The kernel also stores UMEMs,
@@ -853,6 +863,9 @@ struct mlx5e_priv {
struct devlink_health_reporter *tx_reporter;
struct devlink_health_reporter *rx_reporter;
struct mlx5e_xsk   xsk;
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+   struct mlx5e_hv_vhca_stats_agent stats_agent;
+#endif
 };
 
 struct mlx5e_profile {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
new file mode 100644
index 000..c37b4ac
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include "en.h"
+#include "en/hv_vhca_stats.h"
+#include "lib/hv_vhca.h"
+#include "lib/hv.h"
+
+struct mlx5e_hv_vhca_per_ring_stats {
+   u64 rx_packets;
+   u64 rx_bytes;
+   u64 tx_packets;
+   u64 tx_bytes;
+};
+
+static void
+mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch,
+ struct mlx5e_hv_vhca_per_ring_stats *data)
+{
+   struct mlx5e_channel_stats *stats;
+   int tc;
+
+   stats = &priv->channel_stats[ch];
+   data->rx_packets = stats->rq.packets;
+   data->rx_bytes   = stats->rq.bytes;
+
+   for (tc = 0; tc < priv->max_opened_tc; tc++) {
+   data->tx_packets += stats->sq[tc].packets;
+   data->tx_bytes   += stats->sq[tc].bytes;
+   }
+}
+
+static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, u64 *data,
+int buf_len)
+{
+   int ch, i = 0;
+
+   for (ch = 0; ch < priv->max_nch; ch++) {
+   u64 *buf = data + i;
+
+   if (WARN_ON_ONCE(buf +
+sizeof(struct mlx5e_hv_vhca_per_ring_stats) >
+data + buf_len))
+   return;
+
+   mlx5e_hv_vhca_fill_ring_stats(priv, ch,
+

[PATCH net-next,v4, 1/6] PCI: hv: Add a paravirtual backchannel in software

2019-08-21 Thread Haiyang Zhang
From: Dexuan Cui 

Windows SR-IOV provides a backchannel mechanism in software for communication
between a VF driver and a PF driver.  These "configuration blocks" are
similar in concept to PCI configuration space, but instead of doing reads and
writes in 32-bit chunks through a very slow path, packets of up to 128 bytes
can be sent or received asynchronously.

Nearly every SR-IOV device contains just such a communications channel in
hardware, so using this one in software is usually optional.  Using the
software channel, however, allows driver implementers to leverage software
tools that fuzz the communications channel looking for vulnerabilities.

The usage model for these packets puts the responsibility for reading or
writing on the VF driver.  The VF driver sends a read or a write packet,
indicating which "block" is being referred to by number.

If the PF driver wishes to initiate communication, it can "invalidate" one or
more of the first 64 blocks.  This invalidation is delivered via a callback
supplied by the VF driver by this driver.

No protocol is implied, except that supplied by the PF and VF drivers.

Signed-off-by: Jake Oshins 
Signed-off-by: Dexuan Cui 
Cc: Haiyang Zhang 
Cc: K. Y. Srinivasan 
Cc: Stephen Hemminger 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/pci/controller/pci-hyperv.c | 302 
 include/linux/hyperv.h  |  15 ++
 2 files changed, 317 insertions(+)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..57adeca 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -365,6 +365,39 @@ struct pci_delete_interrupt {
struct tran_int_desc int_desc;
 } __packed;
 
+/*
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
+ */
+struct pci_read_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 bytes_requested;
+} __packed;
+
+struct pci_read_block_response {
+   struct vmpacket_descriptor hdr;
+   u32 status;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+/*
+ * Note: the VM must pass a valid block id, wslot and byte_count.
+ */
+struct pci_write_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 byte_count;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+struct pci_dev_inval_block {
+   struct pci_incoming_message incoming;
+   union win_slot_encoding wslot;
+   u64 block_mask;
+} __packed;
+
 struct pci_dev_incoming {
struct pci_incoming_message incoming;
union win_slot_encoding wslot;
@@ -499,6 +532,9 @@ struct hv_pci_dev {
struct hv_pcibus_device *hbus;
struct work_struct wrk;
 
+   void (*block_invalidate)(void *context, u64 block_mask);
+   void *invalidate_context;
+
/*
 * What would be observed if one wrote 0x to a BAR and then
 * read it back, for each of the BAR offsets within config space.
@@ -817,6 +853,256 @@ static int hv_pcifront_write_config(struct pci_bus *bus, 
unsigned int devfn,
.write = hv_pcifront_write_config,
 };
 
+/*
+ * Paravirtual backchannel
+ *
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
+ * communication between a VF driver and a PF driver.  These
+ * "configuration blocks" are similar in concept to PCI configuration space,
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
+ *
+ * Nearly every SR-IOV device contains just such a communications channel in
+ * hardware, so using this one in software is usually optional.  Using the
+ * software channel, however, allows driver implementers to leverage software
+ * tools that fuzz the communications channel looking for vulnerabilities.
+ *
+ * The usage model for these packets puts the responsibility for reading or
+ * writing on the VF driver.  The VF driver sends a read or a write packet,
+ * indicating which "block" is being referred to by number.
+ *
+ * If the PF driver wishes to initiate communication, it can "invalidate" one 
or
+ * more of the first 64 blocks.  This invalidation is delivered via a callback
+ * supplied by the VF driver by this driver.
+ *
+ * No protocol is implied, except that supplied by the PF and VF drivers.
+ */
+
+struct hv_read_config_compl {
+   struct hv_pci_compl comp_pkt;
+   void *buf;
+   unsigned int len;
+   unsigned int bytes_returned;
+};
+
+/**
+ * hv_pci_read_config_compl() - Invoked when a response packet
+ * for a read config block operation arrives.
+ * @context:   Identifies the read config operation
+ * @resp:  The response packet itself
+ * @resp_packet_size:  Size in bytes of the res

[PATCH net-next,v4, 3/6] net/mlx5: Add wrappers for HyperV PCIe operations

2019-08-21 Thread Haiyang Zhang
From: Eran Ben Elisha 

Add wrapper functions for HyperV PCIe read / write /
block_invalidate_register operations.  This will be used as an
infrastructure in the downstream patch for software communication.

This will be enabled by default if CONFIG_PCI_HYPERV_INTERFACE is set.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 
 3 files changed, 87 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index bcf3655..fd32a5b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,6 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
new file mode 100644
index 000..cf08d02
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include 
+#include "mlx5_core.h"
+#include "lib/hv.h"
+
+static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void *buf, int len,
+int offset, bool read)
+{
+   int rc = -EOPNOTSUPP;
+   int bytes_returned;
+   int block_id;
+
+   if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len % HV_CONFIG_BLOCK_SIZE_MAX)
+   return -EINVAL;
+
+   block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX;
+
+   rc = read ?
+hyperv_read_cfg_blk(dev->pdev, buf,
+HV_CONFIG_BLOCK_SIZE_MAX, block_id,
+&bytes_returned) :
+hyperv_write_cfg_blk(dev->pdev, buf,
+ HV_CONFIG_BLOCK_SIZE_MAX, block_id);
+
+   /* Make sure len bytes were read successfully  */
+   if (read)
+   rc |= !(len == bytes_returned);
+
+   if (rc) {
+   mlx5_core_err(dev, "Failed to %s hv config, err = %d, len = %d, 
offset = %d\n",
+ read ? "read" : "write", rc, len,
+ offset);
+   return rc;
+   }
+
+   return 0;
+}
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, true);
+}
+
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, false);
+}
+
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   return hyperv_reg_block_invalidate(dev->pdev, context,
+  block_invalidate);
+}
+
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev)
+{
+   hyperv_reg_block_invalidate(dev->pdev, NULL, NULL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
new file mode 100644
index 000..f9a4557
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_H__
+#define __LIB_HV_H__
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+#include 
+#include 
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset);
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset);
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask));
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev);
+#endif
+
+#endif /* __LIB_HV_H__ */
-- 
1.8.3.1



[PATCH net-next,v4, 0/6] Add software backchannel and mlx5e HV VHCA stats

2019-08-21 Thread Haiyang Zhang
This patch set adds paravirtual backchannel in software in pci_hyperv,
which is required by the mlx5e driver HV VHCA stats agent.

The stats agent is responsible on running a periodic rx/tx packets/bytes
stats update.

Dexuan Cui (1):
  PCI: hv: Add a paravirtual backchannel in software

Eran Ben Elisha (4):
  net/mlx5: Add wrappers for HyperV PCIe operations
  net/mlx5: Add HV VHCA infrastructure
  net/mlx5: Add HV VHCA control agent
  net/mlx5e: Add mlx5e HV VHCA stats agent

Haiyang Zhang (1):
  PCI: hv: Add a Hyper-V PCI interface driver for software backchannel
interface

 MAINTAINERS|   1 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c   |  64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h   |  22 ++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 371 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 104 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 drivers/pci/Kconfig|   1 +
 drivers/pci/controller/Kconfig |   7 +
 drivers/pci/controller/Makefile|   1 +
 drivers/pci/controller/pci-hyperv-intf.c   |  67 
 drivers/pci/controller/pci-hyperv.c| 308 +
 include/linux/hyperv.h |  29 ++
 include/linux/mlx5/driver.h|   2 +
 18 files changed, 1189 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

-- 
1.8.3.1



RE: [PATCH net-next,v3, 0/6] Add software backchannel and mlx5e HV VHCA stats

2019-08-21 Thread Haiyang Zhang



> -Original Message-
> From: linux-hyperv-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of David Miller
> Sent: Wednesday, August 21, 2019 9:09 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; sae...@mellanox.com; l...@kernel.org;
> era...@mellanox.com; lorenzo.pieral...@arm.com; bhelg...@google.com;
> linux-...@vger.kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next,v3, 0/6] Add software backchannel and mlx5e
> HV VHCA stats
> 
> From: Haiyang Zhang 
> Date: Wed, 21 Aug 2019 00:23:19 +
> 
> > This patch set adds paravirtual backchannel in software in pci_hyperv,
> > which is required by the mlx5e driver HV VHCA stats agent.
> >
> > The stats agent is responsible on running a periodic rx/tx
> > packets/bytes stats update.
> 
> These patches don't apply cleanly to net-next, probably due to some recent
> mlx5 driver changes.
> 
> Please respin.

I will do.
Thanks,

- Haiyang


[PATCH net-next,v3, 6/6] net/mlx5e: Add mlx5e HV VHCA stats agent

2019-08-20 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA stats agent is responsible for running a periodic rx/tx
packets/bytes stats update. Currently the supported format is version
MLX5_HV_VHCA_STATS_VERSION. Block ID 1 is dedicated for statistics data
transfer from the VF to the PF.

The reporter fetches the statistics data from all opened channels, fills it
in a buffer and send it to mlx5_hv_vhca_write_agent.

As the stats layer should include some metadata per block (sequence and
offset), the HV VHCA layer shall modify the buffer before actually send it
over block 1.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 ++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 6 files changed, 205 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index fc59a40..a889a38 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -36,6 +36,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o 
en/port_buffer.o
 mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o 
lib/port_tun.o lag_mp.o \
lib/geneve.o en/tc_tun_vxlan.o 
en/tc_tun_gre.o \
en/tc_tun_geneve.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8fc5107..4a8008b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -54,6 +54,7 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 #include "en/fs.h"
+#include "lib/hv_vhca.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
 struct page_pool;
@@ -777,6 +778,15 @@ struct mlx5e_modify_sq_param {
int rl_index;
 };
 
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+struct mlx5e_hv_vhca_stats_agent {
+   struct mlx5_hv_vhca_agent *agent;
+   struct delayed_workwork;
+   u16delay;
+   void  *buf;
+};
+#endif
+
 struct mlx5e_xsk {
/* UMEMs are stored separately from channels, because we don't want to
 * lose them when channels are recreated. The kernel also stores UMEMs,
@@ -848,6 +858,9 @@ struct mlx5e_priv {
struct devlink_health_reporter *tx_reporter;
struct devlink_health_reporter *rx_reporter;
struct mlx5e_xsk   xsk;
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+   struct mlx5e_hv_vhca_stats_agent stats_agent;
+#endif
 };
 
 struct mlx5e_profile {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
new file mode 100644
index 000..c37b4ac
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include "en.h"
+#include "en/hv_vhca_stats.h"
+#include "lib/hv_vhca.h"
+#include "lib/hv.h"
+
+struct mlx5e_hv_vhca_per_ring_stats {
+   u64 rx_packets;
+   u64 rx_bytes;
+   u64 tx_packets;
+   u64 tx_bytes;
+};
+
+static void
+mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch,
+ struct mlx5e_hv_vhca_per_ring_stats *data)
+{
+   struct mlx5e_channel_stats *stats;
+   int tc;
+
+   stats = &priv->channel_stats[ch];
+   data->rx_packets = stats->rq.packets;
+   data->rx_bytes   = stats->rq.bytes;
+
+   for (tc = 0; tc < priv->max_opened_tc; tc++) {
+   data->tx_packets += stats->sq[tc].packets;
+   data->tx_bytes   += stats->sq[tc].bytes;
+   }
+}
+
+static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, u64 *data,
+int buf_len)
+{
+   int ch, i = 0;
+
+   for (ch = 0; ch < priv->max_nch; ch++) {
+   u64 *buf = data + i;
+
+   if (WARN_ON_ONCE(buf +
+sizeof(struct mlx5e_hv_vhca_per_ring_stats) >
+data + buf_len))
+   return;
+
+   mlx5e_hv_vhca_fill_ring_stats(priv, ch,
+ (struct 
mlx5e_hv_vhca_per_ring_stats *)buf);
+   i += sizeof(struct mlx5e_hv_vhca_per_ring_stats) / sizeof(u64);
+   }
+}
+
+static int 

[PATCH net-next,v3, 2/6] PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface

2019-08-20 Thread Haiyang Zhang
This interface driver is a helper driver allows other drivers to
have a common interface with the Hyper-V PCI frontend driver.

Signed-off-by: Haiyang Zhang 
Signed-off-by: Saeed Mahameed 
---
 MAINTAINERS  |  1 +
 drivers/pci/Kconfig  |  1 +
 drivers/pci/controller/Kconfig   |  7 
 drivers/pci/controller/Makefile  |  1 +
 drivers/pci/controller/pci-hyperv-intf.c | 67 
 drivers/pci/controller/pci-hyperv.c  | 12 --
 include/linux/hyperv.h   | 30 ++
 7 files changed, 108 insertions(+), 11 deletions(-)
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e352550..866ae88 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7453,6 +7453,7 @@ F:drivers/hid/hid-hyperv.c
 F: drivers/hv/
 F: drivers/input/serio/hyperv-keyboard.c
 F: drivers/pci/controller/pci-hyperv.c
+F: drivers/pci/controller/pci-hyperv-intf.c
 F: drivers/net/hyperv/
 F: drivers/scsi/storvsc_drv.c
 F: drivers/uio/uio_hv_generic.c
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2ab9240..c313de9 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -182,6 +182,7 @@ config PCI_LABEL
 config PCI_HYPERV
 tristate "Hyper-V PCI Frontend"
 depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   select PCI_HYPERV_INTERFACE
 help
   The PCI device frontend driver allows the kernel to import arbitrary
   PCI devices from a PCI backend to support PCI driver domains.
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index fe9f9f1..70e0782 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -281,5 +281,12 @@ config VMD
  To compile this driver as a module, choose M here: the
  module will be called vmd.
 
+config PCI_HYPERV_INTERFACE
+   tristate "Hyper-V PCI Interface"
+   depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   help
+ The Hyper-V PCI Interface is a helper driver allows other drivers to
+ have a common interface with the Hyper-V PCI frontend driver.
+
 source "drivers/pci/controller/dwc/Kconfig"
 endmenu
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index d56a507..a2a22c9 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
 obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
+obj-$(CONFIG_PCI_HYPERV_INTERFACE) += pci-hyperv-intf.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
 obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
diff --git a/drivers/pci/controller/pci-hyperv-intf.c 
b/drivers/pci/controller/pci-hyperv-intf.c
new file mode 100644
index 000..cc96be4
--- /dev/null
+++ b/drivers/pci/controller/pci-hyperv-intf.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Author:
+ *   Haiyang Zhang 
+ *
+ * This small module is a helper driver allows other drivers to
+ * have a common interface with the Hyper-V PCI frontend driver.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/hyperv.h>
+
+struct hyperv_pci_block_ops hvpci_block_ops;
+EXPORT_SYMBOL_GPL(hvpci_block_ops);
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+   unsigned int block_id, unsigned int *bytes_returned)
+{
+   if (!hvpci_block_ops.read_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.read_block(dev, buf, buf_len, block_id,
+ bytes_returned);
+}
+EXPORT_SYMBOL_GPL(hyperv_read_cfg_blk);
+
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+unsigned int block_id)
+{
+   if (!hvpci_block_ops.write_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.write_block(dev, buf, len, block_id);
+}
+EXPORT_SYMBOL_GPL(hyperv_write_cfg_blk);
+
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   if (!hvpci_block_ops.reg_blk_invalidate)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.reg_blk_invalidate(dev, context,
+ block_invalidate);
+}
+EXPORT_SYMBOL_GPL(hyperv_reg_block_invalidate);
+
+static void __exit exit_hv_pci_intf(void)
+{
+}
+
+static int __init init_hv_pci_intf(void)
+{
+   return 0;

[PATCH net-next,v3, 3/6] net/mlx5: Add wrappers for HyperV PCIe operations

2019-08-20 Thread Haiyang Zhang
From: Eran Ben Elisha 

Add wrapper functions for HyperV PCIe read / write /
block_invalidate_register operations.  This will be used as an
infrastructure in the downstream patch for software communication.

This will be enabled by default if CONFIG_PCI_HYPERV_INTERFACE is set.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 
 3 files changed, 87 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8b7edaa..247295b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,6 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
new file mode 100644
index 000..cf08d02
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include <linux/hyperv.h>
+#include "mlx5_core.h"
+#include "lib/hv.h"
+
+static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void *buf, int len,
+int offset, bool read)
+{
+   int rc = -EOPNOTSUPP;
+   int bytes_returned;
+   int block_id;
+
+   if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len % HV_CONFIG_BLOCK_SIZE_MAX)
+   return -EINVAL;
+
+   block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX;
+
+   rc = read ?
+hyperv_read_cfg_blk(dev->pdev, buf,
+HV_CONFIG_BLOCK_SIZE_MAX, block_id,
+&bytes_returned) :
+hyperv_write_cfg_blk(dev->pdev, buf,
+ HV_CONFIG_BLOCK_SIZE_MAX, block_id);
+
+   /* Make sure len bytes were read successfully  */
+   if (read)
+   rc |= !(len == bytes_returned);
+
+   if (rc) {
+   mlx5_core_err(dev, "Failed to %s hv config, err = %d, len = %d, 
offset = %d\n",
+ read ? "read" : "write", rc, len,
+ offset);
+   return rc;
+   }
+
+   return 0;
+}
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, true);
+}
+
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, false);
+}
+
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   return hyperv_reg_block_invalidate(dev->pdev, context,
+  block_invalidate);
+}
+
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev)
+{
+   hyperv_reg_block_invalidate(dev->pdev, NULL, NULL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
new file mode 100644
index 000..f9a4557
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_H__
+#define __LIB_HV_H__
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+#include 
+#include 
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset);
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset);
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask));
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev);
+#endif
+
+#endif /* __LIB_HV_H__ */
-- 
1.8.3.1



[PATCH net-next,v3, 5/6] net/mlx5: Add HV VHCA control agent

2019-08-20 Thread Haiyang Zhang
From: Eran Ben Elisha 

Control agent is responsible over of the control block (ID 0). It should
update the PF via this block about every capability change. In addition,
upon block 0 invalidate, it should activate all other supported agents
with data requests from the PF.

Upon agent create/destroy, the invalidate callback of the control agent
is being called in order to update the PF driver about this change.

The control agent is an integral part of HV VHCA and will be created
and destroy as part of the HV VHCA init/cleanup flow.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 122 -
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index 84d1d75..4047629 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -109,22 +109,131 @@ void mlx5_hv_vhca_invalidate(void *context, u64 
block_mask)
 queue_work(hv_vhca->work_queue, &hwork->invalidate_work);
 }
 
+#define AGENT_MASK(type) (type ? BIT(type - 1) : 0 /* control */)
+
+static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca,
+   struct mlx5_hv_vhca_control_block 
*block)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (!agent || !agent->control)
+   continue;
+
+   if (!(AGENT_MASK(agent->type) & block->control))
+   continue;
+
+   agent->control(agent, block);
+   }
+}
+
+static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca,
+ u32 *capabilities)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (agent)
+   *capabilities |= AGENT_MASK(agent->type);
+   }
+}
+
+static void
+mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent,
+ u64 block_mask)
+{
+   struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+   struct mlx5_core_dev *dev = hv_vhca->dev;
+   struct mlx5_hv_vhca_control_block *block;
+   u32 capabilities = 0;
+   int err;
+
+   block = kzalloc(sizeof(*block), GFP_KERNEL);
+   if (!block)
+   return;
+
+   err = mlx5_hv_read_config(dev, block, sizeof(*block), 0);
+   if (err)
+   goto free_block;
+
+   mlx5_hv_vhca_capabilities(hv_vhca, &capabilities);
+
+   /* In case no capabilities, send empty block in return */
+   if (!capabilities) {
+   memset(block, 0, sizeof(*block));
+   goto write;
+   }
+
+   if (block->capabilities != capabilities)
+   block->capabilities = capabilities;
+
+   if (block->control & ~capabilities)
+   goto free_block;
+
+   mlx5_hv_vhca_agents_control(hv_vhca, block);
+   block->command_ack = block->command;
+
+write:
+   mlx5_hv_write_config(dev, block, sizeof(*block), 0);
+
+free_block:
+   kfree(block);
+}
+
+static struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
+{
+   return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
+NULL,
+mlx5_hv_vhca_control_agent_invalidate,
+NULL, NULL);
+}
+
+static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent 
*agent)
+{
+   mlx5_hv_vhca_agent_destroy(agent);
+}
+
 int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
+   int err;
+
if (IS_ERR_OR_NULL(hv_vhca))
return IS_ERR_OR_NULL(hv_vhca);
 
-   return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
-  mlx5_hv_vhca_invalidate);
+   err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+ mlx5_hv_vhca_invalidate);
+   if (err)
+   return err;
+
+   agent = mlx5_hv_vhca_control_agent_create(hv_vhca);
+   if (IS_ERR_OR_NULL(agent)) {
+   mlx5_hv_unregister_invalidate(hv_vhca->dev);
+   return IS_ERR_OR_NULL(agent);
+   }
+
+   hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent;
+
+   return 0;
 }
 
 void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
int i;
 
if (IS_ERR_OR_NULL(hv_vhca))
return;
 
+   agent = hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL];
+   if (agent)
+

[PATCH net-next,v3, 1/6] PCI: hv: Add a paravirtual backchannel in software

2019-08-20 Thread Haiyang Zhang
From: Dexuan Cui 

Windows SR-IOV provides a backchannel mechanism in software for communication
between a VF driver and a PF driver.  These "configuration blocks" are
similar in concept to PCI configuration space, but instead of doing reads and
writes in 32-bit chunks through a very slow path, packets of up to 128 bytes
can be sent or received asynchronously.

Nearly every SR-IOV device contains just such a communications channel in
hardware, so using this one in software is usually optional.  Using the
software channel, however, allows driver implementers to leverage software
tools that fuzz the communications channel looking for vulnerabilities.

The usage model for these packets puts the responsibility for reading or
writing on the VF driver.  The VF driver sends a read or a write packet,
indicating which "block" is being referred to by number.

If the PF driver wishes to initiate communication, it can "invalidate" one or
more of the first 64 blocks.  This invalidation is delivered via a callback
supplied by the VF driver by this driver.

No protocol is implied, except that supplied by the PF and VF drivers.

Signed-off-by: Jake Oshins 
Signed-off-by: Dexuan Cui 
Cc: Haiyang Zhang 
Cc: K. Y. Srinivasan 
Cc: Stephen Hemminger 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/pci/controller/pci-hyperv.c | 302 
 include/linux/hyperv.h  |  15 ++
 2 files changed, 317 insertions(+)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..57adeca 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -365,6 +365,39 @@ struct pci_delete_interrupt {
struct tran_int_desc int_desc;
 } __packed;
 
+/*
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
+ */
+struct pci_read_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 bytes_requested;
+} __packed;
+
+struct pci_read_block_response {
+   struct vmpacket_descriptor hdr;
+   u32 status;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+/*
+ * Note: the VM must pass a valid block id, wslot and byte_count.
+ */
+struct pci_write_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 byte_count;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+struct pci_dev_inval_block {
+   struct pci_incoming_message incoming;
+   union win_slot_encoding wslot;
+   u64 block_mask;
+} __packed;
+
 struct pci_dev_incoming {
struct pci_incoming_message incoming;
union win_slot_encoding wslot;
@@ -499,6 +532,9 @@ struct hv_pci_dev {
struct hv_pcibus_device *hbus;
struct work_struct wrk;
 
+   void (*block_invalidate)(void *context, u64 block_mask);
+   void *invalidate_context;
+
/*
 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
 * read it back, for each of the BAR offsets within config space.
@@ -817,6 +853,256 @@ static int hv_pcifront_write_config(struct pci_bus *bus, 
unsigned int devfn,
.write = hv_pcifront_write_config,
 };
 
+/*
+ * Paravirtual backchannel
+ *
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
+ * communication between a VF driver and a PF driver.  These
+ * "configuration blocks" are similar in concept to PCI configuration space,
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
+ *
+ * Nearly every SR-IOV device contains just such a communications channel in
+ * hardware, so using this one in software is usually optional.  Using the
+ * software channel, however, allows driver implementers to leverage software
+ * tools that fuzz the communications channel looking for vulnerabilities.
+ *
+ * The usage model for these packets puts the responsibility for reading or
+ * writing on the VF driver.  The VF driver sends a read or a write packet,
+ * indicating which "block" is being referred to by number.
+ *
+ * If the PF driver wishes to initiate communication, it can "invalidate" one 
or
+ * more of the first 64 blocks.  This invalidation is delivered via a callback
+ * supplied by the VF driver by this driver.
+ *
+ * No protocol is implied, except that supplied by the PF and VF drivers.
+ */
+
+struct hv_read_config_compl {
+   struct hv_pci_compl comp_pkt;
+   void *buf;
+   unsigned int len;
+   unsigned int bytes_returned;
+};
+
+/**
+ * hv_pci_read_config_compl() - Invoked when a response packet
+ * for a read config block operation arrives.
+ * @context:   Identifies the read config operation
+ * @resp:  The response packet itself
+ * @resp_packet_size:  Size in bytes of the res

[PATCH net-next,v3, 0/6] Add software backchannel and mlx5e HV VHCA stats

2019-08-20 Thread Haiyang Zhang
This patch set adds paravirtual backchannel in software in pci_hyperv,
which is required by the mlx5e driver HV VHCA stats agent.

The stats agent is responsible on running a periodic rx/tx packets/bytes
stats update.

Dexuan Cui (1):
  PCI: hv: Add a paravirtual backchannel in software

Eran Ben Elisha (4):
  net/mlx5: Add wrappers for HyperV PCIe operations
  net/mlx5: Add HV VHCA infrastructure
  net/mlx5: Add HV VHCA control agent
  net/mlx5e: Add mlx5e HV VHCA stats agent

Haiyang Zhang (1):
  PCI: hv: Add a Hyper-V PCI interface driver for software backchannel
interface

 MAINTAINERS|   1 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c   |  64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h   |  22 ++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 371 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 104 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 drivers/pci/Kconfig|   1 +
 drivers/pci/controller/Kconfig |   7 +
 drivers/pci/controller/Makefile|   1 +
 drivers/pci/controller/pci-hyperv-intf.c   |  67 
 drivers/pci/controller/pci-hyperv.c| 308 +
 include/linux/hyperv.h |  29 ++
 include/linux/mlx5/driver.h|   2 +
 18 files changed, 1189 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

-- 
1.8.3.1



[PATCH net-next,v3, 4/6] net/mlx5: Add HV VHCA infrastructure

2019-08-20 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA is a layer which provides PF to VF communication channel based on
HyperV PCI config channel. It implements Mellanox's Inter VHCA control
communication protocol. The protocol contains control block in order to
pass messages between the PF and VF drivers, and data blocks in order to
pass actual data.

The infrastructure is agent based. Each agent will be responsible of
contiguous buffer blocks in the VHCA config space. This infrastructure will
bind agents to their blocks, and those agents can only access read/write
the buffer blocks assigned to them. Each agent will provide three
callbacks (control, invalidate, cleanup). Control will be invoked when
block-0 is invalidated with a command that concerns this agent. Invalidate
callback will be invoked if one of the blocks assigned to this agent was
invalidated. Cleanup will be invoked before the agent is being freed in
order to clean all of its open resources or deferred works.

Block-0 serves as the control block. All execution commands from the PF
will be written by the PF over this block. VF will ack on those by
writing on block-0 as well. Its format is described by struct
mlx5_hv_vhca_control_block layout.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 253 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 include/linux/mlx5/driver.h|   2 +
 5 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 247295b..fc59a40 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
-mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
new file mode 100644
index 000..84d1d75
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include <linux/hyperv.h>
+#include "mlx5_core.h"
+#include "lib/hv.h"
+#include "lib/hv_vhca.h"
+
+struct mlx5_hv_vhca {
+   struct mlx5_core_dev   *dev;
+   struct workqueue_struct*work_queue;
+   struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
+   struct mutexagents_lock; /* Protect agents array */
+};
+
+struct mlx5_hv_vhca_work {
+   struct work_struct invalidate_work;
+   struct mlx5_hv_vhca   *hv_vhca;
+   u64block_mask;
+};
+
+struct mlx5_hv_vhca_data_block {
+   u16 sequence;
+   u16 offset;
+   u8  reserved[4];
+   u64 data[15];
+};
+
+struct mlx5_hv_vhca_agent {
+   enum mlx5_hv_vhca_agent_type type;
+   struct mlx5_hv_vhca *hv_vhca;
+   void*priv;
+   u16  seq;
+   void (*control)(struct mlx5_hv_vhca_agent *agent,
+   struct mlx5_hv_vhca_control_block *block);
+   void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
+  u64 block_mask);
+   void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
+};
+
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+   struct mlx5_hv_vhca *hv_vhca = NULL;
+
+   hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL);
+   if (!hv_vhca)
+   return ERR_PTR(-ENOMEM);
+
+   hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");
+   if (!hv_vhca->work_queue) {
+   kfree(hv_vhca);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   hv_vhca->dev = dev;
+   mutex_init(&hv_vhca->agents_lock);
+
+   return hv_vhca;
+}
+
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+   if (IS_ERR_OR_NULL(hv_vhca))
+   return;
+
+   destroy_workqueue(hv_vhca->work_queue);
+   kfree(hv_vhca);
+}
+
+static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
+{
+   struct mlx5_hv_vhca_work *hwork;
+   struct mlx5_hv_vhca *hv_vhca;
+   int i;
+
+   hwork = container_of(work, struct mlx5_hv_vhca_work, invalidate_work);
+   hv_vhca = hwork->hv_vhca;

RE: [PATCH net-next,v2 2/6] PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface

2019-08-20 Thread Haiyang Zhang



> -Original Message-
> From: David Miller 
> Sent: Tuesday, August 20, 2019 3:29 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; sae...@mellanox.com; l...@kernel.org;
> era...@mellanox.com; lorenzo.pieral...@arm.com; bhelg...@google.com;
> linux-...@vger.kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org; KY Srinivasan ; Stephen
> Hemminger ; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next,v2 2/6] PCI: hv: Add a Hyper-V PCI interface
> driver for software backchannel interface
> 
> From: Haiyang Zhang 
> Date: Mon, 19 Aug 2019 19:30:47 +
> 
> > +static void __exit exit_hv_pci_intf(void) {
> > +   pr_info("unloaded\n");
> > +}
> > +
> > +static int __init init_hv_pci_intf(void) {
> > +   pr_info("loaded\n");
> > +
> 
> Clogging up the logs with useless messages like this is inappropriate.
> Please remove these pr_info() calls.
> 
> Also, all of these symbols should probably be GPL exported.

I will update the patch -- remove the pr_info, and use EXPORT_SYMBOL_GPL()
for the symbols.

Thanks,
- Haiyang



[PATCH net-next,v2 4/6] net/mlx5: Add HV VHCA infrastructure

2019-08-19 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA is a layer which provides PF to VF communication channel based on
HyperV PCI config channel. It implements Mellanox's Inter VHCA control
communication protocol. The protocol contains control block in order to
pass messages between the PF and VF drivers, and data blocks in order to
pass actual data.

The infrastructure is agent based. Each agent will be responsible of
contiguous buffer blocks in the VHCA config space. This infrastructure will
bind agents to their blocks, and those agents can only access read/write
the buffer blocks assigned to them. Each agent will provide three
callbacks (control, invalidate, cleanup). Control will be invoked when
block-0 is invalidated with a command that concerns this agent. Invalidate
callback will be invoked if one of the blocks assigned to this agent was
invalidated. Cleanup will be invoked before the agent is being freed in
order to clean all of its open resources or deferred works.

Block-0 serves as the control block. All execution commands from the PF
will be written by the PF over this block. VF will ack on those by
writing on block-0 as well. Its format is described by struct
mlx5_hv_vhca_control_block layout.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 253 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 include/linux/mlx5/driver.h|   2 +
 5 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 247295b..fc59a40 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
-mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
new file mode 100644
index 000..84d1d75
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include <linux/hyperv.h>
+#include "mlx5_core.h"
+#include "lib/hv.h"
+#include "lib/hv_vhca.h"
+
+struct mlx5_hv_vhca {
+   struct mlx5_core_dev   *dev;
+   struct workqueue_struct*work_queue;
+   struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
+   struct mutexagents_lock; /* Protect agents array */
+};
+
+struct mlx5_hv_vhca_work {
+   struct work_struct invalidate_work;
+   struct mlx5_hv_vhca   *hv_vhca;
+   u64block_mask;
+};
+
+struct mlx5_hv_vhca_data_block {
+   u16 sequence;
+   u16 offset;
+   u8  reserved[4];
+   u64 data[15];
+};
+
+struct mlx5_hv_vhca_agent {
+   enum mlx5_hv_vhca_agent_type type;
+   struct mlx5_hv_vhca *hv_vhca;
+   void*priv;
+   u16  seq;
+   void (*control)(struct mlx5_hv_vhca_agent *agent,
+   struct mlx5_hv_vhca_control_block *block);
+   void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
+  u64 block_mask);
+   void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
+};
+
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+   struct mlx5_hv_vhca *hv_vhca = NULL;
+
+   hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL);
+   if (!hv_vhca)
+   return ERR_PTR(-ENOMEM);
+
+   hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");
+   if (!hv_vhca->work_queue) {
+   kfree(hv_vhca);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   hv_vhca->dev = dev;
+   mutex_init(&hv_vhca->agents_lock);
+
+   return hv_vhca;
+}
+
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+   if (IS_ERR_OR_NULL(hv_vhca))
+   return;
+
+   destroy_workqueue(hv_vhca->work_queue);
+   kfree(hv_vhca);
+}
+
+static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
+{
+   struct mlx5_hv_vhca_work *hwork;
+   struct mlx5_hv_vhca *hv_vhca;
+   int i;
+
+   hwork = container_of(work, struct mlx5_hv_vhca_work, invalidate_work);
+   hv_vhca = hwork->hv_vhca;

[PATCH net-next,v2 5/6] net/mlx5: Add HV VHCA control agent

2019-08-19 Thread Haiyang Zhang
From: Eran Ben Elisha 

Control agent is responsible over of the control block (ID 0). It should
update the PF via this block about every capability change. In addition,
upon block 0 invalidate, it should activate all other supported agents
with data requests from the PF.

Upon agent create/destroy, the invalidate callback of the control agent
is being called in order to update the PF driver about this change.

The control agent is an integral part of HV VHCA and will be created
and destroy as part of the HV VHCA init/cleanup flow.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 122 -
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index 84d1d75..4047629 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -109,22 +109,131 @@ void mlx5_hv_vhca_invalidate(void *context, u64 
block_mask)
 queue_work(hv_vhca->work_queue, &hwork->invalidate_work);
 }
 
+#define AGENT_MASK(type) (type ? BIT(type - 1) : 0 /* control */)
+
+static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca,
+   struct mlx5_hv_vhca_control_block 
*block)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (!agent || !agent->control)
+   continue;
+
+   if (!(AGENT_MASK(agent->type) & block->control))
+   continue;
+
+   agent->control(agent, block);
+   }
+}
+
+static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca,
+ u32 *capabilities)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (agent)
+   *capabilities |= AGENT_MASK(agent->type);
+   }
+}
+
+static void
+mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent,
+ u64 block_mask)
+{
+   struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+   struct mlx5_core_dev *dev = hv_vhca->dev;
+   struct mlx5_hv_vhca_control_block *block;
+   u32 capabilities = 0;
+   int err;
+
+   block = kzalloc(sizeof(*block), GFP_KERNEL);
+   if (!block)
+   return;
+
+   err = mlx5_hv_read_config(dev, block, sizeof(*block), 0);
+   if (err)
+   goto free_block;
+
+   mlx5_hv_vhca_capabilities(hv_vhca, &capabilities);
+
+   /* In case no capabilities, send empty block in return */
+   if (!capabilities) {
+   memset(block, 0, sizeof(*block));
+   goto write;
+   }
+
+   if (block->capabilities != capabilities)
+   block->capabilities = capabilities;
+
+   if (block->control & ~capabilities)
+   goto free_block;
+
+   mlx5_hv_vhca_agents_control(hv_vhca, block);
+   block->command_ack = block->command;
+
+write:
+   mlx5_hv_write_config(dev, block, sizeof(*block), 0);
+
+free_block:
+   kfree(block);
+}
+
+static struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
+{
+   return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
+NULL,
+mlx5_hv_vhca_control_agent_invalidate,
+NULL, NULL);
+}
+
+static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent 
*agent)
+{
+   mlx5_hv_vhca_agent_destroy(agent);
+}
+
 int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
+   int err;
+
if (IS_ERR_OR_NULL(hv_vhca))
return IS_ERR_OR_NULL(hv_vhca);
 
-   return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
-  mlx5_hv_vhca_invalidate);
+   err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+ mlx5_hv_vhca_invalidate);
+   if (err)
+   return err;
+
+   agent = mlx5_hv_vhca_control_agent_create(hv_vhca);
+   if (IS_ERR_OR_NULL(agent)) {
+   mlx5_hv_unregister_invalidate(hv_vhca->dev);
+   return IS_ERR_OR_NULL(agent);
+   }
+
+   hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent;
+
+   return 0;
 }
 
 void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
int i;
 
if (IS_ERR_OR_NULL(hv_vhca))
return;
 
+   agent = hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL];
+   if (agent)
+

[PATCH net-next,v2 3/6] net/mlx5: Add wrappers for HyperV PCIe operations

2019-08-19 Thread Haiyang Zhang
From: Eran Ben Elisha 

Add wrapper functions for HyperV PCIe read / write /
block_invalidate_register operations.  This will be used as an
infrastructure in the downstream patch for software communication.

This will be enabled by default if CONFIG_PCI_HYPERV_INTERFACE is set.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 
 3 files changed, 87 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8b7edaa..247295b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,6 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o 
eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
new file mode 100644
index 000..cf08d02
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include 
+#include "mlx5_core.h"
+#include "lib/hv.h"
+
+static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void *buf, int len,
+int offset, bool read)
+{
+   int rc = -EOPNOTSUPP;
+   int bytes_returned;
+   int block_id;
+
+   if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len % HV_CONFIG_BLOCK_SIZE_MAX)
+   return -EINVAL;
+
+   block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX;
+
+   rc = read ?
+hyperv_read_cfg_blk(dev->pdev, buf,
+HV_CONFIG_BLOCK_SIZE_MAX, block_id,
+_returned) :
+hyperv_write_cfg_blk(dev->pdev, buf,
+ HV_CONFIG_BLOCK_SIZE_MAX, block_id);
+
+   /* Make sure len bytes were read successfully  */
+   if (read)
+   rc |= !(len == bytes_returned);
+
+   if (rc) {
+   mlx5_core_err(dev, "Failed to %s hv config, err = %d, len = %d, 
offset = %d\n",
+ read ? "read" : "write", rc, len,
+ offset);
+   return rc;
+   }
+
+   return 0;
+}
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, true);
+}
+
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset)
+{
+   return mlx5_hv_config_common(dev, buf, len, offset, false);
+}
+
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   return hyperv_reg_block_invalidate(dev->pdev, context,
+  block_invalidate);
+}
+
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev)
+{
+   hyperv_reg_block_invalidate(dev->pdev, NULL, NULL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
new file mode 100644
index 000..f9a4557
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_H__
+#define __LIB_HV_H__
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+#include 
+#include 
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+   int offset);
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+int offset);
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask));
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev);
+#endif
+
+#endif /* __LIB_HV_H__ */
-- 
1.8.3.1



[PATCH net-next,v2 6/6] net/mlx5e: Add mlx5e HV VHCA stats agent

2019-08-19 Thread Haiyang Zhang
From: Eran Ben Elisha 

HV VHCA stats agent is responsible for running a periodic rx/tx
packets/bytes stats update. Currently the supported format is version
MLX5_HV_VHCA_STATS_VERSION. Block ID 1 is dedicated for statistics data
transfer from the VF to the PF.

The reporter fetches the statistics data from all opened channels, fills it
into a buffer and sends it to mlx5_hv_vhca_write_agent.

As the stats layer should include some metadata per block (sequence and
offset), the HV VHCA layer shall modify the buffer before actually sending it
over block 1.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 ++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 6 files changed, 205 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index fc59a40..a889a38 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -36,6 +36,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o 
en/port_buffer.o
 mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o 
lib/port_tun.o lag_mp.o \
lib/geneve.o en/tc_tun_vxlan.o 
en/tc_tun_gre.o \
en/tc_tun_geneve.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8fc5107..4a8008b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -54,6 +54,7 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 #include "en/fs.h"
+#include "lib/hv_vhca.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
 struct page_pool;
@@ -777,6 +778,15 @@ struct mlx5e_modify_sq_param {
int rl_index;
 };
 
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+struct mlx5e_hv_vhca_stats_agent {
+   struct mlx5_hv_vhca_agent *agent;
+   struct delayed_workwork;
+   u16delay;
+   void  *buf;
+};
+#endif
+
 struct mlx5e_xsk {
/* UMEMs are stored separately from channels, because we don't want to
 * lose them when channels are recreated. The kernel also stores UMEMs,
@@ -848,6 +858,9 @@ struct mlx5e_priv {
struct devlink_health_reporter *tx_reporter;
struct devlink_health_reporter *rx_reporter;
struct mlx5e_xsk   xsk;
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+   struct mlx5e_hv_vhca_stats_agent stats_agent;
+#endif
 };
 
 struct mlx5e_profile {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
new file mode 100644
index 000..c37b4ac
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include "en.h"
+#include "en/hv_vhca_stats.h"
+#include "lib/hv_vhca.h"
+#include "lib/hv.h"
+
+struct mlx5e_hv_vhca_per_ring_stats {
+   u64 rx_packets;
+   u64 rx_bytes;
+   u64 tx_packets;
+   u64 tx_bytes;
+};
+
+static void
+mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch,
+ struct mlx5e_hv_vhca_per_ring_stats *data)
+{
+   struct mlx5e_channel_stats *stats;
+   int tc;
+
+   stats = >channel_stats[ch];
+   data->rx_packets = stats->rq.packets;
+   data->rx_bytes   = stats->rq.bytes;
+
+   for (tc = 0; tc < priv->max_opened_tc; tc++) {
+   data->tx_packets += stats->sq[tc].packets;
+   data->tx_bytes   += stats->sq[tc].bytes;
+   }
+}
+
+static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, u64 *data,
+int buf_len)
+{
+   int ch, i = 0;
+
+   for (ch = 0; ch < priv->max_nch; ch++) {
+   u64 *buf = data + i;
+
+   if (WARN_ON_ONCE(buf +
+sizeof(struct mlx5e_hv_vhca_per_ring_stats) >
+data + buf_len))
+   return;
+
+   mlx5e_hv_vhca_fill_ring_stats(priv, ch,
+ (struct 
mlx5e_hv_vhca_per_ring_stats *)buf);
+   i += sizeof(struct mlx5e_hv_vhca_per_ring_stats) / sizeof(u64);
+   }
+}
+
+static int 

[PATCH net-next,v2 1/6] PCI: hv: Add a paravirtual backchannel in software

2019-08-19 Thread Haiyang Zhang
From: Dexuan Cui 

Windows SR-IOV provides a backchannel mechanism in software for communication
between a VF driver and a PF driver.  These "configuration blocks" are
similar in concept to PCI configuration space, but instead of doing reads and
writes in 32-bit chunks through a very slow path, packets of up to 128 bytes
can be sent or received asynchronously.

Nearly every SR-IOV device contains just such a communications channel in
hardware, so using this one in software is usually optional.  Using the
software channel, however, allows driver implementers to leverage software
tools that fuzz the communications channel looking for vulnerabilities.

The usage model for these packets puts the responsibility for reading or
writing on the VF driver.  The VF driver sends a read or a write packet,
indicating which "block" is being referred to by number.

If the PF driver wishes to initiate communication, it can "invalidate" one or
more of the first 64 blocks.  This invalidation is delivered via a callback
supplied by the VF driver by this driver.

No protocol is implied, except that supplied by the PF and VF drivers.

Signed-off-by: Jake Oshins 
Signed-off-by: Dexuan Cui 
Cc: Haiyang Zhang 
Cc: K. Y. Srinivasan 
Cc: Stephen Hemminger 
Signed-off-by: Saeed Mahameed 
Signed-off-by: Haiyang Zhang 
---
 drivers/pci/controller/pci-hyperv.c | 302 
 include/linux/hyperv.h  |  15 ++
 2 files changed, 317 insertions(+)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..57adeca 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -365,6 +365,39 @@ struct pci_delete_interrupt {
struct tran_int_desc int_desc;
 } __packed;
 
+/*
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
+ */
+struct pci_read_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 bytes_requested;
+} __packed;
+
+struct pci_read_block_response {
+   struct vmpacket_descriptor hdr;
+   u32 status;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+/*
+ * Note: the VM must pass a valid block id, wslot and byte_count.
+ */
+struct pci_write_block {
+   struct pci_message message_type;
+   u32 block_id;
+   union win_slot_encoding wslot;
+   u32 byte_count;
+   u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
+} __packed;
+
+struct pci_dev_inval_block {
+   struct pci_incoming_message incoming;
+   union win_slot_encoding wslot;
+   u64 block_mask;
+} __packed;
+
 struct pci_dev_incoming {
struct pci_incoming_message incoming;
union win_slot_encoding wslot;
@@ -499,6 +532,9 @@ struct hv_pci_dev {
struct hv_pcibus_device *hbus;
struct work_struct wrk;
 
+   void (*block_invalidate)(void *context, u64 block_mask);
+   void *invalidate_context;
+
/*
 * What would be observed if one wrote 0x to a BAR and then
 * read it back, for each of the BAR offsets within config space.
@@ -817,6 +853,256 @@ static int hv_pcifront_write_config(struct pci_bus *bus, 
unsigned int devfn,
.write = hv_pcifront_write_config,
 };
 
+/*
+ * Paravirtual backchannel
+ *
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
+ * communication between a VF driver and a PF driver.  These
+ * "configuration blocks" are similar in concept to PCI configuration space,
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
+ *
+ * Nearly every SR-IOV device contains just such a communications channel in
+ * hardware, so using this one in software is usually optional.  Using the
+ * software channel, however, allows driver implementers to leverage software
+ * tools that fuzz the communications channel looking for vulnerabilities.
+ *
+ * The usage model for these packets puts the responsibility for reading or
+ * writing on the VF driver.  The VF driver sends a read or a write packet,
+ * indicating which "block" is being referred to by number.
+ *
+ * If the PF driver wishes to initiate communication, it can "invalidate" one 
or
+ * more of the first 64 blocks.  This invalidation is delivered via a callback
+ * supplied by the VF driver by this driver.
+ *
+ * No protocol is implied, except that supplied by the PF and VF drivers.
+ */
+
+struct hv_read_config_compl {
+   struct hv_pci_compl comp_pkt;
+   void *buf;
+   unsigned int len;
+   unsigned int bytes_returned;
+};
+
+/**
+ * hv_pci_read_config_compl() - Invoked when a response packet
+ * for a read config block operation arrives.
+ * @context:   Identifies the read config operation
+ * @resp:  The response packet itself
+ * @resp_packet_size:  Size in bytes of the res

[PATCH net-next,v2 2/6] PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface

2019-08-19 Thread Haiyang Zhang
This interface driver is a helper driver that allows other drivers to
have a common interface with the Hyper-V PCI frontend driver.

Signed-off-by: Haiyang Zhang 
Signed-off-by: Saeed Mahameed 
---
 MAINTAINERS  |  1 +
 drivers/pci/Kconfig  |  1 +
 drivers/pci/controller/Kconfig   |  7 
 drivers/pci/controller/Makefile  |  1 +
 drivers/pci/controller/pci-hyperv-intf.c | 70 
 drivers/pci/controller/pci-hyperv.c  | 12 --
 include/linux/hyperv.h   | 30 ++
 7 files changed, 111 insertions(+), 11 deletions(-)
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e352550..866ae88 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7453,6 +7453,7 @@ F:drivers/hid/hid-hyperv.c
 F: drivers/hv/
 F: drivers/input/serio/hyperv-keyboard.c
 F: drivers/pci/controller/pci-hyperv.c
+F: drivers/pci/controller/pci-hyperv-intf.c
 F: drivers/net/hyperv/
 F: drivers/scsi/storvsc_drv.c
 F: drivers/uio/uio_hv_generic.c
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2ab9240..c313de9 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -182,6 +182,7 @@ config PCI_LABEL
 config PCI_HYPERV
 tristate "Hyper-V PCI Frontend"
 depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   select PCI_HYPERV_INTERFACE
 help
   The PCI device frontend driver allows the kernel to import arbitrary
   PCI devices from a PCI backend to support PCI driver domains.
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index fe9f9f1..70e0782 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -281,5 +281,12 @@ config VMD
  To compile this driver as a module, choose M here: the
  module will be called vmd.
 
+config PCI_HYPERV_INTERFACE
+   tristate "Hyper-V PCI Interface"
+   depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+   help
+ The Hyper-V PCI Interface is a helper driver allows other drivers to
+ have a common interface with the Hyper-V PCI frontend driver.
+
 source "drivers/pci/controller/dwc/Kconfig"
 endmenu
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index d56a507..a2a22c9 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
 obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
+obj-$(CONFIG_PCI_HYPERV_INTERFACE) += pci-hyperv-intf.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
 obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
diff --git a/drivers/pci/controller/pci-hyperv-intf.c 
b/drivers/pci/controller/pci-hyperv-intf.c
new file mode 100644
index 000..087b7eb
--- /dev/null
+++ b/drivers/pci/controller/pci-hyperv-intf.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Author:
+ *   Haiyang Zhang 
+ *
+ * This small module is a helper driver allows other drivers to
+ * have a common interface with the Hyper-V PCI frontend driver.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+
+struct hyperv_pci_block_ops hvpci_block_ops;
+EXPORT_SYMBOL(hvpci_block_ops);
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+   unsigned int block_id, unsigned int *bytes_returned)
+{
+   if (!hvpci_block_ops.read_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.read_block(dev, buf, buf_len, block_id,
+ bytes_returned);
+}
+EXPORT_SYMBOL(hyperv_read_cfg_blk);
+
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+unsigned int block_id)
+{
+   if (!hvpci_block_ops.write_block)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.write_block(dev, buf, len, block_id);
+}
+EXPORT_SYMBOL(hyperv_write_cfg_blk);
+
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+   void (*block_invalidate)(void *context,
+u64 block_mask))
+{
+   if (!hvpci_block_ops.reg_blk_invalidate)
+   return -EOPNOTSUPP;
+
+   return hvpci_block_ops.reg_blk_invalidate(dev, context,
+ block_invalidate);
+}
+EXPORT_SYMBOL(hyperv_reg_block_invalidate);
+
+static void __exit exit_hv_pci_intf(void)
+{
+   pr_info("unloaded\n");
+}
+
+static int __init init_hv_pc

[PATCH net-next,v2 0/6] Add software backchannel and mlx5e HV VHCA stats

2019-08-19 Thread Haiyang Zhang
This patch set adds paravirtual backchannel in software in pci_hyperv,
which is required by the mlx5e driver HV VHCA stats agent.

The stats agent is responsible for running a periodic rx/tx packets/bytes
stats update.

Dexuan Cui (1):
  PCI: hv: Add a paravirtual backchannel in software

Haiyang Zhang (5):
  PCI: hv: Add a Hyper-V PCI interface driver for software backchannel
interface
  net/mlx5: Add wrappers for HyperV PCIe operations
  net/mlx5: Add HV VHCA infrastructure
  net/mlx5: Add HV VHCA control agent
  net/mlx5e: Add mlx5e HV VHCA stats agent

 MAINTAINERS|   1 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  13 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c   |  64 
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h   |  22 ++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 371 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 104 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   7 +
 drivers/pci/Kconfig|   1 +
 drivers/pci/controller/Kconfig |   7 +
 drivers/pci/controller/Makefile|   1 +
 drivers/pci/controller/pci-hyperv-intf.c   |  70 
 drivers/pci/controller/pci-hyperv.c| 308 +
 include/linux/hyperv.h |  29 ++
 include/linux/mlx5/driver.h|   2 +
 18 files changed, 1192 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

-- 
1.8.3.1



RE: [PATCH net-next, 3/6] net/mlx5: Add wrappers for HyperV PCIe operations

2019-08-19 Thread Haiyang Zhang


> -Original Message-
> From: Eran Ben Elisha 
> Sent: Thursday, August 15, 2019 7:35 AM
> To: Mark Bloch ; Haiyang Zhang
> ; sas...@kernel.org; da...@davemloft.net;
> Saeed Mahameed ; l...@kernel.org;
> lorenzo.pieral...@arm.com; bhelg...@google.com; linux-
> p...@vger.kernel.org; linux-hyp...@vger.kernel.org; net...@vger.kernel.org
> Cc: KY Srinivasan ; Stephen Hemminger
> ; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next, 3/6] net/mlx5: Add wrappers for HyperV PCIe
> operations
> 
> 
> 
> On 8/14/2019 11:41 PM, Mark Bloch wrote:
> >
> >
> > On 8/14/19 12:08 PM, Haiyang Zhang wrote:
> >> From: Eran Ben Elisha 
> >>
> >> Add wrapper functions for HyperV PCIe read / write /
> >> block_invalidate_register operations.  This will be used as an
> >> infrastructure in the downstream patch for software communication.
> >>
> >> This will be enabled by default if CONFIG_PCI_HYPERV_MINI is set.
> >>
> >> Signed-off-by: Eran Ben Elisha 
> >> Signed-off-by: Saeed Mahameed 
> >> ---
> >>   drivers/net/ethernet/mellanox/mlx5/core/Makefile |  1 +
> >>   drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64
> 
> >>   drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 
> >>   3 files changed, 87 insertions(+)
> >>   create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
> >>   create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
> >>
> >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> >> index 8b7edaa..a8950b1 100644
> >> --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> >> @@ -45,6 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   +=
> eswitch.o eswitch_offloads.o eswitch_offlo
> >>   mlx5_core-$(CONFIG_MLX5_MPFS)  += lib/mpfs.o
> >>   mlx5_core-$(CONFIG_VXLAN)  += lib/vxlan.o
> >>   mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
> >> +mlx5_core-$(CONFIG_PCI_HYPERV_MINI) += lib/hv.o
> >>
> >>   #
> >>   # Ipoib netdev
> >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
> b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
> >> new file mode 100644
> >> index 000..cf08d02
> >> --- /dev/null
> >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
> >> @@ -0,0 +1,64 @@
> >> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> >> +// Copyright (c) 2018 Mellanox Technologies
> >> +
> >> +#include 
> >> +#include "mlx5_core.h"
> >> +#include "lib/hv.h"
> >> +
> >> +static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void
> *buf, int len,
> >> +   int offset, bool read)
> >> +{
> >> +  int rc = -EOPNOTSUPP;
> >> +  int bytes_returned;
> >> +  int block_id;
> >> +
> >> +  if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len %
> HV_CONFIG_BLOCK_SIZE_MAX)
> >> +  return -EINVAL;
> >> +
> >> +  block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX;
> >> +
> >> +  rc = read ?
> >> +   hyperv_read_cfg_blk(dev->pdev, buf,
> >> +   HV_CONFIG_BLOCK_SIZE_MAX, block_id,
> >> +   _returned) :
> >> +   hyperv_write_cfg_blk(dev->pdev, buf,
> >> +HV_CONFIG_BLOCK_SIZE_MAX, block_id);
> >> +
> >> +  /* Make sure len bytes were read successfully  */
> >> +  if (read)
> >> +  rc |= !(len == bytes_returned);
> >> +
> >> +  if (rc) {
> >> +  mlx5_core_err(dev, "Failed to %s hv config, err = %d, len
> = %d, offset = %d\n",
> >> +read ? "read" : "write", rc, len,
> >> +offset);
> >> +  return rc;
> >> +  }
> >> +
> >> +  return 0;
> >> +}
> >
> > This seems out of place why not expose this function as part of hyperv and
> mlx5
> > will just pass the pdev.
> >
> The HV driver works with block chunks. I found it less convenience to do
> so directly, so I add a small wrapper for mlx5 core.
> 
> Haiyangz,
> Do you see a reason to export this callback style from the HYPERV level
> instead?
I don’t think the wrapper has to be in the hv interface. 
One function for read, another for write, are pretty straight forward. 
Users (other drivers) may use or wrap them in the way they like to.

Thanks,
- Haiyang


RE: [PATCH net-next, 2/6] PCI: hv: Add a Hyper-V PCI mini driver for software backchannel interface

2019-08-16 Thread Haiyang Zhang



> -Original Message-
> From: Vitaly Kuznetsov 
> Sent: Friday, August 16, 2019 12:16 PM
> To: Haiyang Zhang 
> Cc: KY Srinivasan ; Stephen Hemminger
> ; linux-kernel@vger.kernel.org;
> sas...@kernel.org; da...@davemloft.net; sae...@mellanox.com;
> l...@kernel.org; era...@mellanox.com; lorenzo.pieral...@arm.com;
> bhelg...@google.com; linux-...@vger.kernel.org; linux-
> hyp...@vger.kernel.org; net...@vger.kernel.org
> Subject: RE: [PATCH net-next, 2/6] PCI: hv: Add a Hyper-V PCI mini driver for
> software backchannel interface
> 
> Haiyang Zhang  writes:
> 
> >
> > The pci_hyperv can only be loaded on VMs on Hyper-V and Azure. Other
> > drivers like MLX5e will have symbolic dependency of pci_hyperv if they
> > use functions exported by pci_hyperv. This dependency will cause other
> > drivers fail to load on other platforms, like VMs on KVM. So we
> > created this mini driver, which can be loaded on any platforms to
> > provide the symbolic dependency.
> 
> (/me wondering is there a nicer way around this, by using __weak or
> something like that...)
> 
> In case this stub is the best solution I'd suggest to rename it to something 
> like
> PCI_HYPERV_INTERFACE to make it clear it is not a separate driver (_MINI
> makes me think so).

Thanks! I will consider those options.


RE: [PATCH net-next, 2/6] PCI: hv: Add a Hyper-V PCI mini driver for software backchannel interface

2019-08-16 Thread Haiyang Zhang



> -Original Message-
> From: Vitaly Kuznetsov 
> Sent: Friday, August 16, 2019 8:28 AM
> To: Haiyang Zhang ; sas...@kernel.org;
> da...@davemloft.net; sae...@mellanox.com; l...@kernel.org;
> era...@mellanox.com; lorenzo.pieral...@arm.com; bhelg...@google.com;
> linux-...@vger.kernel.org; linux-hyp...@vger.kernel.org;
> net...@vger.kernel.org
> Cc: Haiyang Zhang ; KY Srinivasan
> ; Stephen Hemminger ;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH net-next, 2/6] PCI: hv: Add a Hyper-V PCI mini driver for
> software backchannel interface
> 
> Haiyang Zhang  writes:
> 
> > This mini driver is a helper driver allows other drivers to have a
> > common interface with the Hyper-V PCI frontend driver.
> >
> > Signed-off-by: Haiyang Zhang 
> > Signed-off-by: Saeed Mahameed 
> > ---
> >  MAINTAINERS  |  1 +
> >  drivers/pci/Kconfig  |  1 +
> >  drivers/pci/controller/Kconfig   |  7 
> >  drivers/pci/controller/Makefile  |  1 +
> >  drivers/pci/controller/pci-hyperv-mini.c | 70
> 
> >  drivers/pci/controller/pci-hyperv.c  | 12 --
> >  include/linux/hyperv.h   | 30 ++
> >  7 files changed, 111 insertions(+), 11 deletions(-)  create mode
> > 100644 drivers/pci/controller/pci-hyperv-mini.c
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS index e352550..c4962b9 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -7453,6 +7453,7 @@ F:drivers/hid/hid-hyperv.c
> >  F: drivers/hv/
> >  F: drivers/input/serio/hyperv-keyboard.c
> >  F: drivers/pci/controller/pci-hyperv.c
> > +F: drivers/pci/controller/pci-hyperv-mini.c
> >  F: drivers/net/hyperv/
> >  F: drivers/scsi/storvsc_drv.c
> >  F: drivers/uio/uio_hv_generic.c
> > diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index
> > 2ab9240..bb852f5 100644
> > --- a/drivers/pci/Kconfig
> > +++ b/drivers/pci/Kconfig
> > @@ -182,6 +182,7 @@ config PCI_LABEL
> >  config PCI_HYPERV
> >  tristate "Hyper-V PCI Frontend"
> >  depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN
> &&
> > X86_64
> > +   select PCI_HYPERV_MINI
> >  help
> >The PCI device frontend driver allows the kernel to import 
> > arbitrary
> >PCI devices from a PCI backend to support PCI driver domains.
> > diff --git a/drivers/pci/controller/Kconfig
> > b/drivers/pci/controller/Kconfig index fe9f9f1..8e31cba 100644
> > --- a/drivers/pci/controller/Kconfig
> > +++ b/drivers/pci/controller/Kconfig
> > @@ -281,5 +281,12 @@ config VMD
> >   To compile this driver as a module, choose M here: the
> >   module will be called vmd.
> >
> > +config PCI_HYPERV_MINI
> > +   tristate "Hyper-V PCI Mini"
> > +   depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN
> && X86_64
> > +   help
> > + The Hyper-V PCI Mini is a helper driver allows other drivers to
> > + have a common interface with the Hyper-V PCI frontend driver.
> > +
> 
> Out of pure curiosity, why not just export this interface from PCI_HYPERV
> directly? Why do we need this stub?

The pci_hyperv can only be loaded on VMs on Hyper-V and Azure. Other 
drivers like MLX5e will have symbolic dependency of pci_hyperv if they 
use functions exported by pci_hyperv. This dependency will cause other 
drivers fail to load on other platforms, like VMs on KVM. So we created 
this mini driver, which can be loaded on any platforms to provide the 
symbolic dependency.

Thanks,
- Haiyang


RE: [PATCH v6,1/2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-16 Thread Haiyang Zhang



> -Original Message-
> From: Lorenzo Pieralisi 
> Sent: Friday, August 16, 2019 5:52 AM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; bhelg...@google.com; linux-
> hyp...@vger.kernel.org; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; vkuznets ; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v6,1/2] PCI: hv: Detect and fix Hyper-V PCI domain
> number collision
> 
> On Thu, Aug 15, 2019 at 05:01:37PM +, Haiyang Zhang wrote:
> > Currently in Azure cloud, for passthrough devices, the host sets the
> > device instance ID's bytes 8 - 15 to a value derived from the host
> > HWID, which is the same on all devices in a VM. So, the device
> > instance ID's bytes 8 and 9 provided by the host are no longer unique.
> > This affects all Azure hosts since July 2018, and can cause device
> > passthrough to VMs to fail because the bytes 8 and 9 are used as PCI
> > domain number. Collision of domain numbers will cause the second
> > device with the same domain number fail to load.
> >
> > In the cases of collision, we will detect and find another number that
> > is not in use.
> >
> > Suggested-by: Michael Kelley 
> > Signed-off-by: Haiyang Zhang 
> > Acked-by: Sasha Levin 
> 
> I assume you will take care of backporting and sending this patch to stable
> kernels given that you have not applied any tag with such request.
> 
> I appreciate it may not be easy to define but a Fixes: tag would help.

Sure, I will add a Fixes tag, and Cc stable. Usually Sasha from our team will
do the stable porting in batches.

Thanks,
- Haiyang


[PATCH v6,1/2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-15 Thread Haiyang Zhang
Currently in Azure cloud, for passthrough devices, the host sets the device
instance ID's bytes 8 - 15 to a value derived from the host HWID, which is
the same on all devices in a VM. So, the device instance ID's bytes 8 and 9
provided by the host are no longer unique. This affects all Azure hosts
since July 2018, and can cause device passthrough to VMs to fail because
the bytes 8 and 9 are used as PCI domain number. Collision of domain
numbers will cause the second device with the same domain number fail to
load.

In the cases of collision, we will detect and find another number that is
not in use.

Suggested-by: Michael Kelley 
Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
---
 drivers/pci/controller/pci-hyperv.c | 92 +++--
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 40b6254..31b8fd5 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus)
complete(>remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/*
+ * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * @dom: Requested domain number
+ *
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+   unsigned int i;
+
+   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+   return dom;
+
+   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+   if (test_and_set_bit(i, hvpci_dom_map) == 0)
+   return i;
+   }
+
+   return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+   clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:  VMBus's tracking struct for this root PCI bus
@@ -2521,6 +2563,7 @@ static int hv_pci_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
 {
struct hv_pcibus_device *hbus;
+   u16 dom_req, dom;
int ret;
 
/*
@@ -2535,19 +2578,34 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->state = hv_pcibus_init;
 
/*
-* The PCI bus "domain" is what is called "segment" in ACPI and
-* other specs.  Pull it from the instance ID, to get something
-* unique.  Bytes 8 and 9 are what is used in Windows guests, so
-* do the same thing for consistency.  Note that, since this code
-* only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-* that (1) the only domain in use for something that looks like
-* a physical PCI bus (which is actually emulated by the
-* hypervisor) is domain 0 and (2) there will be no overlap
-* between domains derived from these instance IDs in the same
-* VM.
+* The PCI bus "domain" is what is called "segment" in ACPI and other
+* specs. Pull it from the instance ID, to get something usually
+* unique. In rare cases of collision, we will find out another number
+* not in use.
+*
+* Note that, since this code only runs in a Hyper-V VM, Hyper-V
+* together with this guest driver can guarantee that (1) The only
+* domain used by Gen1 VMs for something that looks like a physical
+* PCI bus (which is actually emulated by the hypervisor) is domain 0.
+* (2) There will be no overlap between domains (after fixing possible
+* collisions) in the same VM.
 */
-   hbus->sysdata.domain = hdev->dev_instance.b[9] |
-  hdev->dev_instance.b[8] << 8;
+   dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9];
+   dom = hv_get_dom_num(dom_req);
+
+   if (dom == HVPCI_DOM_INVALID) {
+   dev_err(>device,
+   "Unable to use dom# 0x%hx or other numbers", dom_req);
+   ret = -EINVAL;
+   goto free_bus;
+   }
+
+   if (dom != dom_req)
+   dev_info(>device,
+"PCI dom# 0x%hx has collision, using 0x%hx",
+dom_req, dom);
+
+   hbus->sysdata.domain = dom;
 
hbus->hdev = hdev;
refcount_set(>remove_lock, 1);
@@ -2562,7 +2620,7 @@ static int hv_pci_probe(str

[PATCH v6,2/2] PCI: hv: Use bytes 4 and 5 from instance ID as the PCI domain numbers

2019-08-15 Thread Haiyang Zhang
As recommended by Azure host team, the bytes 4, 5 have more uniqueness
(info entropy) than bytes 8, 9. So now we use bytes 4, 5 as the PCI domain
numbers. On older hosts, bytes 4, 5 can also be used -- no backward
compatibility issues here. The chance of collision is greatly reduced.

In the rare case of a collision, the driver code detects it and finds
another number that is not in use.

Suggested-by: Michael Kelley 
Signed-off-by: Haiyang Zhang 
Acked-by: Sasha Levin 
---
 drivers/pci/controller/pci-hyperv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/controller/pci-hyperv.c 
b/drivers/pci/controller/pci-hyperv.c
index 31b8fd5..4f3d97e 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2590,7 +2590,7 @@ static int hv_pci_probe(struct hv_device *hdev,
 * (2) There will be no overlap between domains (after fixing possible
 * collisions) in the same VM.
 */
-   dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9];
+   dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
dom = hv_get_dom_num(dom_req);
 
if (dom == HVPCI_DOM_INVALID) {
-- 
1.8.3.1



RE: [PATCH v5,1/2] PCI: hv: Detect and fix Hyper-V PCI domain number collision

2019-08-15 Thread Haiyang Zhang



> -Original Message-
> From: Lorenzo Pieralisi 
> Sent: Thursday, August 15, 2019 12:11 PM
> To: Haiyang Zhang 
> Cc: sas...@kernel.org; bhelg...@google.com; linux-
> hyp...@vger.kernel.org; linux-...@vger.kernel.org; KY Srinivasan
> ; Stephen Hemminger ;
> o...@aepfle.de; vkuznets ; linux-
> ker...@vger.kernel.org
> Subject: Re: [PATCH v5,1/2] PCI: hv: Detect and fix Hyper-V PCI domain
> number collision
> 
> On Wed, Aug 14, 2019 at 03:52:15PM +, Haiyang Zhang wrote:
> > Currently in Azure cloud, for passthrough devices, the host sets the device
> > instance ID's bytes 8 - 15 to a value derived from the host HWID, which is
> > the same on all devices in a VM. So, the device instance ID's bytes 8 and 9
> > provided by the host are no longer unique. This affects all Azure hosts
> > since last year, and can cause device passthrough to VMs to fail because
> 
> Bjorn already asked, can you be a bit more specific than "since last
> year" here please ?
> 
> It would be useful to understand when/how this became an issue.
The host change happened around July 2018. The Azure rollout takes
multiple weeks, so there is no specific date. I will include the month and
year in the log.

> 
> > the bytes 8 and 9 are used as PCI domain number. Collision of domain
> > numbers will cause the second device with the same domain number fail to
> > load.
> >
> > In the cases of collision, we will detect and find another number that is
> > not in use.
> >
> > Suggested-by: Michael Kelley 
> > Signed-off-by: Haiyang Zhang 
> > Acked-by: Sasha Levin 
> > ---
> >  drivers/pci/controller/pci-hyperv.c | 92
> +++--
> >  1 file changed, 79 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/pci/controller/pci-hyperv.c 
> > b/drivers/pci/controller/pci-
> hyperv.c
> > index 40b6254..31b8fd5 100644
> > --- a/drivers/pci/controller/pci-hyperv.c
> > +++ b/drivers/pci/controller/pci-hyperv.c
> > @@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct
> hv_pcibus_device *hbus)
> > complete(>remove_event);
> >  }
> >
> > +#define HVPCI_DOM_MAP_SIZE (64 * 1024)
> > +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
> > +
> > +/*
> > + * PCI domain number 0 is used by emulated devices on Gen1 VMs, so
> define 0
> > + * as invalid for passthrough PCI devices of this driver.
> > + */
> > +#define HVPCI_DOM_INVALID 0
> > +
> > +/**
> > + * hv_get_dom_num() - Get a valid PCI domain number
> > + * Check if the PCI domain number is in use, and return another number if
> > + * it is in use.
> > + *
> > + * @dom: Requested domain number
> > + *
> > + * return: domain number on success, HVPCI_DOM_INVALID on failure
> > + */
> > +static u16 hv_get_dom_num(u16 dom)
> > +{
> > +   unsigned int i;
> 
> > +
> > +   if (test_and_set_bit(dom, hvpci_dom_map) == 0)
> > +   return dom;
> > +
> > +   for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
> > +   if (test_and_set_bit(i, hvpci_dom_map) == 0)
> > +   return i;
> > +   }
> 
> Don't you need locking around code reading/updating hvpci_dom_map ?

If the bit changes after for_each_clear_bit() considers it a "clear bit",
then test_and_set_bit() -- which tests and sets the bit in a single atomic
operation -- will return 1 instead of 0. The loop then continues to the next
clear bit, until a test_and_set_bit() call succeeds. So no locking is
necessary here.

Thanks,
- Haiyang



RE: [PATCH] hv_netvsc: Fix a memory leak bug

2019-08-14 Thread Haiyang Zhang



> -Original Message-
> From: Wenwen Wang 
> Sent: Wednesday, August 14, 2019 4:16 PM
> To: Wenwen Wang 
> Cc: KY Srinivasan ; Haiyang Zhang
> ; Stephen Hemminger
> ; Sasha Levin ; David S.
> Miller ; open list:Hyper-V CORE AND DRIVERS
> ; open list:NETWORKING DRIVERS
> ; open list 
> Subject: [PATCH] hv_netvsc: Fix a memory leak bug
> 
> In rndis_filter_device_add(), 'rndis_device' is allocated through kzalloc()
> by invoking get_rndis_device(). In the following execution, if an error
> occurs, the execution will go to the 'err_dev_remv' label. However, the
> allocated 'rndis_device' is not deallocated, leading to a memory leak bug.
> 
> Signed-off-by: Wenwen Wang 
> ---
>  drivers/net/hyperv/rndis_filter.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/hyperv/rndis_filter.c
> b/drivers/net/hyperv/rndis_filter.c
> index 317dbe9..ed35085 100644
> --- a/drivers/net/hyperv/rndis_filter.c
> +++ b/drivers/net/hyperv/rndis_filter.c
> @@ -1420,6 +1420,7 @@ struct netvsc_device
> *rndis_filter_device_add(struct hv_device *dev,
> 
>  err_dev_remv:
>   rndis_filter_device_remove(dev, net_device);
> + kfree(rndis_device);

The kfree() is not necessary here, because rndis_device is already freed
by --
rndis_filter_device_remove() --> netvsc_device_remove() 
--> free_netvsc_device_rcu() --> free_netvsc_device()
--> kfree(nvdev->extension);  //This frees rndis_device.

Thanks,
- Haiyang


[PATCH net-next, 5/6] net/mlx5: Add HV VHCA control agent

2019-08-14 Thread Haiyang Zhang
From: Eran Ben Elisha 

Control agent is responsible over of the control block (ID 0). It should
update the PF via this block about every capability change. In addition,
upon block 0 invalidate, it should activate all other supported agents
with data requests from the PF.

Upon agent create/destroy, the invalidate callback of the control agent
is being called in order to update the PF driver about this change.

The control agent is an integral part of HV VHCA and will be created
and destroy as part of the HV VHCA init/cleanup flow.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 122 -
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index b2eebdf..3c7fffa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -110,22 +110,131 @@ void mlx5_hv_vhca_invalidate(void *context, u64 
block_mask)
queue_work(hv_vhca->work_queue, >invalidate_work);
 }
 
+#define AGENT_MASK(type) (type ? BIT(type - 1) : 0 /* control */)
+
+static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca,
+   struct mlx5_hv_vhca_control_block 
*block)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (!agent || !agent->control)
+   continue;
+
+   if (!(AGENT_MASK(agent->type) & block->control))
+   continue;
+
+   agent->control(agent, block);
+   }
+}
+
+static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca,
+ u32 *capabilities)
+{
+   int i;
+
+   for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+   struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+   if (agent)
+   *capabilities |= AGENT_MASK(agent->type);
+   }
+}
+
+static void
+mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent,
+ u64 block_mask)
+{
+   struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+   struct mlx5_core_dev *dev = hv_vhca->dev;
+   struct mlx5_hv_vhca_control_block *block;
+   u32 capabilities = 0;
+   int err;
+
+   block = kzalloc(sizeof(*block), GFP_KERNEL);
+   if (!block)
+   return;
+
+   err = mlx5_hv_read_config(dev, block, sizeof(*block), 0);
+   if (err)
+   goto free_block;
+
+   mlx5_hv_vhca_capabilities(hv_vhca, );
+
+   /* In case no capabilities, send empty block in return */
+   if (!capabilities) {
+   memset(block, 0, sizeof(*block));
+   goto write;
+   }
+
+   if (block->capabilities != capabilities)
+   block->capabilities = capabilities;
+
+   if (block->control & ~capabilities)
+   goto free_block;
+
+   mlx5_hv_vhca_agents_control(hv_vhca, block);
+   block->command_ack = block->command;
+
+write:
+   mlx5_hv_write_config(dev, block, sizeof(*block), 0);
+
+free_block:
+   kfree(block);
+}
+
+static struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
+{
+   return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
+NULL,
+mlx5_hv_vhca_control_agent_invalidate,
+NULL, NULL);
+}
+
+static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent 
*agent)
+{
+   mlx5_hv_vhca_agent_destroy(agent);
+}
+
 int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
+   int err;
+
if (IS_ERR_OR_NULL(hv_vhca))
return IS_ERR_OR_NULL(hv_vhca);
 
-   return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
-  mlx5_hv_vhca_invalidate);
+   err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+ mlx5_hv_vhca_invalidate);
+   if (err)
+   return err;
+
+   agent = mlx5_hv_vhca_control_agent_create(hv_vhca);
+   if (IS_ERR_OR_NULL(agent)) {
+   mlx5_hv_unregister_invalidate(hv_vhca->dev);
+   return IS_ERR_OR_NULL(agent);
+   }
+
+   hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent;
+
+   return 0;
 }
 
 void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 {
+   struct mlx5_hv_vhca_agent *agent;
int i;
 
if (IS_ERR_OR_NULL(hv_vhca))
return;
 
+   agent = hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL];
+   if 

  1   2   3   4   5   6   7   8   9   >