Re: [PATCH] uio: add irq control support to uio_pci_generic

2015-04-20 Thread Stephen Hemminger
On Mon, 20 Apr 2015 15:59:06 +0200
Michael S. Tsirkin m...@redhat.com wrote:

 On Thu, Apr 16, 2015 at 02:21:10PM -0700, Stephen Hemminger wrote:
  On Thu, 16 Apr 2015 09:43:24 +0200
  Michael S. Tsirkin m...@redhat.com wrote:
  
   On Wed, Apr 15, 2015 at 09:59:34AM -0700, Stephen Hemminger wrote:
The driver already supported INTX interrupts but had no in kernel
function to enable and disable them.

It is possible for userspace to do this by accessing PCI config
directly, but this racy
   
   How is it racy? We have userspace using this interface,
   if there's a race I want to fix it.
  
  There is nothing to prevent two threads in user space doing 
  read/modify write at the same time.
 
 Well that's a userspace bug then - so let's drop that
 from commit log lest people think this fixes some
 kernel bugs. read/modify/write to the same register
 is at least an easy to grasp problem, creating
 an extra interface for the same function opens up
 the possibility that some userspace will do
 read/modify/write from one thread with irqcontrol
 from another thread, creating more races.
 
  The bigger issue is that DPDK needs to support multiple UIO
  interface types. And with current model there is no abstraction.
  The way to enable/disable IRQ is different depending on the UIO
  drivers.
 
 OK compatibility with other devices might be useful, but what are the
 other UIO drivers DPDK supports? I only found support for igb_uio so
 far, and that doesn't seem to be upstream.
 

Currently, supports:
  igb_uio, uio_pci_generic (as well as vfio)

There are additional drivers which have been submitted but not accepted for Xen
and Hyper-V, both of which require special uio drivers.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] uio: add irq control support to uio_pci_generic

2015-04-16 Thread Stephen Hemminger
On Thu, 16 Apr 2015 09:43:24 +0200
Michael S. Tsirkin m...@redhat.com wrote:

 On Wed, Apr 15, 2015 at 09:59:34AM -0700, Stephen Hemminger wrote:
  The driver already supported INTX interrupts but had no in kernel
  function to enable and disable them.
  
  It is possible for userspace to do this by accessing PCI config
  directly, but this racy
 
 How is it racy? We have userspace using this interface,
 if there's a race I want to fix it.

There is nothing to prevent two threads in user space doing 
read/modify write at the same time.

The bigger issue is that DPDK needs to support multiple UIO
interface types. And with current model there is no abstraction.
The way to enable/disable IRQ is different depending on the UIO
drivers.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] uio: add irq control support to uio_pci_generic

2015-04-15 Thread Stephen Hemminger
The driver already supported INTX interrupts but had no in kernel
function to enable and disable them.

It is possible for userspace to do this by accessing PCI config
directly, but this is racy and is better handled by the same mechanism
that already exists in the kernel.

Signed-off-by: Stephen Hemminger step...@networkplumber.org


--- a/drivers/uio/uio_pci_generic.c 2015-04-15 08:50:15.543900681 -0700
+++ b/drivers/uio/uio_pci_generic.c 2015-04-15 09:00:01.658609786 -0700
@@ -53,6 +53,18 @@ static irqreturn_t irqhandler(int irq, s
return IRQ_HANDLED;
 }
 
+static int irqcontrol(struct uio_info *info, s32 irq_on)
+{
+   struct uio_pci_generic_dev *gdev = to_uio_pci_generic_dev(info);
+   struct pci_dev *pdev = gdev->pdev;
+
+   pci_cfg_access_lock(pdev);
+   pci_intx(pdev, irq_on);
+   pci_cfg_access_unlock(pdev);
+
+   return 0;
+}
+
 static int probe(struct pci_dev *pdev,
   const struct pci_device_id *id)
 {
@@ -89,6 +101,7 @@ static int probe(struct pci_dev *pdev,
gdev->info.irq = pdev->irq;
gdev->info.irq_flags = IRQF_SHARED;
gdev->info.handler = irqhandler;
+   gdev->info.irqcontrol = irqcontrol;
gdev->pdev = pdev;
 
err = uio_register_device(&pdev->dev, &gdev->info);
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] bridge: preserve random init MAC address

2014-03-18 Thread Stephen Hemminger
On Wed, 12 Mar 2014 20:15:25 -0700
Luis R. Rodriguez mcg...@do-not-panic.com wrote:

 As it is now if you create a bridge it gets started
 with a random MAC address and if you then add a net_device
 as a slave but later kick it out you end up with a zero
 MAC address. Instead preserve the original random MAC
 address and use it.

What is supposed to happen is that the recalculate chooses
the lowest MAC address of the slaves. If there are no slaves
it might as well just calculate a new random value. There is
not great merit in preserving the original defunct address.

Or something like this that just keeps the old value.
The bridge is in a meaningless state when there are no ports,
and when the first port is added back it will be used as the
new bridge id.

--- a/net/bridge/br_stp_if.c2014-02-12 08:21:56.733857356 -0800
+++ b/net/bridge/br_stp_if.c2014-03-18 20:09:09.334388826 -0700
@@ -235,6 +235,9 @@ bool br_stp_recalculate_bridge_id(struct
addr = p->dev->dev_addr;
 
}
+
+   if (addr == br_mac_zero)
+   return false;  /* keep original address */
 
if (ether_addr_equal(br->bridge_id.addr, addr))
return false;   /* no change */
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] bridge: fix bridge root block on designated port

2014-03-13 Thread Stephen Hemminger
On Wed, 12 Mar 2014 20:15:27 -0700
Luis R. Rodriguez mcg...@do-not-panic.com wrote:

 --- a/net/bridge/br_private.h
 +++ b/net/bridge/br_private.h
 @@ -150,6 +150,7 @@ struct net_bridge_port
   u8  priority;
   u8  state;
   u16 port_no;
 + boolroot_block_enabled;
   unsigned char   topology_change_ack;

It seems a bit confusing to have both a ROOT_BLOCK flag in the
data structure and an additional root_block_enabled flag.
If nothing else it is a waste of space.

Looks like you are changing the meaning slightly. Is it possible
to have BR_ROOT_BLOCK set but !root_block_enabled? And what about
the inverse?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v3 4/6] bridge: enable root block during device registration

2014-03-03 Thread Stephen Hemminger
On Mon,  3 Mar 2014 14:47:03 -0800
Luis R. Rodriguez mcg...@do-not-panic.com wrote:

 From: Luis R. Rodriguez mcg...@suse.com
 
 root block support was added via 1007dd1a on v3.8 but toggling
 this flag is only allowed after a device has been registered and
 added to a bridge as its a bridge *port* primitive, not a *net_device*
 feature. There are work arounds possible to account for the lack
 of netlink tools to toggle root_block, such as using the root_block
 sysfs attribute [0] or using udev / the driver to set the MAC address
 to something high such as FE:FF:FF:FF:FF:FF, but neither of these
 ensure root block is respected _from_the_start_ through device
 initialization.
 
 In order to support the root_block feature from the start since device
 initialization and in order to avoid having to require userspace
 work arounds to existing deployments this exposes a private
 net_device flag which enables drivers that know they want to
 start with the root_block feature enabled from the start. The
 only caveat with this is topologies that require STP or non-root
 will either have to use sysfs [0] or netlink tools like the
 iproute2 bridge util to toggle the flag off after initialization.
 This is an accepted compromise.
 
 This flag is required given that ndo_add_slave() currently does not
 allow specifying any other parameters other than the net_device. We
 could extend this but in order to do that properly we'd need to
 evaluate all other types of master device implementations.
 
 [0] echo 1 > /sys/devices/vif-2-0/net/vif2.0/brport/root_block
 
 Cc: Stephen Hemminger step...@networkplumber.org
 Cc: bri...@lists.linux-foundation.org
 Cc: net...@vger.kernel.org
 Cc: linux-ker...@vger.kernel.org
 Cc: xen-de...@lists.xenproject.org
 Cc: kvm@vger.kernel.org
 Signed-off-by: Luis R. Rodriguez mcg...@suse.com
 ---
  include/linux/netdevice.h | 7 +++
  net/bridge/br_if.c| 2 ++
  2 files changed, 9 insertions(+)
 
 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
 index 1a86948..b17643a 100644
 --- a/include/linux/netdevice.h
 +++ b/include/linux/netdevice.h
 @@ -1181,6 +1181,11 @@ struct net_device_ops {
   * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
   *   change when it's running
   * @IFF_MACVLAN: Macvlan device
 + * @IFF_BRIDGE_ROOT_BLOCK: don't consider this net_device for root port
 + *   when this interface is added to a bridge. This makes use of the
 + *   root_block mechanism but since its a bridge port primitive this
 + *   flag can be used to instantiate the preference to have root block
 + *   enabled from the start since initialization.
   */

Doing this in priv flags bloats what is a limited resource (# of bits).
Plus there are issues (what if this is changed after adding to bridge)?

Maybe better to enhance existing netlink infrastructure to allow passing
flags on adding port to bridge.

Also, unless device is up, nothing will happen right away when added to bridge.
Root port status can be changed since device is disabled.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v3 4/6] bridge: enable root block during device registration

2014-03-03 Thread Stephen Hemminger
On Mon, 3 Mar 2014 15:58:50 -0800
Luis R. Rodriguez mcg...@do-not-panic.com wrote:

 On Mon, Mar 3, 2014 at 3:43 PM, Stephen Hemminger
 step...@networkplumber.org wrote:
  Doing this in priv flags bloats what is a limited resource (# of bits).  
 
 Agreed. I tried to avoid it but saw no other option for addressing
 this during initialization properly without requiring a userspace
 upgrade.

Replacing one Xen hack for another doesn't seem like great progress.
I would rather see an API change as needed because there are other
things like setting STP parameters which might also want to be part
of initial device add.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v3 0/6] networking: address root block upon initialization

2014-03-03 Thread Stephen Hemminger
On Mon, 3 Mar 2014 17:05:18 -0800
Luis R. Rodriguez mcg...@suse.com wrote:

 On Mon, Mar 3, 2014 at 2:46 PM, Luis R. Rodriguez
 mcg...@do-not-panic.com wrote:
  From: Luis R. Rodriguez mcg...@suse.com
 
 -- snip --
 
  As I tested using the root block preference I noticed that if a net_device
  slave under the bridge gets the designated root port prior to setting in
  userspace the root_block feature enabling the feature won't kick the
  bridge to remove that net_device from the designated port. I addressed
  that issue and also upkeeping the initial random MAC address given to
  the bridge as if otherwise we'd end up with a zero MAC address bridge
  if we root block all ports. I have only done local tests I'd appreciate a
  bit more wide test coverage and review.
 
 
 Stephen,
 
 I should note that even if we discard patches 4-6 patches for an
 alternative implementation patches 1-3 should still be applicable for
 review. Let me know what you think of those.
 
   Luis
 --
 To unsubscribe from this list: send the line unsubscribe netdev in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

I agree with 0-3 as normal improvements.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC v2 1/4] bridge: enable interfaces to opt out from becoming the root bridge

2014-02-20 Thread Stephen Hemminger
On Wed, 19 Feb 2014 09:59:33 -0800
Luis R. Rodriguez mcg...@do-not-panic.com wrote:

 On Wed, Feb 19, 2014 at 9:08 AM, Stephen Hemminger
 step...@networkplumber.org wrote:
  On Wed, 19 Feb 2014 09:02:06 -0800
  Luis R. Rodriguez mcg...@do-not-panic.com wrote:
 
  Folks, what if I repurpose my patch to use the IFF_BRIDGE_NON_ROOT (or
  relabel to IFF_ROOT_BLOCK_DEF) flag for a default driver preference
  upon initialization so that root block will be used once the device
  gets added to a bridge. The purpose would be to avoid drivers from
  using the high MAC address hack, streamline to use a random MAC
  address thereby avoiding the possible duplicate address situation for
  IPv6. In the STP use case for these interfaces we'd just require
  userspace to unset the root block. I'd consider the STP use case the
  most odd of all. The caveat to this approach is 3.8 would be needed
  (or its the root block patches cherry picked) for base kernels older
  than 3.8.
 
  Stephen?
 
Luis
 
  Don't add IFF_ flags that adds yet another API hook into bridge.
 
 The goal was not to add a userspace API, but rather consider a driver
 initialization preference.
 
  Please only use the netlink/sysfs flags fields that already exist
  for new features.
 
 Sure, but what if we know a driver in most cases wants the root block
 and we'd want to make it the default, thereby only requiring userspace
 for toggling it off.
 
   Luis

Something in userspace has to put the device into the bridge.
Fix the port setup in that tool via the netlink or sysfs flags in
the bridge. It should not have to be handled in the bridge looking
at magic flags in the device.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC v2 1/4] bridge: enable interfaces to opt out from becoming the root bridge

2014-02-19 Thread Stephen Hemminger
On Wed, 19 Feb 2014 09:02:06 -0800
Luis R. Rodriguez mcg...@do-not-panic.com wrote:

 Folks, what if I repurpose my patch to use the IFF_BRIDGE_NON_ROOT (or
 relabel to IFF_ROOT_BLOCK_DEF) flag for a default driver preference
 upon initialization so that root block will be used once the device
 gets added to a bridge. The purpose would be to avoid drivers from
 using the high MAC address hack, streamline to use a random MAC
 address thereby avoiding the possible duplicate address situation for
 IPv6. In the STP use case for these interfaces we'd just require
 userspace to unset the root block. I'd consider the STP use case the
 most odd of all. The caveat to this approach is 3.8 would be needed
 (or its the root block patches cherry picked) for base kernels older
 than 3.8.
 
 Stephen?
 
   Luis

Don't add IFF_ flags that adds yet another API hook into bridge.
Please only use the netlink/sysfs flags fields that already exist
for new features.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v2 2/4] net: enables interface option to skip IP

2014-02-18 Thread Stephen Hemminger
On Tue, 18 Feb 2014 13:19:15 -0800
Luis R. Rodriguez mcg...@do-not-panic.com wrote:

 Sure, but note that both the disable_ipv6 and accept_dad sysctl
 parameters are global. ipv4 and ipv6 interfaces are created upon
 NETDEVICE_REGISTER, which will get triggered when a driver calls
 register_netdev(). The goal of this patch was to enable an early
 optimization for drivers that have no need ever for ipv4 or ipv6
 interfaces.

The trick with ipv6 is to register the device, then have userspace
do the ipv6 sysctl before bringing the device up.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v2 1/4] bridge: enable interfaces to opt out from becoming the root bridge

2014-02-16 Thread Stephen Hemminger
On Fri, 14 Feb 2014 18:59:37 -0800
Luis R. Rodriguez mcg...@do-not-panic.com wrote:

 From: Luis R. Rodriguez mcg...@suse.com
 
 It doesn't make sense for some interfaces to become a root bridge
 at any point in time. One example is virtual backend interfaces
 which rely on other entities on the bridge for actual physical
 connectivity. They only provide virtual access.
 
 Device drivers that know they should never become part of the
 root bridge have been using a trick of setting their MAC address
 to a high broadcast MAC address such as FE:FF:FF:FF:FF:FF. Instead
 of using these hacks lets the interfaces annotate its intent and
 generalizes a solution for multiple drivers, while letting the
 drivers use a random MAC address or one prefixed with a proper OUI.
 This sort of hack is used by both qemu and xen for their backend
 interfaces.
 
 Cc: Stephen Hemminger step...@networkplumber.org
 Cc: bri...@lists.linux-foundation.org
 Cc: net...@vger.kernel.org
 Cc: linux-ker...@vger.kernel.org
 Signed-off-by: Luis R. Rodriguez mcg...@suse.com

This is already supported in a more standard way via the root
block flag.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 8% performance improved by change tap interact with kernel stack

2014-01-28 Thread Stephen Hemminger
On Tue, 28 Jan 2014 12:33:25 +0200
Michael S. Tsirkin m...@redhat.com wrote:

 On Tue, Jan 28, 2014 at 06:19:02PM +0800, Qin Chuanyu wrote:
  On 2014/1/28 17:41, Michael S. Tsirkin wrote:
  I think it's okay - IIUC this way we are processing xmit directly
  instead of going through softirq.
  Was meaning to try this - I'm glad you are looking into this.
  
  Could you please check latency results?
  
  netperf UDP_RR 512
  test model: VM-host-host
  
  modified before : 11108
  modified after  : 11480
  
  3% gained by this patch
  
  
  Nice.
  What about CPU utilization?
  It's trivially easy to speed up networking by
  burning up a lot of CPU so we must make sure it's
  not doing that.
  And I think we should see some tests with TCP as well, and
  try several message sizes.
  
  
  Yes, by burning up more CPU we could get better performance easily.
  So I have bond vhost thread and interrupt of nic on CPU1 while testing.
  
  modified before, the idle of CPU1 is 0%-1% while testing.
  and after modify, the idle of CPU1 is 2%-3% while testing
  
  TCP also could gain from this, but pps is less than UDP, so I think
  the improvement would be not so obviously.
 
 Still need to test this doesn't regress but overall looks convincing to me.
 Could you send a patch, accompanied by testing results for
 throughput latency and cpu utilization for tcp and udp
 with various message sizes?
 
 Thanks!
 

There are a couple of potential problems with this. The primary one is
that now you are violating the explicit assumptions about when
netif_receive_skb() can be called, and because of that it may break things
all over the place.

 *
 *  netif_receive_skb() is the main receive data processing function.
 *  It always succeeds. The buffer may be dropped during processing
 *  for congestion control or by the protocol layers.
 *
 *  This function may only be called from softirq context and interrupts
 *  should be enabled.

At a minimum, softirq (BH) and preempt must be disabled.

Another potential problem is that since a softirq is not used, the kernel stack
may be much larger.

Maybe a better way would be implementing some form of NAPI in the TUN device?


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH kvm-next 2/2] kvm: remove dead code

2013-12-30 Thread Stephen Hemminger
On Mon, 30 Dec 2013 09:37:15 +0200
Gleb Natapov g...@minantech.com wrote:

 On Sun, Dec 29, 2013 at 12:13:08PM -0800, Stephen Hemminger wrote:
  The function kvm_io_bus_read_cookie is defined but never used
  in current in-tree code.

 It was added recently by Cornelia (copied) with intention to be used in s390
 code. I assume the intention is still there.

The normal process is that the code is added in one patch just
before the code that uses it, rather than the "if we build it, they will come"
philosophy.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kvm-next 2/2] kvm: remove dead code

2013-12-29 Thread Stephen Hemminger
The function kvm_io_bus_read_cookie is defined but never used
in current in-tree code.

Signed-off-by: Stephen Hemminger step...@networkplumber.org


--- a/include/linux/kvm_host.h  2013-12-27 13:12:19.409612858 -0800
+++ b/include/linux/kvm_host.h  2013-12-27 13:12:42.261259369 -0800
@@ -172,8 +172,6 @@ int kvm_io_bus_write_cookie(struct kvm *
int len, const void *val, long cookie);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
void *val);
-int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-  int len, void *val, long cookie);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
--- a/virt/kvm/kvm_main.c   2013-12-27 13:12:19.413612796 -0800
+++ b/virt/kvm/kvm_main.c   2013-12-27 13:12:42.261259369 -0800
@@ -2937,33 +2937,6 @@ int kvm_io_bus_read(struct kvm *kvm, enu
return r < 0 ? r : 0;
 }
 
-/* kvm_io_bus_read_cookie - called under kvm->slots_lock */
-int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-  int len, void *val, long cookie)
-{
-   struct kvm_io_bus *bus;
-   struct kvm_io_range range;
-
-   range = (struct kvm_io_range) {
-   .addr = addr,
-   .len = len,
-   };
-
-   bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-
-   /* First try the device referenced by cookie. */
-   if ((cookie >= 0) && (cookie < bus->dev_count) &&
-   (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
-   if (!kvm_iodevice_read(bus->range[cookie].dev, addr, len,
-  val))
-   return cookie;
-
-   /*
-* cookie contained garbage; fall back to search and return the
-* correct cookie value.
-*/
-   return __kvm_io_bus_read(bus, &range, val);
-}
 
 /* Caller must hold slots_lock. */
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kvm-next 1/2] kvm: make local functions static

2013-12-29 Thread Stephen Hemminger
Running 'make namespacecheck' found lots of functions that
should be declared static, since only used in one file.

Signed-off-by: Stephen Hemminger step...@networkplumber.org


---
 include/linux/kvm_host.h |   16 
 virt/kvm/ioapic.c|2 +-
 virt/kvm/ioapic.h|1 -
 virt/kvm/kvm_main.c  |   35 ++-
 4 files changed, 19 insertions(+), 35 deletions(-)

--- a/include/linux/kvm_host.h  2013-12-27 11:59:56.160921447 -0800
+++ b/include/linux/kvm_host.h  2013-12-27 13:11:21.246512950 -0800
@@ -463,8 +463,6 @@ void kvm_exit(void);
 
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
-void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
-u64 last_generation);
 
 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 {
@@ -537,7 +535,6 @@ unsigned long gfn_to_hva_prot(struct kvm
 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
-void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
@@ -549,7 +546,6 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, g
 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
 
-void kvm_release_pfn_dirty(pfn_t pfn);
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
 void kvm_set_pfn_accessed(pfn_t pfn);
@@ -576,8 +572,6 @@ struct kvm_memory_slot *gfn_to_memslot(s
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
-void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
-gfn_t gfn);
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
@@ -605,8 +599,6 @@ int kvm_get_dirty_log(struct kvm *kvm,
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log);
 
-int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-  struct kvm_userspace_memory_region *mem);
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
bool line_status);
 long kvm_arch_vm_ioctl(struct file *filp,
@@ -654,8 +646,6 @@ void kvm_arch_check_processor_compat(voi
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 
-void kvm_free_physmem(struct kvm *kvm);
-
 void *kvm_kvzalloc(unsigned long size);
 void kvm_kvfree(const void *addr);
 
@@ -1097,12 +1087,6 @@ static inline void kvm_vcpu_set_in_spin_
 static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 {
 }
-
-static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
-{
-   return true;
-}
-
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 #endif
 
--- a/virt/kvm/kvm_main.c   2013-12-27 11:59:56.824911034 -0800
+++ b/virt/kvm/kvm_main.c   2013-12-27 13:11:21.246512950 -0800
@@ -95,6 +95,12 @@ static int hardware_enable_all(void);
 static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+static void update_memslots(struct kvm_memslots *slots,
+   struct kvm_memory_slot *new, u64 last_generation);
+
+static void kvm_release_pfn_dirty(pfn_t pfn);
+static void mark_page_dirty_in_slot(struct kvm *kvm,
+   struct kvm_memory_slot *memslot, gfn_t gfn);
 
 bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -553,7 +559,7 @@ static void kvm_free_physmem_slot(struct
free->npages = 0;
 }
 
-void kvm_free_physmem(struct kvm *kvm)
+static void kvm_free_physmem(struct kvm *kvm)
 {
struct kvm_memslots *slots = kvm->memslots;
struct kvm_memory_slot *memslot;
@@ -675,8 +681,9 @@ static void sort_memslots(struct kvm_mem
slots->id_to_index[slots->memslots[i].id] = i;
 }
 
-void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
-u64 last_generation)
+static void update_memslots(struct kvm_memslots *slots,
+   struct kvm_memory_slot *new,
+   u64 last_generation)
 {
if (new) {
int id = new->id;
@@ -924,8 +931,8 @@ int kvm_set_memory_region(struct kvm *kv
 }
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 
-int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-  struct kvm_userspace_memory_region *mem)
+static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region 
*mem)
 {
if (mem->slot >= KVM_USER_MEM_SLOTS)
return -EINVAL

Re: [RFC PATCH v2 1/1] Workqueue based vhost workers

2013-10-14 Thread Stephen Hemminger
On Sun, 13 Oct 2013 21:55:43 -0400
Bandan Das b...@redhat.com wrote:

 +
 + if (cmwq_worker) {
 + ret = vhost_wq_init();
 + if (ret) {
 + pr_info("Enabling wq based vhost workers failed! "
 +  "Switching to device based worker instead\n");
 + cmwq_worker = 0;
 + } else
 +   pr_info("Enabled workqueues based vhost workers\n");
 + }

Why keep two mechanisms (and two potential code paths to maintain)
when the only way vhost_wq_init() can fail is if out of memory?
You may have needed the messages and this during development but for
the final version just do it one way.

If alloc_workqueue fails, then the net_init function should propagate
the error code and fail as well.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: I/O port permission bit inheritance between threads

2013-05-21 Thread Stephen Hemminger
On Tue, 21 May 2013 13:01:18 +0300
Gleb Natapov g...@redhat.com wrote:

 On Tue, May 21, 2013 at 11:50:30AM +0200, Joerg Roedel wrote:
  Hey Stephen,
  
  On Mon, May 20, 2013 at 02:24:31PM -0700, Stephen Hemminger wrote:
   ioperm() inheritance across threads is different in KVM then when run
   on physical hardware.  The following program runs on physical hardware
   but get SEGV under KVM.
   
   It appears that the I/O permission bits are not shared between threads
   in the same way.
  
  Is this specific to SVM or do you see it on VMX too? My first guess
  would be that the KVM instruction emulator does not check to
  IO-permissions correctly, but that would affect VMX and SVM.
  
 The program segfaults on physical hardware:
 # ./a.out 
 joining
 waiting
 beeping
 Segmentation fault
 
 --
   Gleb.

The program had timing races, changing it slightly shows that.
# ./beep
beeping
done
oo
# ./beep --pre
joining
beeping
Segmentation fault
# ./beep --post
beeping
joining
done
oo
# 

/* Original Copyright 2011, Kees Cook k...@outflux.net, License: GPLv2 */
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <pthread.h>
#include <sys/io.h>

enum { NOFORK, BEFORE, AFTER } cases = NOFORK;
pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;

/*
 * beep - sound the PC speaker for four seconds.
 *
 * Runs either directly from main() or as a pthread start routine.  It takes
 * the file-level mutex `mut` before touching any I/O port, so a thread that
 * was created before main() finished its ioperm() calls blocks here until
 * main() releases the mutex.
 *
 * @arg: unused.
 *
 * Returns NULL.
 */
static void *beep(void *arg)
{
	unsigned char bits;

	pthread_mutex_lock(&mut);
	fprintf(stderr, "beeping\n");
	/* turn on speaker */
	bits = inb(0x61);
	bits |= 3;
	outb(bits, 0x61);

	/* set 1000 Hz frequency: two successive byte writes to port 0x42 */
	bits = 0xA9;
	outb(bits, 0x42);
	bits = 0x04;
	outb(bits, 0x42);

	/* listen to the beep */
	sleep(4);
	fprintf(stderr, "done\n");
	pthread_mutex_unlock(&mut);

	return NULL;
}

int main(int argc, char **argv) {
pthread_t tid;
unsigned char orig;

if (argc  1) {
if (!strcmp(argv[1], --pre)) cases = BEFORE;
if (!strcmp(argv[1], --post)) cases = AFTER;
}


pthread_mutex_lock(mut);
if (cases == BEFORE  pthread_create(tid, NULL, beep, NULL)) {
perror(pthread);
return 1;
}

/* gain access to speaker control port */
if (ioperm(0x61, 0x61, 1)  0) {
perror(0x61);
return 1;
}

/* record original value */
orig = inb(0x61);

/* gain access to speaker frequency port */
if (ioperm(0x42, 0x42, 1)  0) {
perror(0x42);
return 2;
}
pthread_mutex_unlock(mut);

if (cases == AFTER  pthread_create(tid, NULL, beep, NULL)) {
perror(pthread);
return 1;
}

if (cases == NOFORK)
beep(NULL);
else {
fprintf(stderr, joining\n);
pthread_join(tid, NULL);
}

/* restore speaker bits to turn off speaker */
outb(orig, 0x61);
fprintf(stderr, oo\n);
return 0;
}
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


I/O port permission bit inheritance between threads

2013-05-20 Thread Stephen Hemminger
ioperm() inheritance across threads is different in KVM than when run
on physical hardware.  The following program runs on physical hardware
but gets SEGV under KVM.

It appears that the I/O permission bits are not shared between threads
in the same way.

/* Original Copyright 2011, Kees Cook k...@outflux.net, License: GPLv2 */
#include <unistd.h>
#include <stdio.h>
#include <pthread.h>
#include <sys/io.h>

/*
 * Thread start routine: wait one second, then sound the PC speaker for
 * four seconds.
 *
 * The initial sleep gives main(), which starts this thread first, time to
 * make its ioperm() calls before any port is accessed here.
 *
 * arg is unused. Always returns NULL.
 */
static void *beep(void *arg)
{
	unsigned char bits;

	(void)arg;

	fprintf(stderr, "waiting\n");
	sleep(1);

	fprintf(stderr, "beeping\n");

	/* turn on speaker: set the two low bits of port 0x61 */
	bits = inb(0x61);
	bits |= 3;
	outb(bits, 0x61);

	/*
	 * set 1000 Hz frequency: two byte writes to port 0x42, 0xA9 then 0x04
	 * (presumably the low and high byte of a 0x04A9 timer divisor)
	 */
	bits = 0xA9;
	outb(bits, 0x42);
	bits = 0x04;
	outb(bits, 0x42);

	/* listen to the beep */
	sleep(4);
	fprintf(stderr, "done\n");

	return NULL;
}

/*
 * Start the beep thread first, then request I/O port permissions from the
 * main thread. The thread-before-ioperm() ordering is deliberate: it is
 * the case this program exists to demonstrate.
 *
 * NOTE(review): ioperm(2) describes the permission bits as applying to the
 * calling thread - confirm whether a thread created before the call should
 * be expected to see them at all.
 *
 * Returns 0 on success, 1 if thread creation or access to port 0x61 fails,
 * 2 if access to port 0x42 fails.
 */
int main(void)
{
	pthread_t tid;
	unsigned char orig;

	if (pthread_create(&tid, NULL, beep, NULL)) {
		perror("pthread");
		return 1;
	}

	/*
	 * gain access to speaker control port; the second ioperm() argument
	 * is a port count, so request exactly one port
	 */
	if (ioperm(0x61, 1, 1) < 0) {
		perror("0x61");
		return 1;
	}
	/* remember the original value so the speaker can be switched off */
	orig = inb(0x61);

	/* gain access to speaker frequency port (again a single port) */
	if (ioperm(0x42, 1, 1) < 0) {
		perror("0x42");
		return 2;
	}

	fprintf(stderr, "joining\n");
	pthread_join(tid, NULL);

	/* restore speaker bits to turn off speaker */
	outb(orig, 0x61);
	fprintf(stderr, "done\n");
	return 0;
}
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Locking problem in KVM (or MM)

2013-01-16 Thread Stephen Hemminger
Lockdep splat, appears to be KVM related with 3.8-rc1

[24428.429305] 
[24428.429308] =
[24428.429309] [ INFO: possible recursive locking detected ]
[24428.429312] 3.8.0-rc1-net-next+ #4 Tainted: GW   
[24428.429312] -
[24428.429313] kvm/7355 is trying to acquire lock:
[24428.429314]  (anon_vma-rwsem){..}, at: [811373f8] mm_take_all
_locks+0x102/0x14e
[24428.429321] 
[24428.429321] but task is already holding lock:
[24428.429322]  (anon_vma-rwsem){..}, at: [811373f8] mm_take_all
_locks+0x102/0x14e
[24428.429325] 
[24428.429325] other info that might help us debug this:
[24428.429326]  Possible unsafe locking scenario:
[24428.429326] 
[24428.429327]CPU0
[24428.429327]
[24428.429328]   lock(anon_vma-rwsem);
[24428.429329]   lock(anon_vma-rwsem);
[24428.429331] 
[24428.429331]  *** DEADLOCK ***
[24428.429331] 
[24428.429332]  May be due to missing lock nesting notation
[24428.429332] 
[24428.429333] 4 locks held by kvm/7355:
[24428.429334]  #0:  (mm-mmap_sem){++}, at: [81148b64] do_mmu_no
tifier_register+0x65/0x12c
[24428.429338]  #1:  (mm_all_locks_mutex){+.+...}, at: [8113732f] mm_t
ake_all_locks+0x39/0x14e
[24428.429341]  #2:  (mapping-i_mmap_mutex){+.+...}, at: [81137397] 
mm_take_all_locks+0xa1/0x14e
[24428.429344]  #3:  (anon_vma-rwsem){..}, at: [811373f8] mm_tak
e_all_locks+0x102/0x14e
[24428.429347] 
[24428.429347] stack backtrace:
[24428.429348] Pid: 7355, comm: kvm Tainted: GW3.8.0-rc1-net-next+ #
4
[24428.429349] Call Trace:
[24428.429354]  [810b82f0] __lock_acquire+0x569/0xe12
[24428.429356]  [810b9025] lock_acquire+0xd7/0x123
[24428.429358]  [811373f8] ? mm_take_all_locks+0x102/0x14e
[24428.429361]  [814e490a] down_write+0x49/0x58
[24428.429363]  [811373f8] ? mm_take_all_locks+0x102/0x14e
[24428.429365]  [814e44d2] ? _mutex_lock_nest_lock+0x40/0x45
[24428.429366]  [811373f8] mm_take_all_locks+0x102/0x14e
[24428.429369]  [81148b6c] do_mmu_notifier_register+0x6d/0x12c
[24428.429371]  [81148c50] mmu_notifier_register+0x13/0x15
[24428.429373]  [81004ed9] kvm_dev_ioctl+0x277/0x3e4
[24428.429376]  [8116471e] vfs_ioctl+0x26/0x39
[24428.429378]  [8116501b] do_vfs_ioctl+0x40f/0x452
[24428.429381]  [810f759f] ? time_hardirqs_off+0x15/0x2a
[24428.429383]  [814e6d63] ? error_sti+0x5/0x6
[24428.429385]  [810b5da0] ? trace_hardirqs_off_caller+0x3f/0x9e
[24428.429388]  [8116d353] ? fget_light+0x3d/0x9d
[24428.429389]  [811650b5] sys_ioctl+0x57/0x86
[24428.429393]  [812c23de] ? trace_hardirqs_on_thunk+0x3a/0x3f
[24428.429395]  [814ed582] system_call_fastpath+0x16/0x1b
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v3 1/3] virtio-net: separate fields of sending/receiving queue from virtnet_info

2012-12-07 Thread Stephen Hemminger
Minor style issue reported by checkpatch which can be fixed after merge.
Although sizeof is actually an operator in C, it is considered correct
style to treat it as a function.


WARNING: sizeof hdr->hdr should be sizeof(hdr->hdr)
#293: FILE: drivers/net/virtio_net.c:395:
+   sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);

WARNING: sizeof hdr->mhdr should be sizeof(hdr->mhdr)
#552: FILE: drivers/net/virtio_net.c:641:
+   sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr);

WARNING: sizeof hdr->hdr should be sizeof(hdr->hdr)
#555: FILE: drivers/net/virtio_net.c:643:
+   sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v3 0/3] Multiqueue support in virtio-net

2012-12-07 Thread Stephen Hemminger
On Fri, 07 Dec 2012 15:35:56 -0500 (EST)
David Miller da...@davemloft.net wrote:

 From: Jason Wang jasow...@redhat.com
 Date: Sat,  8 Dec 2012 01:04:54 +0800
 
  This series is an update version (hope the final version) of multiqueue
  (VIRTIO_NET_F_MQ) support in virtio-net driver. All previous comments were
  addressed, the work were based on Krishna Kumar's work to let virtio-net use
  multiple rx/tx queues to do the packets reception and transmission. 
  Performance
  test show the aggregate latency were increased greately but may get some
  regression in small packet transmission. Due to this, multiqueue were 
  disabled
  by default. If user want to benefit form the multiqueue, ethtool -L could be
  used to enable the feature.
  
  Please review and comments.
  
  A protype implementation of qemu-kvm support could by found in
  git://github.com/jasowang/qemu-kvm-mq.git. To start a guest with two 
  queues, you
  could specify the queues parameters to both tap and virtio-net like:
  
  ./qemu-kvm -netdev tap,queues=2,... -device virtio-net-pci,queues=2,...
  
  then enable the multiqueue through ethtool by:
  
  ethtool -L eth0 combined 2
 
 It seems like most, if not all, of the feedback given for this series
 has been addressed by Jason.
 
 Can I get some ACKs?

Other than the minor style nit in the first patch, I see no issues.
This is really needed by Virtual Routers.

Acked-by: Stephen Hemminger shemmin...@vyatta.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Question]About KVM network zero-copy feature!

2012-08-11 Thread Stephen Hemminger
On Fri, 10 Aug 2012 11:34:32 +0800
Peter Huang(Peng) peter.huangp...@huawei.com wrote:

 Hi,All
 
 I searched from git-log, and found that until now we have vhost TX zero-copy 
 experiment feature, how
 about RX zero-copy?
 
 For XEN, net-back also only has TX zero-copy, Is there any reason that RX 
 zero-copy still not implemented?
 
There is no guarantee that packet will ever be read by receiver. This means 
zero-copy could
create memory back pressure stalls.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next RFC V5 5/5] virtio_net: support negotiating the number of queues through ctrl vq

2012-07-06 Thread Stephen Hemminger
On Fri, 06 Jul 2012 11:20:06 +0800
Jason Wang jasow...@redhat.com wrote:

 On 07/05/2012 08:51 PM, Sasha Levin wrote:
  On Thu, 2012-07-05 at 18:29 +0800, Jason Wang wrote:
  @@ -1387,6 +1404,10 @@ static int virtnet_probe(struct virtio_device *vdev)
   if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
   vi-has_cvq = true;
 
  +   /* Use single tx/rx queue pair as default */
  +   vi-num_queue_pairs = 1;
  +   vi-total_queue_pairs = num_queue_pairs;
  The code is using this default even if the amount of queue pairs it
  wants was specified during initialization. This basically limits any
  device to use 1 pair when starting up.
 
 
 Yes, currently the virtio-net driver would use 1 txq/txq by default 
 since multiqueue may not outperform in all kinds of workload. So it's 
 better to keep it as default and let user enable multiqueue by ethtool -L.
 

I would prefer that the driver sized the number of queues based on the number
of online CPUs. That is what real hardware does. What kind of workload
are you doing? If it is some DBMS benchmark then maybe the issue is that
some CPUs need to be reserved.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] tun: experimental zero copy tx support

2012-05-14 Thread Stephen Hemminger
On Sun, 13 May 2012 18:52:06 +0300
Michael S. Tsirkin m...@redhat.com wrote:

 + /* Userspace may produce vectors with count greater than
 +  * MAX_SKB_FRAGS, so we need to linearize parts of the skb
 +  * to let the rest of data to be fit in the frags.
 +  */
Rather than complex partial code, just go through slow path for
requests with too many frags (or for really small requests).
Creating mixed skb's seems too easy to get wrong.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next PATCH v0 0/5] Series short description

2012-03-19 Thread Stephen Hemminger
On Mon, 19 Mar 2012 18:38:08 -0400 (EDT)
David Miller da...@davemloft.net wrote:

 From: John Fastabend john.r.fastab...@intel.com
 Date: Sun, 18 Mar 2012 23:51:45 -0700
 
  This series is a follow up to this thread:
  
  http://www.spinics.net/lists/netdev/msg191360.html
 
 Can the interested parties please review this series?
 
 I'm willing to apply this right now if it looks OK, but if
 it needs more revisions we'll have to defer.

Please don't rush this into this merge window. It needs more than
1 full day of review.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next PATCH v0 0/5] Series short description

2012-03-19 Thread Stephen Hemminger
On Mon, 19 Mar 2012 19:49:50 -0700
John Fastabend john.r.fastab...@intel.com wrote:

 On 3/19/2012 5:35 PM, David Miller wrote:
  From: John Fastabend john.r.fastab...@intel.com
  Date: Mon, 19 Mar 2012 17:27:00 -0700
  
  Dave, its probably fine to push this to 3.5 then.
  
  Fair enough.
 
 Stephen, please let me know if you see any issues though
 because without these we have no way to forward packets
 correctly in the embedded switch. So we can't really
 use SR-IOV and virtual interfaces together correctly. And
 the macvlan device in passthru mode is putting the device
 in promiscuous mode which isn't great either.
 
 .John

I am more worried about evaluating ABI compatibility with older
utilities.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v1 4/4] ixgbe: enable FDB netdevice ops

2012-03-09 Thread Stephen Hemminger

 Enable FDB ops on ixgbe when in SR-IOV mode.
 
 Signed-off-by: John Fastabend john.r.fastab...@intel.com

Will all this break anything on the vf client? What if the vf is running
a bridge?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-02-29 Thread Stephen Hemminger
On Wed, 29 Feb 2012 09:25:56 -0800
John Fastabend john.r.fastab...@intel.com wrote:

 On 2/29/2012 5:56 AM, Jamal Hadi Salim wrote:
  On Tue, 2012-02-28 at 20:40 -0800, John Fastabend wrote:
  
  OK back to this. The last piece is where to put these messages...
  we could take PF_ROUTE:RTM_*NEIGH
 
   PF_ROUTE:RTM_NEWNEIGH - Add a new FDB entry to an offloaded
   switch.
   PF_ROUTE:RTM_DELNEIGH - Delete a FDB entry from an offlaoded
   switch.
   PF_ROUTE:RTM_GETNEIGH - Dumps the embedded FDB table
 
  
  Why RTM_*NEIGH? RTM tends to map to Route/L3 and NEIGH tends to map
  to ndisc or ARP both tied to IP address resolution. While both ARP/Ndisc
  may play a role in the user space app populating the FDB, i dont think
  they are necessary players.
  Learning could be via a table entry miss and packet redirect to user
  space.
  So my suggestion is to use FDB_*ENTRY for names
   
 
 Well I think NETLINK_ROUTE is the most correct type to use in this
 case. Per netlink.h its for routing and device hooks.
 
 #define NETLINK_ROUTE   0   /* Routing/device hook
   */
 
 And NETLINK_ROUTE msg_types use the RTM_* prefix. The _*NEIGH postfix
 were merely a copy from the SW BRIDGE code paths. How about,
 
 PF_BRIDGE:RTM_FDB_NEWENTRY
 PF_BRIDGE:RTM_FDB_DELENTRY
 PF_BRIDGE:RTM_FDB_GETENTRY
 
 And a new group RTNLGRP_FDB. Also using NETLINK_ROUTE gives the correct
 rtnl locking semantics for free.
 
  The neighbor code is using the PF_UNSPEC protocol type so we won't
  collide with these unless someone was using PF_ROUTE and relying on
  falling back to PF_UNSPEC however I couldn't find any programs that
  did this iproute2 certainly doesn't. And the bridge pieces are using
  PF_BRIDGE so no collision there.
  
  They have to be different calls from the calls that talk to the s/ware
  bridge. In my opinion, as controversial as this may sound, you need to
  be flexible enough that some vendor can replace these calls with
  proprietary calls which are more efficient for their hardware. So a
  plugin to replace these calls in the user space code would be a 
  good idea. Alternatively, you could make that something they do at
  the driver level i.e from user space to kernel it is hardware, please
  addthistotheFDBtable() call and the implementation of that could be
  proprietary to the specific hardware.
  
 
 Agreed. I think adding some ndo_ops for bridging offloads here would
 work. For example the DSA infrastructure and/or macvlan devices might
 need this. Along the lines of extending this RFC,
 
 [RFC] hardware bridging support for DSA switches
 http://patchwork.ozlabs.org/patch/16578/

I want to see a unified API so that user space control applications (RSTP, 
TRILL?)
can use one set of netlink calls for both software bridge and hardware offloaded
bridges.  Does this proposal meet that requirement?

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-02-14 Thread Stephen Hemminger
On Tue, 14 Feb 2012 10:57:04 -0800
John Fastabend john.r.fastab...@intel.com wrote:

 On 2/14/2012 5:18 AM, jamal wrote:
  On Mon, 2012-02-13 at 07:13 -0800, John Fastabend wrote:
  
  The use case here is multiple VFs but the same solution should work with
  multiple PFs as well. FDB controls should be independent of how the ports
  are exposed VFs, PFs, VMDQ/queue pairs, macvlan, etc.
  
  Makes sense.
  
  With events and ADD/DEL/GET FDB controls we can solve both cases. This also
  solves Roopa's case with macvlan where she wants to add additional 
  addresses
  to macvlan ports.
  
  Not familiar with that issue - I'll prowl the list.
 
 Roopa was likely on the right track here,
 
 http://patchwork.ozlabs.org/patch/123064/
 
 But I think the proper syntax is to use the existing PF_BRIDGE:RTM_XXX
 netlink messages. And if possible drive this without extending ndo_ops.
 
 An ideal user space interaction IMHO would look like,
 
 [root@jf-dev1-dcblab iproute2]# ./br/br fdb add 52:e5:62:7b:57:88 dev veth10
 [root@jf-dev1-dcblab iproute2]# ./br/br fdb
 portmac addrflags
 veth2   36:a6:35:9b:96:c4   local
 veth4   aa:54:b0:7b:42:ef   local
 veth0   2a:e8:5c:95:6c:1b   local
 veth6   6e:26:d5:43:a3:36   local
 veth0   f2:c1:39:76:6a:fb
 veth8   4e:35:16:af:87:13   local
 veth10  52:e5:62:7b:57:88   static
 veth10  aa:a9:35:21:15:c4   local
 [root@jf-dev1-dcblab iproute2]# ./br/br fdb add dev eth3 to 52:e5:62:7b:57:88
 RTNETLINK answers: Invalid argument

I am going to put bridge (nameclash with br) tool into iproute2 (soon).
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-02-10 Thread Stephen Hemminger
On Fri, 10 Feb 2012 10:18:31 -0500
jamal h...@cyberus.ca wrote:

 Hi John,
 
 I went backwards to summarize at the top after going through your email.
 
 TL;DR version 0.1: 
 you provide a good use case where it makes sense to do things in the
 kernel. IMO, you could make the same arguement if your embedded switch
 could do ACLs, IPv4 forwarding etc. And the kernel bloats.
 I am always bigoted to move all policy control to user space instead of
 bloating in the kernel.
 
  
 On Thu, 2012-02-09 at 20:14 -0800, John Fastabend wrote:
 
   
   Hi Jamal,
   
   The user space app in this case would listen for FDB updates to the SW
   bridge and then mirror them at the embedded NIC. In this case it seems
   easier to just add a notifier chain and let the kernel keep these in
   sync. Otherwise we need a daemon in user space to replicate these.
   
 
 A user space daemon if you need to ensure synchronization. Thats what i
 meant when i said there was a disadvantage over the simple case when
 the goal is always to synchronize.
 
   On the other hand if you could make the same RTM_NEWNEIGH, RTM_DELNEIGH,
   and RTM_GETNEIGH work for the bridge, embedded bridge, and macvlan you
   would have one common interface to drive these. But the bridge already
   has this protocol/msgtype so that would require either some demux or
   new protocol/msgtype pairs to be created. 
   
 
 The bridge is very netlink friendly these days. Given the rest of the
 network stack (*NEIGH* you mention above) talks netlink to user space
 it should be workable. 
 
   Let me think on it. I'm tempted by the simplicity of adding notifier
   hooks though.
 
 If something is missing bridge-side it may need to be added (as Per
 Stephen's comment) - i just took it one further indicating those
 notifiers need to also netlink-speak
 
 
  Actually because the bridge is adding/removing fdb entries dynamically
  maybe its best this gets done in kernel. Here's the example case,
 
 [..]
 
  
  With the flow by letters above hope this is not too difficult to follow.
 
  (A) veth0 a virtual device transmits packet destined for ethx.y
  (B) SW bridge receives frames and updates FDB flooding to C
  (C) eth0 the PF in this case sends the frame to the HW backed by the
  embedded bridge
 
 Following so far.
 Can you have more than one PF per embedded switch? Or is the intent here
 purely to do VMs/VF separation?
 
  (D) The HW embedded switch has a static entry for ethx.y and forwards
  the frame to the VF or if its a broadcast frame also floods it to
  the wire and ethx.y
 
 nod.
 
  (E) ethx.y receives the frame and generates a response to the dest mac of
  veth0
 
 nod.
 Since you said in #D the entries in the switch are static, I am assuming
 at this point neither ethx.y nor veth0 exist in the embedded FDB.
 
  Now here is the potential issue,
  
  (G) The frame transmitted from ethx.y with the destination address of
  veth0 but the embedded switch is not a learning switch. If the FDB
  update is done in user space its possible (likely?) that the FDB
  entry for veth0 has not been added to the embedded switch yet. 
 
 Ok, got it - so the catch here is the switch is not capable of learning.
 I think this depends on where learning is done. Your intent is to
 use the S/W bridge as something that does the learning for you i.e in
 the kernel. This makes the s/w bridge part of MUST-have-for-this-to-run.
 And that maybe the case for your use case.
 
 What if I dont wanna run the S/W bridge at all?
 Ive been making a point that with a simple knob(Stephen doesn like to
 add such a knob), the SW bridge could defer learning to user space. 
 [This way you can add a lot of richness e.g on ACLs such as restricting
 what MAC addresses etc are allowed to talk to which ones etc.].
 But if bypass the s/w bridge all together and learn in user space
 or have a static config in which i populate the embedded switch, i dont
 see the issue.
 
  Now
  we either have to flood the frame which is not horrible but not
  ideal or worse if the embedded switch does not support flooding send
  it to the wire and veth0 never receives it. 
 
 If it is a switch it has to flood, no? Otherwise it sounds broken.
 
  If the SW bridge pushes
  the FDB update down into the embedded switch the address is for
  sure in the embedded switches forwarding tables and the switching
  works as expected.
 
 Yes, there is a small gap between the s/w bridge learning and the
 synchronization happening to the embedded nic switch. That gap gets
 larger if you defer learning to user space. But like you said earlier,
 during that gap packets are flooded - and do you care if the
 synchronization doesnt happen immediately?
 
  So to handle this case correctly its probably best IMHO to use a notifier
  hook. Having a RTM_GETNEIGH for the embedded switch implemented though
  would be nice for dumping the FDB of the embedded switch and SET/DEL
  could be used to 

Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-02-09 Thread Stephen Hemminger
On Thu, 09 Feb 2012 09:36:47 -0800
John Fastabend john.r.fastab...@intel.com wrote:

 But the device features makes it easy for user space to learn that the device
 supports this sort of offload. Now if all SR-IOV devices support this then it
 doesn't matter but I thought there were SR-IOV devices that didn't do any
 switching? I'll dig through the SR-IOV drivers to check there are not too
 many of them.

If user space needs to know then the OS is not designed properly.
The purpose of the network device is to abstract all those details, and more 
and more
of them are bleeding through. This makes writing management applications harder 
and makes
things dependent on features that may or may not be present. The best design is 
when
the change is invisible.

 By netlink_notifier do you mean adding a notifier_block and using 
 atomic_notifier_call_chain()
 probably in rtnl_notify()? Then drivers could register with the notifier 
 chain with
 atomic_notifier_chain_register() and receive the events correctly. Or did I 
 miss
 some notifier chain that already exists?

Yes, that is what I mean. The callbacks you need may or may not already be 
present.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware

2012-02-08 Thread Stephen Hemminger
On Wed, 08 Feb 2012 19:22:06 -0800
John Fastabend john.r.fastab...@intel.com wrote:

 Propagate software FDB table into hardware uc, mc lists when
 the NETIF_F_HW_FDB is set.
 
 This resolves the case below where an embedded switch is used
 in hardware to do inter-VF or VF-PF switching. This patch
 pushes the FDB entry (specifically the MAC address) into the
 embedded switch with dev_add_uc and dev_add_mc so the switch
 learns about the software bridge.
 
 
   veth0  veth2
 |  |
   
   |  bridge0 |    software bridging
   
/
/
   ethx.y  ethx
 VF PF
  \ \   propagate FDB entries to HW
  \ \
   
   |  Embedded Bridge | hardware offloaded switching
   
 
 This is only an RFC couple more changes are needed.
 
 (1) Optimize HW FDB set/del to only walk list if an FDB offloaded
 device is attached. Or decide it doesn't matter from unlikely()
 path.
 
 (2) Is it good enough to just call dev_uc_{add|del} or
 dev_mc_{add|del}? Or do some devices really need a new netdev
 callback to do this operation correctly. I think it should be
 good enough as is.
 
 (3) wrapped list walk in rcu_read_lock() just in case maybe every
 case is already inside rcu_read_lock()/unlock().
 
 Also this is in response to this thread regarding the macvlan and
 exposing rx filters posting now to see if folks think this is the
 right idea and if it will resolve at least the bridge case.
 
 http://lists.openwall.net/netdev/2011/11/08/135
 
 Signed-off-by: John Fastabend john.r.fastab...@intel.com
 ---
 
  include/linux/netdev_features.h |2 ++
  net/bridge/br_fdb.c |   34 ++
  2 files changed, 36 insertions(+), 0 deletions(-)
 
 diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
 index 77f5202..5936fae 100644

Rather than yet another device feature, I would rather use netlink_notifier
callback. The notifier is more general and generic without messing with 
internals
of bridge.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost-net: add module alias (v2.1)

2012-01-16 Thread Stephen Hemminger
On Mon, 16 Jan 2012 12:26:45 +
Alan Cox a...@linux.intel.com wrote:

   ACKs, NACKs?  What is happening here?
  
  I would like an Ack from Alan Cox who switched vhost-net
  to a dynamic minor in the first place, in commit
  79907d89c397b8bc2e05b347ec94e928ea919d33.
 
 Sorry dev...@lanana.org isn't yet back from the kernel hack incident.
 
 I don't read netdev so someone needs to summarise the issue and send me
 a copy of the patch to look at.
 
 Alan

Subject: vhost-net: add module alias (v2.1)

By adding some module aliases, programs (or users) won't have to explicitly
call modprobe. Vhost-net will always be available if built into the kernel.
It does require assigning a permanent minor number for depmod to work.

Also:
  - use C99 style initialization.
  - add missing entry in documentation for loop-control

Signed-off-by: Stephen Hemminger shemmin...@vyatta.com

---
2.1 - add missing documentation for loop control as well

 Documentation/devices.txt  |3 +++
 drivers/vhost/net.c|8 +---
 include/linux/miscdevice.h |1 +
 3 files changed, 9 insertions(+), 3 deletions(-)

--- a/drivers/vhost/net.c   2012-01-12 14:14:25.681815487 -0800
+++ b/drivers/vhost/net.c   2012-01-12 18:09:56.810680816 -0800
@@ -856,9 +856,9 @@ static const struct file_operations vhos
 };
 
 static struct miscdevice vhost_net_misc = {
-   MISC_DYNAMIC_MINOR,
-   "vhost-net",
-   &vhost_net_fops,
+   .minor = VHOST_NET_MINOR,
+   .name = "vhost-net",
+   .fops = &vhost_net_fops,
 };
 
 static int vhost_net_init(void)
@@ -879,3 +879,5 @@ MODULE_VERSION("0.0.1");
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Michael S. Tsirkin");
 MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
+MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
+MODULE_ALIAS("devname:vhost-net");
--- a/include/linux/miscdevice.h2012-01-12 14:14:25.725815981 -0800
+++ b/include/linux/miscdevice.h2012-01-12 18:09:56.810680816 -0800
@@ -42,6 +42,7 @@
 #define AUTOFS_MINOR   235
 #define MAPPER_CTRL_MINOR  236
 #define LOOP_CTRL_MINOR237
+#define VHOST_NET_MINOR238
 #define MISC_DYNAMIC_MINOR 255
 
 struct device;
--- a/Documentation/devices.txt 2012-01-12 14:14:25.701815712 -0800
+++ b/Documentation/devices.txt 2012-01-12 18:09:56.814680860 -0800
@@ -447,6 +447,9 @@ Your cooperation is appreciated.
234 = /dev/btrfs-controlBtrfs control device
235 = /dev/autofs   Autofs control device
236 = /dev/mapper/control   Device-Mapper control device
+   237 = /dev/loop-control Loopback control device
+   238 = /dev/vhost-netHost kernel accelerator for virtio net
+
240-254 Reserved for local use
255 Reserved for MISC_DYNAMIC_MINOR
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost-net: add module alias

2012-01-11 Thread Stephen Hemminger
On Wed, 11 Jan 2012 15:43:42 +0800
Amos Kong kongjian...@gmail.com wrote:

 On Wed, Jan 11, 2012 at 12:54 PM, Stephen Hemminger
 shemmin...@vyatta.comwrote:
 
  By adding the a module alias, programs (or users) won't have to explicitly
  call modprobe. Vhost-net will always be available if built into the kernel.
  It does require assigning a permanent minor number for depmod to work.
  Choose one next to TUN since this driver is related to it.
 
  Also, use C99 style initialization.
 
  Signed-off-by: Stephen Hemminger shemmin...@vyatta.com
 
  ---
   drivers/vhost/net.c|8 +---
   include/linux/miscdevice.h |1 +
   2 files changed, 6 insertions(+), 3 deletions(-)
 
:
 /*
  *  These allocations are managed by dev...@lanana.org. If you use an
  *  entry that is not in assigned your entry may well be moved and
  *  reassigned, or set dynamic if a fixed value is not justified.
  */

I didn't think that mailing address was used any more. Like many places
in the kernel, the comment looked like a historical leftover.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost-net: add module alias

2012-01-11 Thread Stephen Hemminger
On Wed, 11 Jan 2012 11:07:47 +0400
Michael Tokarev m...@tls.msk.ru wrote:

 On 11.01.2012 08:54, Stephen Hemminger wrote:
  By adding the a module alias, programs (or users) won't have to explicitly
  call modprobe. Vhost-net will always be available if built into the kernel.
  It does require assigning a permanent minor number for depmod to work.
  Choose one next to TUN since this driver is related to it.
 
 Why do you think a statically-allocated device number will do any good
 at all?  Static /dev is gone almost completely, at least on the systems
 where whole virt stuff makes any sense, so you don't have pre-created
 vhost-net device anymore, and hence this allocation makes no sense.
 Just IMHO anyway.

The statically allocated device number is required for the udev/module
autoloading to work. Probably the udev infrastructure needs a consistent
number to hang off of.

It looks like:
  * driver adds MODULE_ALIAS() for devname and character device
  * depmod scans modules and creates modules.devname (in /lib/modules)
  * udev uses modules.devname to autoload the module

$ /sbin/modinfo vhost_net
filename:   /lib/modules/3.2.0-net+/kernel/drivers/vhost/vhost_net.ko
alias:  devname:vhost-net
alias:  char-major-10-201
description:Host kernel accelerator for virtio net
...

See also: https://lkml.org/lkml/2010/5/21/134



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] vhost-net: add module alias (v2)

2012-01-11 Thread Stephen Hemminger
By adding the correct module alias, programs won't have to explicitly
call modprobe. Vhost-net will always be available if built into the kernel.
It does require assigning a permanent minor number for depmod to work.
Choose one next to TUN since this driver is related to it.

Also, use C99 style initialization.

Signed-off-by: Stephen Hemminger shemmin...@vyatta.com

---
v2 - document minor number and make sure to not overlap

 Documentation/devices.txt  |2 ++
 drivers/vhost/net.c|8 +---
 include/linux/miscdevice.h |1 +
 3 files changed, 8 insertions(+), 3 deletions(-)

--- a/drivers/vhost/net.c   2012-01-10 10:56:58.883179194 -0800
+++ b/drivers/vhost/net.c   2012-01-10 19:48:23.650225892 -0800
@@ -856,9 +856,9 @@ static const struct file_operations vhos
 };
 
 static struct miscdevice vhost_net_misc = {
-   MISC_DYNAMIC_MINOR,
-   vhost-net,
-   vhost_net_fops,
+   .minor = VHOST_NET_MINOR,
+   .name = vhost-net,
+   .fops = vhost_net_fops,
 };
 
 static int vhost_net_init(void)
@@ -879,3 +879,5 @@ MODULE_VERSION(0.0.1);
 MODULE_LICENSE(GPL v2);
 MODULE_AUTHOR(Michael S. Tsirkin);
 MODULE_DESCRIPTION(Host kernel accelerator for virtio net);
+MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
+MODULE_ALIAS(devname:vhost-net);
--- a/include/linux/miscdevice.h2012-01-10 10:56:59.779189436 -0800
+++ b/include/linux/miscdevice.h2012-01-11 09:13:20.803694316 -0800
@@ -42,6 +42,7 @@
 #define AUTOFS_MINOR   235
 #define MAPPER_CTRL_MINOR  236
 #define LOOP_CTRL_MINOR237
+#define VHOST_NET_MINOR238
 #define MISC_DYNAMIC_MINOR 255
 
 struct device;
--- a/Documentation/devices.txt 2012-01-10 10:56:53.399116518 -0800
+++ b/Documentation/devices.txt 2012-01-11 09:12:49.251197653 -0800
@@ -447,6 +447,8 @@ Your cooperation is appreciated.
234 = /dev/btrfs-controlBtrfs control device
235 = /dev/autofs   Autofs control device
236 = /dev/mapper/control   Device-Mapper control device
+   237 = /dev/vhost-netHost kernel accelerator for virtio net
+
240-254 Reserved for local use
255 Reserved for MISC_DYNAMIC_MINOR
 


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] vhost-net: add module alias (v2.1)

2012-01-11 Thread Stephen Hemminger
Subject: vhost-net: add module alias (v2.1)

By adding some module aliases, programs (or users) won't have to explicitly
call modprobe. Vhost-net will always be available if built into the kernel.
It does require assigning a permanent minor number for depmod to work.

Also:
  - use C99 style initialization.
  - add missing entry in documentation for loop-control

Signed-off-by: Stephen Hemminger shemmin...@vyatta.com

---
2.1 - add missing documentation for loop control as well

 Documentation/devices.txt  |3 +++
 drivers/vhost/net.c|8 +---
 include/linux/miscdevice.h |1 +
 3 files changed, 9 insertions(+), 3 deletions(-)

--- a/drivers/vhost/net.c   2012-01-10 10:56:58.883179194 -0800
+++ b/drivers/vhost/net.c   2012-01-10 19:48:23.650225892 -0800
@@ -856,9 +856,9 @@ static const struct file_operations vhos
 };
 
 static struct miscdevice vhost_net_misc = {
-   MISC_DYNAMIC_MINOR,
-   vhost-net,
-   vhost_net_fops,
+   .minor = VHOST_NET_MINOR,
+   .name = vhost-net,
+   .fops = vhost_net_fops,
 };
 
 static int vhost_net_init(void)
@@ -879,3 +879,5 @@ MODULE_VERSION(0.0.1);
 MODULE_LICENSE(GPL v2);
 MODULE_AUTHOR(Michael S. Tsirkin);
 MODULE_DESCRIPTION(Host kernel accelerator for virtio net);
+MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
+MODULE_ALIAS(devname:vhost-net);
--- a/include/linux/miscdevice.h2012-01-10 10:56:59.779189436 -0800
+++ b/include/linux/miscdevice.h2012-01-11 09:13:20.803694316 -0800
@@ -42,6 +42,7 @@
 #define AUTOFS_MINOR   235
 #define MAPPER_CTRL_MINOR  236
 #define LOOP_CTRL_MINOR237
+#define VHOST_NET_MINOR238
 #define MISC_DYNAMIC_MINOR 255
 
 struct device;
--- a/Documentation/devices.txt 2012-01-10 10:56:53.399116518 -0800
+++ b/Documentation/devices.txt 2012-01-11 13:17:07.882113340 -0800
@@ -447,6 +447,9 @@ Your cooperation is appreciated.
234 = /dev/btrfs-controlBtrfs control device
235 = /dev/autofs   Autofs control device
236 = /dev/mapper/control   Device-Mapper control device
+   237 = /dev/loop-control Loopback control device
+   238 = /dev/vhost-netHost kernel accelerator for virtio net
+
240-254 Reserved for local use
255 Reserved for MISC_DYNAMIC_MINOR
 


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] vhost-net: add module alias

2012-01-10 Thread Stephen Hemminger
By adding a module alias, programs (or users) won't have to explicitly
call modprobe. Vhost-net will always be available if built into the kernel.
It does require assigning a permanent minor number for depmod to work.
Choose one next to TUN since this driver is related to it.

Also, use C99 style initialization.

Signed-off-by: Stephen Hemminger shemmin...@vyatta.com

---
 drivers/vhost/net.c|8 +---
 include/linux/miscdevice.h |1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

--- a/drivers/vhost/net.c   2012-01-10 10:56:58.883179194 -0800
+++ b/drivers/vhost/net.c   2012-01-10 19:48:23.650225892 -0800
@@ -856,9 +856,9 @@ static const struct file_operations vhos
 };
 
 static struct miscdevice vhost_net_misc = {
-   MISC_DYNAMIC_MINOR,
-   vhost-net,
-   vhost_net_fops,
+   .minor = VHOST_NET_MINOR,
+   .name = vhost-net,
+   .fops = vhost_net_fops,
 };
 
 static int vhost_net_init(void)
@@ -879,3 +879,5 @@ MODULE_VERSION(0.0.1);
 MODULE_LICENSE(GPL v2);
 MODULE_AUTHOR(Michael S. Tsirkin);
 MODULE_DESCRIPTION(Host kernel accelerator for virtio net);
+MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
+MODULE_ALIAS(devname:vhost-net);
--- a/include/linux/miscdevice.h2012-01-10 10:56:59.779189436 -0800
+++ b/include/linux/miscdevice.h2012-01-10 19:49:56.091748210 -0800
@@ -31,6 +31,7 @@
 #define I2O_MINOR  166
 #define MICROCODE_MINOR184
 #define TUN_MINOR  200
+#define VHOST_NET_MINOR201
 #define MWAVE_MINOR219 /* ACP/Mwave Modem */
 #define MPT_MINOR  220
 #define MPT2SAS_MINOR  221

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] kvm tools: Implement multiple VQ for virtio-net

2011-11-22 Thread Stephen Hemminger
I have been playing with userspace-rcu which has a number of neat
lockless routines for queuing and hashing. But there aren't kernel versions
and several of them may require cmpxchg to work.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Why does virtmanager (VNC) console get stuck in caps lock mode?

2010-07-30 Thread Stephen Hemminger
As an emacs user, I remap caps-lock to CTRL key in normal usage.
But often in KVM console VNC window, it gets stuck in caps lock mode.
This appears to be a new mis-feature, since it never used to get stuck
before (not sure whether it is the kernel, virt-manager or QEMU).

Any clues?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Why does virtmanager (VNC) console get stuck in caps lock mode?

2010-07-30 Thread Stephen Hemminger
On Fri, 30 Jul 2010 16:11:50 -0500
Anthony Liguori anth...@codemonkey.ws wrote:

 On 07/30/2010 03:39 PM, Stephen Hemminger wrote:
  As an emacs user, I remap caps-lock to CTRL key in normal usage.
  But often in KVM console VNC window, it gets stuck in caps lock mode.
 
 
 Try passing no-lock-key-sync to the -vnc option.  If you're using 
 virt-manager, that may be challenging without hacking libvirt.
 
 But you could also try using qemu directly and then using gvncviewer.

And I could go back to telnet as well...

  This appears to be a new mis-feature, since it never used to get stuck
  before (not sure whether is kernel, virt-manager or QEMU).
 
  Any clues?
 
 
 There's a few possibilities.  The first is that you don't have the 
 guest's keymap setup to treat caps lock as ctrl.  I imagine you do though.

Already done (via Gnome preferences)

 The second, more likely, possibility is that our caps lock detection 
 heuristics are being defeated because you're remapping the key.
 
 The above option disables our heuristics.  If that fixes the problem 
 then we probably need to add some bits to further handle it.  The real 
 source of the problem is that we can receive key down events but if 
 focus moves we won't receive the key up.  That can create subtle 
 problems where you're pressing control and release it after switching 
 windows.  The effect is the guest sees control as being stuck.
 
 We do our best to work out this situation but it's not necessarily perfect.
 
 Regards,
 
 Anthony Liguori

Ideally you could read keymap and respond accordingly, but that may be
too hard.


-- 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM SMP guests won't work (2.6.35-rc2+)

2010-06-10 Thread Stephen Hemminger
On Thu, 10 Jun 2010 21:44:07 +0300
Avi Kivity a...@redhat.com wrote:

 On 06/10/2010 09:28 PM, Stephen Hemminger wrote:
  With current 2.6.35-rc2 and following configuration, my build guest VM will
  not start (it was working previously).
 
   Error starting domain: monitor socket did not show up: Connection 
  refused
 
  Surprisingly, other guests work fine. The one difference is that the build
  VM has 4 virtual CPU's.
 
 
 Looks unrelated to kvm itself.  Can you start the guest without 
 libvirt?  An strace will show if it created the monitor socket or not.
 

It appears to have been a libvirt glitch. Runs fine under KVM directly.
And clearing it out of libvirt and recreating works fine now.

-- 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v7 01/19] Add a new structure for skb buffer from external.

2010-06-06 Thread Stephen Hemminger
Still not sure this is a good idea for a couple of reasons:

1. We already have lots of special cases with skb's (frags and fraglist),
   and skb's travel through a lot of different parts of the kernel.  So any
   new change like this creates lots of exposed points for new bugs. Look
   at cases like MD5 TCP and netfilter, and forwarding these SKB's to ipsec
   and ppp and ...

2. SKB's can have infinite lifetime in the kernel. If these buffers come from
   a fixed size pool in an external device, they can easily all get tied up
   if you have a slow listener. What happens then?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] [PATCH v2 3/3] Let host NIC driver to DMA to guest user space.

2010-04-08 Thread Stephen Hemminger
On Tue, 6 Apr 2010 14:26:29 +0800
Xin, Xiaohui xiaohui@intel.com wrote:

 How do you deal with the DoS problem of hostile user space app posting huge
 number of receives and never getting anything.   
 
 That's a problem we are trying to deal with. It's critical for long term.
 Currently, we tried to limit the pages it can pin, but not sure how much is 
 reasonable.
 For now, the buffers submitted is from guest virtio-net driver, so it's safe 
 in some extent
 just for now.

It is critical even now. Once you get past toy benchmarks you will see things
like Java processes with 1000 threads all reading at once.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] [PATCH v2 3/3] Let host NIC driver to DMA to guest user space.

2010-04-02 Thread Stephen Hemminger
On Fri,  2 Apr 2010 15:30:10 +0800
xiaohui@intel.com wrote:

 From: Xin Xiaohui xiaohui@intel.com
 
 The patch let host NIC driver to receive user space skb,
 then the driver has chance to directly DMA to guest user
 space buffers thru single ethX interface.
 We want it to be more generic as a zero copy framework.
 
 Signed-off-by: Xin Xiaohui xiaohui@intel.com
 Signed-off-by: Zhao Yu yzha...@gmail.com
 Signed-off-by: Jeff Dike jd...@c2.user-mode-linux.org
 ---
 
 We consider 2 ways to utilize the user buffers, but not sure which one
 is better. Please give any comments.
 
 One:Modify __alloc_skb() function a bit, it can only allocate a
 structure of sk_buff, and the data pointer is pointing to a
 user buffer which is coming from a page constructor API.
 Then the shinfo of the skb is also from guest.
 When packet is received from hardware, the skb-data is filled
 directly by h/w. What we have done is in this way.
 
 Pros:   We can avoid any copy here.
 Cons:   Guest virtio-net driver needs to allocate skb as almost
 the same method with the host NIC drivers, say the size
 of netdev_alloc_skb() and the same reserved space in the
 head of skb. Many NIC drivers are the same with guest and
 ok for this. But some latest NIC drivers reserve special
 room in skb head. To deal with it, we suggest to provide
 a method in guest virtio-net driver to ask for parameter
 we interest from the NIC driver when we know which device
 we have bind to do zero-copy. Then we ask guest to do so.
 Is that reasonable?
 
 Two:Modify driver to get user buffer allocated from a page constructor
 API(to substitute alloc_page()), the user buffer are used as payload
 buffers and filled by h/w directly when packet is received. Driver
 should associate the pages with skb (skb_shinfo(skb)-frags). For
 the head buffer side, let host allocates skb, and h/w fills it.
 After that, the data filled in host skb header will be copied into
 guest header buffer which is submitted together with the payload 
 buffer.
 
 Pros:   We could less care the way how guest or host allocates their
 buffers.
 Cons:   We still need a bit copy here for the skb header.
 
 We are not sure which way is the better here. This is the first thing we want
 to get comments from the community. We wish the modification to the network
 part will be generic which not used by vhost-net backend only, but a user
 application may use it as well when the zero-copy device may provides async
 read/write operations later.
 
 
 Thanks
 Xiaohui

How do you deal with the DoS problem of a hostile user space app posting a huge
number of receives and never getting anything?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 3/3] Let host NIC driver to DMA to guest user space.

2010-03-06 Thread Stephen Hemminger
On Sat,  6 Mar 2010 17:38:38 +0800
xiaohui@intel.com wrote:

 From: Xin Xiaohui xiaohui@intel.com
 
 The patch let host NIC driver to receive user space skb,
 then the driver has chance to directly DMA to guest user
 space buffers thru single ethX interface.
 
 Signed-off-by: Xin Xiaohui xiaohui@intel.com
 Signed-off-by: Zhao Yu yzha...@gmail.com
 Signed-off-by: Jeff Dike jd...@c2.user-mode-linux.org
 ---
  include/linux/netdevice.h |   76 ++-
  include/linux/skbuff.h|   30 +++--
  net/core/dev.c|   32 ++
  net/core/skbuff.c |   79 
 +
  4 files changed, 205 insertions(+), 12 deletions(-)
 

There are too many ifdef's in this implementation.
I would prefer to see a few functions (with stub for the non-ifdef case),
like the network namespace code.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Virtual Machine Device Queues(VMDq) support on KVM

2009-09-22 Thread Stephen Hemminger
On Tue, 22 Sep 2009 13:50:54 +0200
Arnd Bergmann a...@arndb.de wrote:

 On Tuesday 22 September 2009, Michael S. Tsirkin wrote:
More importantly, when virtualizations is used with multi-queue
NIC's the virtio-net NIC is a single CPU bottleneck. The virtio-net
NIC should preserve the parallelism (lock free) using multiple
receive/transmit queues. The number of queues should equal the
number of CPUs.
   
   Yup, multiqueue virtio is on todo list ;-)
   
  
  Note we'll need multiqueue tap for that to help.
 
 My idea for that was to open multiple file descriptors to the same
 macvtap device and let the kernel figure out the  right thing to
 do with that. You can do the same with raw packed sockets in case
 of vhost_net, but I wouldn't want to add more complexity to the
 tun/tap driver for this.
 
   Arnd 


Or get tap out of the way entirely. The packets should not have
to go out to user space at all (see veth)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Virtual Machine Device Queues(VMDq) support on KVM

2009-09-21 Thread Stephen Hemminger
On Mon, 21 Sep 2009 16:37:22 +0930
Rusty Russell ru...@rustcorp.com.au wrote:

   Actually this framework can apply to traditional network adapters which 
   have
   just one tx/rx queue pair. And applications using the same user/kernel 
   interface
   can utilize this framework to send/receive network traffic directly thru 
   a tx/rx
   queue pair in a network adapter.
   

More importantly, when virtualization is used with multi-queue NICs, the
virtio-net NIC is a single-CPU bottleneck. The virtio-net NIC should preserve
the parallelism (lock free) using multiple receive/transmit queues. The number
of queues should equal the number of CPUs.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Virtual Machine Device Queues(VMDq) support on KVM

2009-09-01 Thread Stephen Hemminger
On Tue, 1 Sep 2009 14:58:19 +0800
Xin, Xiaohui xiaohui@intel.com wrote:

   [RFC] Virtual Machine Device Queues (VMDq) support on KVM
 
 Network adapter with VMDq technology presents multiple pairs of tx/rx queues,
 and renders network L2 sorting mechanism based on MAC addresses and VLAN tags
 for each tx/rx queue pair. Here we present a generic framework, in which 
 network
 traffic to/from a tx/rx queue pair can be directed from/to a KVM guest without
 any software copy.
 
 Actually this framework can apply to traditional network adapters which have
 just one tx/rx queue pair. And applications using the same user/kernel 
 interface
 can utilize this framework to send/receive network traffic directly thru a 
 tx/rx
 queue pair in a network adapter.
 
 We use virtio-net architecture to illustrate the framework.
 
 
 || pop   add_buf||
 |Qemu process|  -TX   --  | Guest Kernel   |
 ||  - --  ||
 |Virtio-net  | push  get_buf||
 |  (Backend service) |  -RX   --  |  Virtio-net|
 ||  - --  |driver  |
 || push  get_buf||
 ||  ||
|
|
| AIO (read  write) combined with Direct I/O
|   (which substitute synced file operations)
 |---|
 | Host kernel  | read: copy-less with directly mapped user  |
 |  |   space to kernel, payload directly DMAed  |
 |  |   into user space  |
 |  | write: copy-less with directly mapped user |
 |  |   space to kernel, payload directly hooked |
 |  |   to a skb |
 |  ||
 |  (a likely   ||
 |   queue pair ||
 |   instance)  ||
 |  |   ||
 | NIC driver --  TUN/TAP driver   |
 |---|
|
|
traditional adapter or a tx/rx queue pair
 
 The basic idea is to utilize the kernel Asynchronous I/O combined with Direct
 I/O to implements copy-less TUN/TAP device. AIO and Direct I/O is not new to
 kernel, we still can see it in SCSI tape driver.
 
 With traditional file operations, a copying of payload contents from/to the
 kernel DMA address to/from a user buffer is needed. That's what the copying we
 want to save.
 
 The proposed framework is like this:
 A TUN/TAP device is bound to a traditional NIC adapter or a tx/rx queue pair 
 in
 host side. KVM virtio-net Backend service, the user space program submits
 asynchronous read/write I/O requests to the host kernel through TUN/TAP 
 device.
 The requests are corresponding to the vqueue elements include both 
 transmission
  receive. They can be queued in one AIO request and later, the completion 
 will
 be notified through the underlying packets tx/rx processing of the rx/tx queue
 pair.
 
 Detailed path:
 
 To guest Virtio-net driver, packets receive corresponding to asynchronous read
 I/O requests of Backend service.
 
 1) Guest Virtio-net driver provides header and payload address through the
 receive vqueue to Virtio-net backend service.
 
 2) Virtio-net backend service encapsulates multiple vqueue elements into
 multiple AIO control blocks and composes them into one AIO read request.
 
 3) Virtio-net backend service uses io_submit() syscall to pass the request to
 the TUN/TAP device.
 
 4) Virtio-net backend service uses io_getevents() syscall to check the
 completion of the request.
 
 5) The TUN/TAP driver receives packets from the queue pair of NIC, and 
 prepares
 for Direct I/O.
A modified NIC driver may render a skb which header is allocated in host
 kernel, but the payload buffer is directly mapped from user space buffer which
 are rendered through the AIO request by the Backend service. get_user_pages()
 may do this. For one AIO read request, the TUN/TAP driver maintains a list for
 the directly mapped buffers, and a NIC driver tries to get the buffers as
 payload buffer to compose the new skbs. Of course, if getting the buffers
 fails, then kernel allocated buffers are used.
 
 6) Modern NIC cards now mostly have the header split feature. The NIC queue
 pair then may directly DMA the payload 

Re: [RFC PATCH v2 09/19] net: Add vbus_enet driver

2009-04-09 Thread Stephen Hemminger
On Thu, 09 Apr 2009 12:31:29 -0400
Gregory Haskins ghask...@novell.com wrote:

 Signed-off-by: Gregory Haskins ghask...@novell.com
 ---
 
  drivers/net/Kconfig |   13 +
  drivers/net/Makefile|1 
  drivers/net/vbus-enet.c |  680 
 +++
  3 files changed, 694 insertions(+), 0 deletions(-)
  create mode 100644 drivers/net/vbus-enet.c
 
 diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
 index 62d732a..ac9dabd 100644
 --- a/drivers/net/Kconfig
 +++ b/drivers/net/Kconfig
 @@ -3099,4 +3099,17 @@ config VIRTIO_NET
 This is the virtual network driver for virtio.  It can be used with
lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
  
 +config VBUS_ENET
 + tristate Virtual Ethernet Driver
 + depends on VBUS_DRIVERS
 + help
 +A virtualized 802.x network device based on the VBUS interface.
 +It can be used with any hypervisor/kernel that supports the
 +vbus protocol.
 +
 +config VBUS_ENET_DEBUG
 +bool Enable Debugging
 + depends on VBUS_ENET
 + default n
 +
  endif # NETDEVICES
 diff --git a/drivers/net/Makefile b/drivers/net/Makefile
 index 471baaf..61db928 100644
 --- a/drivers/net/Makefile
 +++ b/drivers/net/Makefile
 @@ -264,6 +264,7 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
  obj-$(CONFIG_NETXEN_NIC) += netxen/
  obj-$(CONFIG_NIU) += niu.o
  obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
 +obj-$(CONFIG_VBUS_ENET) += vbus-enet.o
  obj-$(CONFIG_SFC) += sfc/
  
  obj-$(CONFIG_WIMAX) += wimax/
 diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
 new file mode 100644
 index 000..3779f77
 --- /dev/null
 +++ b/drivers/net/vbus-enet.c
 @@ -0,0 +1,680 @@
 +/*
 + * vbus_enet - A virtualized 802.x network device based on the VBUS interface
 + *
 + * Copyright (C) 2009 Novell, Gregory Haskins ghask...@novell.com
 + *
 + * Derived from the SNULL example from the book Linux Device Drivers by
 + * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
 + * by O'Reilly  Associates.
 + */
 +
 +#include linux/module.h
 +#include linux/init.h
 +#include linux/moduleparam.h
 +
 +#include linux/sched.h
 +#include linux/kernel.h
 +#include linux/slab.h
 +#include linux/errno.h
 +#include linux/types.h
 +#include linux/interrupt.h
 +
 +#include linux/in.h
 +#include linux/netdevice.h
 +#include linux/etherdevice.h
 +#include linux/ip.h
 +#include linux/tcp.h
 +#include linux/skbuff.h
 +#include linux/ioq.h
 +#include linux/vbus_driver.h
 +
 +#include linux/in6.h
 +#include asm/checksum.h
 +
 +#include linux/venet.h
 +
 +MODULE_AUTHOR(Gregory Haskins);
 +MODULE_LICENSE(GPL);

MODULE_DESCRIPTION ? 
MODULE_VERSION ?

 +static int napi_weight = 128;
 +module_param(napi_weight, int, 0444);
Already accessible through sysfs

 +static int rx_ringlen = 256;
 +module_param(rx_ringlen, int, 0444);

An API for ring length already exists via ethtool. If you used it,
there would be no need for a device-specific module parameter.

 +static int tx_ringlen = 256;
 +module_param(tx_ringlen, int, 0444);
 +
 +#undef PDEBUG /* undef it, just in case */
 +#ifdef VBUS_ENET_DEBUG
 +#  define PDEBUG(fmt, args...) printk(KERN_DEBUG vbus_enet:  fmt, ## args)
 +#else
 +#  define PDEBUG(fmt, args...) /* not debugging: nothing */
 +#endif

Why reinvent pr_debug()?

 +
 +struct vbus_enet_queue {
 + struct ioq  *queue;
 + struct ioq_notifier  notifier;
 +};
 +
 +struct vbus_enet_priv {
 + spinlock_t lock;
 + struct net_device *dev;
 + struct vbus_device_proxy  *vdev;
 + struct napi_struct napi;
 + struct vbus_enet_queue rxq;
 + struct vbus_enet_queue txq;
 + struct tasklet_struct  txtask;
 +};
 +
 +static struct vbus_enet_priv *
 +napi_to_priv(struct napi_struct *napi)
 +{
 + return container_of(napi, struct vbus_enet_priv, napi);
 +}
 +
 +static int
 +queue_init(struct vbus_enet_priv *priv,
 +struct vbus_enet_queue *q,
 +int qid,
 +size_t ringsize,
 +void (*func)(struct ioq_notifier *))
 +{
 + struct vbus_device_proxy *dev = priv-vdev;
 + int ret;
 +
 + ret = vbus_driver_ioq_alloc(dev, qid, 0, ringsize, q-queue);
 + if (ret  0)
 + panic(ioq_alloc failed: %d\n, ret);
 +
 + if (func) {
 + q-notifier.signal = func;
 + q-queue-notifier = q-notifier;
 + }
 +
 + return 0;
 +}
 +
 +static int
 +devcall(struct vbus_enet_priv *priv, u32 func, void *data, size_t len)
 +{
 + struct vbus_device_proxy *dev = priv-vdev;
 +
 + return dev-ops-call(dev, func, data, len, 0);
 +}
 +
 +/*
 + * ---
 + * rx descriptors
 + * ---
 + */
 +
 +static void
 +rxdesc_alloc(struct ioq_ring_desc *desc, size_t len)
 +{
 + struct sk_buff *skb;
 +
 + len += ETH_HLEN;
 +
 + skb = dev_alloc_skb(len + 2);
 + BUG_ON(!skb);
 +
 + skb_reserve(skb, 2); /* align IP on 16B boundary */
Use NET_IP_ALIGN rather 

Re: [RFC PATCH 09/17] net: Add vbus_enet driver

2009-03-31 Thread Stephen Hemminger
On Tue, 31 Mar 2009 14:43:34 -0400
Gregory Haskins ghask...@novell.com wrote:

 Signed-off-by: Gregory Haskins ghask...@novell.com
 ---
 
  drivers/net/Kconfig |   13 +
  drivers/net/Makefile|1 
  drivers/net/vbus-enet.c |  706 
 +++
  3 files changed, 720 insertions(+), 0 deletions(-)
  create mode 100644 drivers/net/vbus-enet.c
 
 diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
 index 62d732a..ac9dabd 100644
 --- a/drivers/net/Kconfig
 +++ b/drivers/net/Kconfig
 @@ -3099,4 +3099,17 @@ config VIRTIO_NET
 This is the virtual network driver for virtio.  It can be used with
lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
  
 +config VBUS_ENET
 + tristate Virtual Ethernet Driver
 + depends on VBUS_DRIVERS
 + help
 +A virtualized 802.x network device based on the VBUS interface.
 +It can be used with any hypervisor/kernel that supports the
 +vbus protocol.
 +
 +config VBUS_ENET_DEBUG
 +bool Enable Debugging
 + depends on VBUS_ENET
 + default n
 +
  endif # NETDEVICES
 diff --git a/drivers/net/Makefile b/drivers/net/Makefile
 index 471baaf..61db928 100644
 --- a/drivers/net/Makefile
 +++ b/drivers/net/Makefile
 @@ -264,6 +264,7 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
  obj-$(CONFIG_NETXEN_NIC) += netxen/
  obj-$(CONFIG_NIU) += niu.o
  obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
 +obj-$(CONFIG_VBUS_ENET) += vbus-enet.o
  obj-$(CONFIG_SFC) += sfc/
  
  obj-$(CONFIG_WIMAX) += wimax/
 diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
 new file mode 100644
 index 000..e698b3f
 --- /dev/null
 +++ b/drivers/net/vbus-enet.c
 @@ -0,0 +1,706 @@
 +/*
 + * vbus_enet - A virtualized 802.x network device based on the VBUS interface
 + *
 + * Copyright (C) 2009 Novell, Gregory Haskins ghask...@novell.com
 + *
 + * Derived from the SNULL example from the book Linux Device Drivers by
 + * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
 + * by O'Reilly  Associates.
 + */
 +
 +#include linux/module.h
 +#include linux/init.h
 +#include linux/moduleparam.h
 +
 +#include linux/sched.h
 +#include linux/kernel.h
 +#include linux/slab.h
 +#include linux/errno.h
 +#include linux/types.h
 +#include linux/interrupt.h
 +
 +#include linux/in.h
 +#include linux/netdevice.h
 +#include linux/etherdevice.h
 +#include linux/ip.h
 +#include linux/tcp.h
 +#include linux/skbuff.h
 +#include linux/ioq.h
 +#include linux/vbus_driver.h
 +
 +#include linux/in6.h
 +#include asm/checksum.h
 +
 +#include linux/venet.h
 +
 +MODULE_AUTHOR(Gregory Haskins);
 +MODULE_LICENSE(GPL);
 +
 +static int napi_weight = 128;
 +module_param(napi_weight, int, 0444);
 +static int rx_ringlen = 256;
 +module_param(rx_ringlen, int, 0444);
 +static int tx_ringlen = 256;
 +module_param(tx_ringlen, int, 0444);
 +
 +#undef PDEBUG /* undef it, just in case */
 +#ifdef VBUS_ENET_DEBUG
 +#  define PDEBUG(fmt, args...) printk(KERN_DEBUG vbus_enet:  fmt, ## args)
 +#else
 +#  define PDEBUG(fmt, args...) /* not debugging: nothing */
 +#endif
 +
 +struct vbus_enet_queue {
 + struct ioq  *queue;
 + struct ioq_notifier  notifier;
 +};
 +
 +struct vbus_enet_priv {
 + spinlock_t lock;
 + struct net_device *dev;
 + struct vbus_device_proxy  *vdev;
 + struct napi_struct napi;
 + struct net_device_statsstats;

Not needed any more, stats are available in net_device

 + struct vbus_enet_queue rxq;
 + struct vbus_enet_queue txq;
 + struct tasklet_struct  txtask;
 +};
 +

 + * Ioctl commands
 + */
 +static int
 +vbus_enet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 +{
 + PDEBUG(ioctl\n);
 + return 0;
 +}

If it doesn't do ioctl, just leave pointer as NULL

 +/*
 + * Return statistics to the caller
 + */
 +static struct net_device_stats *
 +vbus_enet_stats(struct net_device *dev)
 +{
 + struct vbus_enet_priv *priv = netdev_priv(dev);
 + return priv-stats;
 +}

Not needed if you use internal net_device stats

 +static void
 +rx_isr(struct ioq_notifier *notifier)
 +{
 + struct vbus_enet_priv *priv;
 + struct net_device  *dev;
 +
 + priv = container_of(notifier, struct vbus_enet_priv, rxq.notifier);
 + dev = priv-dev;
 +
 + if (!ioq_empty(priv-rxq.queue, ioq_idxtype_inuse))
 + vbus_enet_schedule_rx(priv);
 +}
 +
 +static void
 +deferred_tx_isr(unsigned long data)
 +{
 + struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
 + unsigned long flags;
 +
 + PDEBUG(deferred_tx_isr for %lld\n, priv-vdev-id);
 +
 + spin_lock_irqsave(priv-lock, flags);
 + vbus_enet_tx_reap(priv, 0);
 + spin_unlock_irqrestore(priv-lock, flags);
 +
 + ioq_notify_enable(priv-txq.queue, 0);
 +}
 +
 +static void
 +tx_isr(struct ioq_notifier *notifier)
 +{
 +   struct vbus_enet_priv *priv;
 +   unsigned long flags;
 +
 +   priv =