The following pull request was submitted through GitHub. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/3952
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) === The `vfio` interface type supports SR-IOV enabled network devices. These devices associate a set of virtual functions (VFs) with the single physical function (PF) of the network device. PFs are standard PCIe functions. VFs, on the other hand, are very lightweight PCIe functions that are optimized for data movement. They come with a limited set of configuration capabilities to prevent changing properties of the PF. Given that VFs appear as regular PCIe devices to the system, they can be passed to containers just like a regular physical device. The `vfio` interface type expects to be passed the name of an SR-IOV enabled network device on the system via the `parent` property. LXD will then check for any available VFs on the system. By default, LXD will allocate the first free VF it finds. If it detects that either none are enabled or all currently enabled VFs are in use, it will bump the number of supported VFs to the maximum value and use the first free VF. If all possible VFs are in use, or the kernel or card doesn't support incrementing the number of VFs, LXD will return an error. To create a `vfio` network device use: ``` lxc config device add <container> <device-name> nic nictype=vfio parent=<sriov-enabled-device> ``` To tell LXD to use a specific unused VF, add the `host_name` property and pass it the name of the enabled VF. Signed-off-by: Christian Brauner <[email protected]>
From fcb83715682766d813d5dbd8b403d90511b3f1d7 Mon Sep 17 00:00:00 2001 From: Christian Brauner <[email protected]> Date: Tue, 17 Oct 2017 14:26:16 +0200 Subject: [PATCH 1/3] container: add nictype "vfio" Closes #3941. Signed-off-by: Christian Brauner <[email protected]> --- lxd/container.go | 6 +-- lxd/container_lxc.go | 111 +++++++++++++++++++++++++++++++++++++++++++++++--- lxd/networks_utils.go | 2 +- 3 files changed, 109 insertions(+), 10 deletions(-) diff --git a/lxd/container.go b/lxd/container.go index d53d91e19..bfe0f2349 100644 --- a/lxd/container.go +++ b/lxd/container.go @@ -306,12 +306,12 @@ func containerValidDevices(dbObj *sql.DB, devices types.Devices, profile bool, e return fmt.Errorf("Missing nic type") } - if !shared.StringInSlice(m["nictype"], []string{"bridged", "physical", "p2p", "macvlan"}) { + if !shared.StringInSlice(m["nictype"], []string{"bridged", "macvlan", "p2p", "physical", "vfio"}) { return fmt.Errorf("Bad nic type: %s", m["nictype"]) } - if shared.StringInSlice(m["nictype"], []string{"bridged", "physical", "macvlan"}) && m["parent"] == "" { - return fmt.Errorf("Missing parent for %s type nic.", m["nictype"]) + if shared.StringInSlice(m["nictype"], []string{"bridged", "macvlan", "physical", "vfio"}) && m["parent"] == "" { + return fmt.Errorf("Missing parent for %s type nic", m["nictype"]) } } else if m["type"] == "disk" { if !expanded && !shared.StringInSlice(m["path"], diskDevicePaths) { diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go index fc5ed8cb0..45a33066e 100644 --- a/lxd/container_lxc.go +++ b/lxd/container_lxc.go @@ -1350,7 +1350,7 @@ func (c *containerLXC) initLXC() error { if err != nil { return err } - } else if m["nictype"] == "physical" { + } else if m["nictype"] == "physical" || m["nictype"] == "vfio" { err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.type", networkKeyPrefix, networkidx), "phys") if err != nil { return err @@ -1377,6 +1377,11 @@ func (c *containerLXC) initLXC() error { if err != nil { return err } 
+ } else if m["nictype"] == "vfio" { + err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.link", networkKeyPrefix, networkidx), m["host_name"]) + if err != nil { + return err + } } else if shared.StringInSlice(m["nictype"], []string{"macvlan", "physical"}) { err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.link", networkKeyPrefix, networkidx), networkGetHostDevice(m["parent"], m["vlan"])) if err != nil { @@ -1386,7 +1391,7 @@ func (c *containerLXC) initLXC() error { // Host Virtual NIC name vethName := "" - if m["host_name"] != "" { + if m["host_name"] != "" && m["nictype"] != "vfio" { vethName = m["host_name"] } else if shared.IsTrue(m["security.mac_filtering"]) { // We need a known device name for MAC filtering @@ -5885,6 +5890,10 @@ func (c *containerLXC) createNetworkDevice(name string, m types.Device) (string, } } + if m["nictype"] == "vfio" { + dev = m["host_name"] + } + // Handle bridged and p2p if shared.StringInSlice(m["nictype"], []string{"bridged", "p2p"}) { n2 := deviceNextVeth() @@ -5914,7 +5923,7 @@ func (c *containerLXC) createNetworkDevice(name string, m types.Device) (string, } // Handle physical and macvlan - if shared.StringInSlice(m["nictype"], []string{"physical", "macvlan"}) { + if shared.StringInSlice(m["nictype"], []string{"macvlan", "physical"}) { // Deal with VLAN device := m["parent"] if m["vlan"] != "" { @@ -6118,11 +6127,99 @@ func (c *containerLXC) fillNetworkDevice(name string, m types.Device) (types.Dev } // Fill in the host name (but don't generate a static one ourselves) - if m["host_name"] == "" && shared.StringInSlice(m["nictype"], []string{"bridged", "p2p"}) { - configKey := fmt.Sprintf("volatile.%s.host_name", name) + configKey := fmt.Sprintf("volatile.%s.host_name", name) + if m["host_name"] == "" && shared.StringInSlice(m["nictype"], []string{"bridged", "p2p", "vfio"}) { newDevice["host_name"] = c.localConfig[configKey] } + if m["nictype"] == "vfio" && m["parent"] != "" { + if !shared.PathExists(fmt.Sprintf("/sys/class/net/%s", 
m["parent"])) { + return nil, fmt.Errorf("Parent device '%s' doesn't exist", m["parent"]) + } + + if newDevice["host_name"] == "" { + sriovNumVFs := fmt.Sprintf("/sys/class/net/%s/device/sriov_numvfs", m["parent"]) + sriovTotalVFs := fmt.Sprintf("/sys/class/net/%s/device/sriov_totalvfs", m["parent"]) + + // verify that this is indeed a SR-IOV enabled device + if !shared.PathExists(sriovTotalVFs) { + return nil, fmt.Errorf("Parent device '%s' doesn't support SR-IOV", m["parent"]) + } + + // get number of currently enabled VFs + sriovNumVfsBuf, err := ioutil.ReadFile(sriovNumVFs) + if err != nil { + return nil, err + } + sriovNumVfsStr := strings.TrimSpace(string(sriovNumVfsBuf)) + sriovNum, err := strconv.Atoi(sriovNumVfsStr) + if err != nil { + return nil, err + } + + // get number of possible VFs + sriovTotalVfsBuf, err := ioutil.ReadFile(sriovTotalVFs) + if err != nil { + return nil, err + } + sriovTotalVfsStr := strings.TrimSpace(string(sriovTotalVfsBuf)) + sriovTotal, err := strconv.Atoi(sriovTotalVfsStr) + if err != nil { + return nil, err + } + + // Check if any VFs are already enabled + vf := "" + for i := 0; i < sriovNum; i++ { + vf = fmt.Sprintf("virtfn%d", i) + if !shared.PathExists(fmt.Sprintf("/sys/class/net/%s/device/%s/net", m["parent"], vf)) { + vf = "" + continue + } + + // Check if VF is already in use + empty, err := shared.PathIsEmpty(fmt.Sprintf("/sys/class/net/%s/device/%s/net", m["parent"], vf)) + if err != nil { + return nil, err + } + if empty { + vf = "" + continue + } + + // found free VF + break + } + + if vf == "" { + if sriovNum == sriovTotal { + return nil, fmt.Errorf("All virtual functions of vfio device '%s' seem to be in use", m["parent"]) + } + + // bump the number of VFs to the maximum + err := ioutil.WriteFile(sriovNumVFs, []byte(sriovTotalVfsStr), 0644) + if err != nil { + return nil, err + } + + // use next free VF index + vf = fmt.Sprintf("virtfn%d", sriovNum+1) + } + + vf = fmt.Sprintf("/sys/class/net/%s/device/%s/net", 
m["parent"], vf) + ents, err := ioutil.ReadDir(vf) + if err != nil { + return nil, err + } + if len(ents) == 0 || len(ents) > 1 { + return nil, fmt.Errorf("Failed to determine unique device name") + } + + newDevice["host_name"] = ents[0].Name() + c.localConfig[configKey] = ents[0].Name() + } + } + return newDevice, nil } @@ -6249,6 +6346,8 @@ func (c *containerLXC) removeNetworkDevice(name string, m types.Device) error { var hostName string if m["nictype"] == "physical" { hostName = m["parent"] + } else if m["nictype"] == "vfio" { + hostName = m["host_name"] } else { hostName = deviceNextVeth() } @@ -6266,7 +6365,7 @@ func (c *containerLXC) removeNetworkDevice(name string, m types.Device) error { } // If a veth, destroy it - if m["nictype"] != "physical" { + if m["nictype"] != "physical" && m["nictype"] != "vfio" { deviceRemoveInterface(hostName) } diff --git a/lxd/networks_utils.go b/lxd/networks_utils.go index 4cf455a84..95656ba53 100644 --- a/lxd/networks_utils.go +++ b/lxd/networks_utils.go @@ -104,7 +104,7 @@ func networkIsInUse(c container, name string) bool { continue } - if !shared.StringInSlice(d["nictype"], []string{"bridged", "macvlan", "physical"}) { + if !shared.StringInSlice(d["nictype"], []string{"bridged", "macvlan", "physical", "vfio"}) { continue } From 817fc4cacb2acb4513cb48c41ad7330147226490 Mon Sep 17 00:00:00 2001 From: Christian Brauner <[email protected]> Date: Wed, 18 Oct 2017 11:33:35 +0200 Subject: [PATCH 2/3] doc/containers: add nictype=vfio Closes #3941. 
Signed-off-by: Christian Brauner <[email protected]> --- doc/containers.md | 58 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/doc/containers.md b/doc/containers.md index c5a72bea9..c8a0f8f14 100644 --- a/doc/containers.md +++ b/doc/containers.md @@ -62,7 +62,7 @@ Key | Type | Default | Description :-- | :--- | :------ | :---------- volatile.\<name\>.hwaddr | string | - | Network device MAC address (when no hwaddr property is set on the device itself) volatile.\<name\>.name | string | - | Network device name (when no name propery is set on the device itself) -volatile.\<name\>.host\_name | string | - | Network device name on the host (for nictype=bridged or nictype=p2p) +volatile.\<name\>.host\_name | string | - | Network device name on the host (for nictype=bridged or nictype=p2p, or nictype=vfio) volatile.apply\_quota | string | - | Disk quota to be applied on next container start volatile.apply\_template | string | - | The name of a template hook which should be triggered upon next startup volatile.base\_image | string | - | The hash of the image the container was created from, if any. @@ -170,24 +170,25 @@ LXD supports different kind of network devices: - `bridged`: Uses an existing bridge on the host and creates a virtual device pair to connect the host bridge to the container. - `macvlan`: Sets up a new network device based on an existing one but using a different MAC address. - `p2p`: Creates a virtual device pair, putting one side in the container and leaving the other side on the host. + - `vfio`: Passes a virtual function of an SR-IOV enabled physical network device into the container. 
Different network interface types have different additional properties, the current list is: -Key | Type | Default | Required | Used by | API extension | Description -:-- | :-- | :-- | :-- | :-- | :-- | :-- -nictype | string | - | yes | all | - | The device type, one of "physical", "bridged", "macvlan" or "p2p" -limits.ingress | string | - | no | bridged, p2p | - | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes) -limits.egress | string | - | no | bridged, p2p | - | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes) -limits.max | string | - | no | bridged, p2p | - | Same as modifying both limits.read and limits.write -name | string | kernel assigned | no | all | - | The name of the interface inside the container -host\_name | string | randomly assigned | no | bridged, p2p, macvlan | - | The name of the interface inside the host -hwaddr | string | randomly assigned | no | all | - | The MAC address of the new interface -mtu | integer | parent MTU | no | all | - | The MTU of the new interface -parent | string | - | yes | physical, bridged, macvlan | - | The name of the host device or bridge -vlan | integer | - | no | macvlan, physical | network\_vlan, network\_vlan\_physical | The VLAN ID to attach to -ipv4.address | string | - | no | bridged | network | An IPv4 address to assign to the container through DHCP -ipv6.address | string | - | no | bridged | network | An IPv6 address to assign to the container through DHCP -security.mac\_filtering | boolean | false | no | bridged | network | Prevent the container from spoofing another's MAC address +Key | Type | Default | Required | Used by | API extension | Description +:-- | :-- | :-- | :-- | :-- | :-- | :-- +nictype | string | - | yes | all | - | The device type, one of "bridged", "macvlan", "p2p", "physical", or "vfio" +limits.ingress | string | - | no | bridged, p2p | - | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes) +limits.egress | string | - | no | bridged, p2p | - | I/O limit in bit/s 
(supports kbit, Mbit, Gbit suffixes) +limits.max | string | - | no | bridged, p2p | - | Same as modifying both limits.read and limits.write +name | string | kernel assigned | no | all | - | The name of the interface inside the container +host\_name | string | randomly assigned | no | bridged, macvlan, p2p, vfio | - | The name of the interface inside the host +hwaddr | string | randomly assigned | no | all | - | The MAC address of the new interface +mtu | integer | parent MTU | no | all | - | The MTU of the new interface +parent | string | - | yes | bridged, macvlan, physical, vfio | - | The name of the host device or bridge +vlan | integer | - | no | macvlan, physical | network\_vlan, network\_vlan\_physical | The VLAN ID to attach to +ipv4.address | string | - | no | bridged | network | An IPv4 address to assign to the container through DHCP +ipv6.address | string | - | no | bridged | network | An IPv6 address to assign to the container through DHCP +security.mac\_filtering | boolean | false | no | bridged | network | Prevent the container from spoofing another's MAC address #### bridged or macvlan for connection to physical network The `bridged` and `macvlan` interface types can both be used to connect @@ -206,6 +207,31 @@ your containers to talk to the host itself. In such case, a bridge is preferable. A bridge will also let you use mac filtering and I/O limits which cannot be applied to a macvlan device. +#### vfio +The `vfio` interface type supports SR-IOV enabled network devices. These +devices associate a set of virtual functions (VFs) with the single physical +function (PF) of the network device. PFs are standard PCIe functions. VFs on +the other hand are very lightweight PCIe functions that are optimized for data +movement. They come with a limited set of configuration capabilities to prevent +changing properties of the PF. Given that VFs appear as regular PCIe devices to +the system they can be passed to containers just like a regular physical +device. 
The `vfio` interface type expects to be passed the name of an SR-IOV +enabled network device on the system via the `parent` property. LXD will then +check for any available VFs on the system. By default LXD will allocate the +first free VF it finds. If it detects that either none are enabled or all +currently enabled VFs are in use it will bump the number of supported VFs to +the maximum value and use the first free VF. If all possible VFs are in use or +the kernel or card doesn't support incrementing the number of VFs LXD will +return an error. To create a `vfio` network device use: + +``` +lxc config device add <container> <device-name> nic nictype=vfio parent=<sriov-enabled-device> +``` + +To tell LXD to use a specific unused VF add the `host_name` property and pass +it the name of the enabled VF. + + ### Type: disk Disk entries are essentially mountpoints inside the container. They can either be a bind-mount of an existing file or directory on the host, or From d0aea5b2360c8ee551c352fb4610cf153027884d Mon Sep 17 00:00:00 2001 From: Christian Brauner <[email protected]> Date: Wed, 18 Oct 2017 12:07:52 +0200 Subject: [PATCH 3/3] api extension: add "network_vfio" extension Closes #3941. This adds support for SR-IOV enabled network devices. Signed-off-by: Christian Brauner <[email protected]> --- doc/api-extensions.md | 3 +++ lxd/api_1.0.go | 1 + 2 files changed, 4 insertions(+) diff --git a/doc/api-extensions.md b/doc/api-extensions.md index 04695ed6d..cbb607418 100644 --- a/doc/api-extensions.md +++ b/doc/api-extensions.md @@ -346,3 +346,6 @@ This adds support for querying an LXD daemon for the system resources it has ## kernel\_limits This adds support for setting process limits such as maximum number of open files for the container via `nofile`. The format is `limits.kernel.[limit name]`. + +## network\_vfio +This adds support for SR-IOV enabled network devices. 
diff --git a/lxd/api_1.0.go b/lxd/api_1.0.go index d05dc407d..4ac0634ce 100644 --- a/lxd/api_1.0.go +++ b/lxd/api_1.0.go @@ -130,6 +130,7 @@ func api10Get(d *Daemon, r *http.Request) Response { "storage_block_filesystem_btrfs", "resources", "kernel_limits", + "network_vfio", }, APIStatus: "stable", APIVersion: version.APIVersion,
_______________________________________________ lxc-devel mailing list [email protected] http://lists.linuxcontainers.org/listinfo/lxc-devel
