Colin Watson has proposed merging ~cjwatson/launchpad-buildd:lxd-nvidia-nestable into launchpad-buildd:master.
Commit message:
Fix use of nvidia* devices in nested containers

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad-buildd/+git/launchpad-buildd/+merge/436282

Bind-mounting these devices into the container means that they aren't visible to the lxd snap inside the container. Create our own device nodes instead.

--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad-buildd:lxd-nvidia-nestable into launchpad-buildd:master.
diff --git a/debian/changelog b/debian/changelog index 867c9f5..1e9ed07 100644 --- a/debian/changelog +++ b/debian/changelog @@ -2,6 +2,8 @@ launchpad-buildd (227) UNRELEASED; urgency=medium * Tolerate receiving "builder_constraints": None. * Check the appropriate server.key path for the LXD snap. + * Create nvidia* devices in such a way that they can be used by nested + containers. -- Colin Watson <[email protected]> Tue, 24 Jan 2023 13:13:27 +0000 diff --git a/lpbuildd/target/lxd.py b/lpbuildd/target/lxd.py index 57f62a4..eaf4066 100644 --- a/lpbuildd/target/lxd.py +++ b/lpbuildd/target/lxd.py @@ -2,6 +2,7 @@ # GNU Affero General Public License version 3 (see the file LICENSE). from contextlib import closing +from functools import cached_property import io import json import os @@ -43,30 +44,6 @@ def get_device_mapper_major(): "Cannot determine major device number for device-mapper") -def get_nvidia_container_paths(): - """Return the paths that need to be bind-mounted for NVIDIA CUDA support. - - LXD's security.privileged=true and nvidia.runtime=true options are - unfortunately incompatible, but we can emulate the important bits of the - latter with some tactical bind-mounts. There is no very good way to do - this; this seems like the least unpleasant approach. - """ - env = dict(os.environ) - env["LD_LIBRARY_PATH"] = "/snap/lxd/current/lib" - return subprocess.check_output( - [ - "/snap/lxd/current/bin/nvidia-container-cli.real", - "list", - "--binaries", - "--firmwares", - "--ipcs", - "--libraries", - ], - env=env, - universal_newlines=True, - ).splitlines() - - fallback_hosts = dedent("""\ 127.0.0.1\tlocalhost ::1\tlocalhost ip6-localhost ip6-loopback @@ -312,6 +289,23 @@ class LXD(Backend): os.unlink(self.dnsmasq_pid_file) subprocess.call(["sudo", "ip", "link", "delete", self.bridge_name]) + @cached_property + def _nvidia_container_paths(self): + """The paths that need to be bind-mounted for NVIDIA CUDA support. 
+ + LXD's security.privileged=true and nvidia.runtime=true options are + unfortunately incompatible, but we can emulate the important bits of + the latter with some tactical bind-mounts. There is no very good + way to do this; this seems like the least unpleasant approach. + """ + env = dict(os.environ) + env["LD_LIBRARY_PATH"] = "/snap/lxd/current/lib" + return subprocess.check_output( + ["/snap/lxd/current/bin/nvidia-container-cli.real", "list"], + env=env, + universal_newlines=True, + ).splitlines() + def create_profile(self): for addr in self.ipv4_network: if addr not in ( @@ -381,13 +375,13 @@ class LXD(Backend): "type": "disk", } if "gpu-nvidia" in self.constraints: - devices["gpu"] = {"type": "gpu"} - for i, path in enumerate(get_nvidia_container_paths()): - devices[f"nvidia-{i}"] = { - "path": path, - "source": path, - "type": "disk", - } + for i, path in enumerate(self._nvidia_container_paths): + if not path.startswith("/dev/"): + devices[f"nvidia-{i}"] = { + "path": path, + "source": path, + "type": "disk", + } self.client.profiles.create(self.profile_name, config, devices) def start(self): @@ -495,6 +489,20 @@ class LXD(Backend): "b", str(major), str(minor)]) if "gpu-nvidia" in self.constraints: + # Create nvidia* devices. We have to do this here rather than + # bind-mounting them into the container, because bind-mounts + # aren't propagated into snaps (such as lxd) installed inside + # the container. + for path in self._nvidia_container_paths: + if path.startswith("/dev/"): + st = os.stat(path) + if stat.S_ISCHR(st.st_mode): + self.run( + ["mknod", "-m", "0%o" % stat.S_IMODE(st.st_mode), + path, "c", + str(os.major(st.st_rdev)), + str(os.minor(st.st_rdev))]) + # We bind-mounted several libraries into the container, so run # ldconfig to update the dynamic linker's cache. 
self.run(["/sbin/ldconfig"]) diff --git a/lpbuildd/target/tests/test_lxd.py b/lpbuildd/target/tests/test_lxd.py index dea7ad8..802d34c 100644 --- a/lpbuildd/target/tests/test_lxd.py +++ b/lpbuildd/target/tests/test_lxd.py @@ -100,14 +100,19 @@ class FakeFilesystem(_FakeFilesystem): def _stat(self, real, path, *args, **kwargs): r = super()._stat(real, path, *args, **kwargs) if path in self._devices: - r = os.stat_result(list(r), {"st_rdev": self._devices[path]}) + flags, device = self._devices[path] + mode = stat.S_IMODE(r.st_mode) | flags + r = os.stat_result([mode] + list(r[1:]), {"st_rdev": device}) return r def _mknod(self, real, path, mode=0o600, device=None): - fd = os.open(path, os.O_CREAT | os.O_EXCL, mode & 0o777) + fd = os.open(path, os.O_CREAT | os.O_EXCL) + os.fchmod(fd, stat.S_IMODE(mode)) os.close(fd) - if mode & (stat.S_IFBLK | stat.S_IFCHR): - self._devices[path] = device + if stat.S_ISBLK(mode): + self._devices[path] = (stat.S_IFBLK, device) + elif stat.S_ISCHR(mode): + self._devices[path] = (stat.S_IFCHR, device) class TestLXD(TestCase): @@ -329,13 +334,13 @@ class TestLXD(TestCase): "type": "disk", } if gpu_nvidia_paths: - expected_devices["gpu"] = {"type": "gpu"} for i, path in enumerate(gpu_nvidia_paths): - expected_devices[f"nvidia-{i}"] = { - "path": path, - "source": path, - "type": "disk", - } + if not path.startswith("/dev/"): + expected_devices[f"nvidia-{i}"] = { + "path": path, + "source": path, + "type": "disk", + } client.profiles.create.assert_called_once_with( "lpbuildd", expected_config, expected_devices) @@ -374,6 +379,7 @@ class TestLXD(TestCase): client.profiles.get.side_effect = FakeLXDAPIException client.host_info = {"environment": {"driver_version": "3.0"}} gpu_nvidia_paths = [ + "/dev/nvidiactl", "/usr/bin/nvidia-smi", "/usr/bin/nvidia-persistenced", ] @@ -436,7 +442,13 @@ class TestLXD(TestCase): processes_fixture.add( FakeHostname("example", "example.buildd"), name="hostname") if gpu_nvidia: + os.mknod( + "/dev/nvidia0", 
stat.S_IFCHR | 0o666, os.makedev(195, 0)) + os.mknod( + "/dev/nvidiactl", stat.S_IFCHR | 0o666, os.makedev(195, 255)) gpu_nvidia_paths = [ + "/dev/nvidia0", + "/dev/nvidiactl", "/usr/bin/nvidia-smi", "/usr/bin/nvidia-persistenced", ] @@ -470,8 +482,7 @@ class TestLXD(TestCase): expected_args.append( Equals( ["/snap/lxd/current/bin/nvidia-container-cli.real", - "list", - "--binaries", "--firmwares", "--ipcs", "--libraries"])) + "list"])) expected_args.extend([ Equals(ip + ["link", "add", "dev", "lpbuilddbr0", "type", "bridge"]), @@ -518,7 +529,17 @@ class TestLXD(TestCase): ["mknod", "-m", "0660", "/dev/dm-%d" % minor, "b", str(DM_BLOCK_MAJOR), str(minor)])) if gpu_nvidia: - expected_args.append(Equals(lxc + ["/sbin/ldconfig"])) + expected_args.extend([ + Equals( + lxc + + ["mknod", "-m", "0666", "/dev/nvidia0", + "c", "195", "0"]), + Equals( + lxc + + ["mknod", "-m", "0666", "/dev/nvidiactl", + "c", "195", "255"]), + Equals(lxc + ["/sbin/ldconfig"]), + ]) expected_args.extend([ Equals( lxc + ["mkdir", "-p", "/etc/systemd/system/snapd.service.d"]),
_______________________________________________
Mailing list: https://launchpad.net/~launchpad-reviewers
Post to     : [email protected]
Unsubscribe : https://launchpad.net/~launchpad-reviewers
More help   : https://help.launchpad.net/ListHelp

