On 7/31/18 2:49 PM, Wolfgang Bumiller wrote: > Signed-off-by: Wolfgang Bumiller <w.bumil...@proxmox.com> > --- > debian/control | 2 +- > src/PVE/LXC.pm | 101 > +++++++++++++++++++++++++++++++++++++++++++++++++- > src/PVE/LXC/Config.pm | 53 ++++++++++++++++++++++++++ > 3 files changed, 153 insertions(+), 3 deletions(-) > > diff --git a/debian/control b/debian/control > index 7fbea8d..b3491c9 100644 > --- a/debian/control > +++ b/debian/control > @@ -7,7 +7,7 @@ Build-Depends: debhelper (>= 7.0.50~), > libpve-guest-common-perl | libpve-common-perl (<= 4.0-89), > libpve-storage-perl, > libtest-mockmodule-perl, > - lxc (>= 2.1.0-1) | lxc-pve (>= 2.1.0-1), > + lxc (>= 3.0.2-1) | lxc-pve (>= 3.0.1+pve1-1), > pve-cluster (>= 4.0-8), > pve-doc-generator, > Standards-Version: 3.8.4 > diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm > index bc03792..46222ba 100644 > --- a/src/PVE/LXC.pm > +++ b/src/PVE/LXC.pm > @@ -27,6 +27,8 @@ use PVE::Syscall; > use PVE::LXC::Config; > use Time::HiRes qw (gettimeofday); > > +my $LXC_CONFIG_PATH = '/usr/share/lxc/config'; > + > my $nodename = PVE::INotify::nodename(); > > my $cpuinfo= PVE::ProcFSTools::read_cpuinfo(); > @@ -368,6 +370,94 @@ sub get_cgroup_subsystems { > return wantarray ? ($v1, $v2) : $v1; > } > > +# Currently we do not need to create seccomp profile 'files' as the only > +# choice our configuration actually allows is "with or without keyctl()", > +# so we distinguish between using lxc's "default" seccomp profile and our > +# added pve-userns.seccomp file. > +# > +# This returns a configuration line added to the raw lxc config. > +sub make_seccomp_config { > + my ($conf, $unprivileged, $features) = @_; > + # User-configured profile has precedence, note that the user's entry > would > + # be written 'after' this line anyway... > + if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.seccomp.profile')) { > + # Warn the user if this conflicts with a feature: > + if ($features->{keyctl}) { > + warn "explicitly configured lxc.seccomp.profile overrides the > following settings: features:keyctl\n"; > + } > + return ''; > + } > + > + # Privileged containers keep using the default (which is already part of > + # the files included via lxc.include, so we don't need to write it out, > + # that way it stays admin-configurable via /usr/share/lxc/config/... as > + # well) > + return '' if !$unprivileged; > + > + # Unprivileged containers will get keyctl() disabled by default as a > + # workaround for systemd-networkd behavior. But we have an option to > + # explicitly enable it: > + return '' if $features->{keyctl}; > + > + # Finally we're in an unprivileged container without `keyctl` set > + # explicitly. We have a file prepared for this: > + return "lxc.seccomp.profile = $LXC_CONFIG_PATH/pve-userns.seccomp\n"; > +} > + > +# Since lxc-3.0.2 we can have lxc generate a profile for the container > +# automatically. The default should be equivalent to the old > +# `lxc-container-default-cgns` profile. > +# > +# Additionally this also added `lxc.apparmor.raw` which can be used to inject > +# additional lines into the profile. We can use that to allow mounting > specific > +# file systems. > +sub make_apparmor_config { > + my ($conf, $unprivileged, $features) = @_; > + > + # user-configured profile has precedence, but first we go through our own > + # code to figure out whether we should warn the user: > + > + my $raw = "lxc.apparmor.profile = generated\n"; > + my @profile_uses; > + > + # There's lxc.apparmor.allow_nesting now, which will add the necessary > + # apparmor lines, create an apparmor namespace for the container, but > also > + # adds proc and sysfs mounts to /dev/.lxc/{proc,sys}. These do not have > + # lxcfs mounted over them, because that would prevent the container from > + # mounting new instances of them for nested containers. > + if ($features->{nesting}) { > + push @profile_uses, 'features:nesting'; > + $raw .= "lxc.apparmor.allow_nesting = 1\n" > + } else { > + # In the default profile in /etc/apparmor.d we patch this in because > + # otherwise a container can for example run `chown` on /sys, breaking > + # access to it for non-CAP_DAC_OVERRIDE tools on the host: > + $raw .= "lxc.apparmor.raw = deny mount -> /proc/,\n"; > + $raw .= "lxc.apparmor.raw = deny mount -> /sys/,\n"; > + # Preferably we could use the 'remount' flag but this does not sit well > + # with apparmor_parser currently: > + # mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/, > + } > + > + if (my $mount = $features->{mount}) { > + push @profile_uses, 'features:mount'; > + foreach my $fs (PVE::Tools::split_list($mount)) { > + $raw .= "lxc.apparmor.raw = mount fstype=$fs,\n"; > + } > + } > + > + # More to come? > + > + if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.apparmor.profile')) { > + if (length(my $used = join(', ', @profile_uses))) { > + warn "explicitly configured lxc.apparmor.profile overrides the > following settings: $used\n"; > + } > + return ''; > + } > + > + return $raw; > +} > + > sub update_lxc_config { > my ($vmid, $conf) = @_; > > @@ -386,7 +476,7 @@ sub update_lxc_config { > $raw .= "lxc.arch = $conf->{arch}\n"; > > my $unprivileged = $conf->{unprivileged}; > - my $custom_idmap = grep { $_->[0] eq 'lxc.idmap' } @{$conf->{lxc}}; > + my $custom_idmap = PVE::LXC::Config->has_lxc_entry($conf, 'lxc.idmap'); > > my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error"; > > @@ -398,9 +488,16 @@ sub update_lxc_config { > $inc = "$cfgpath/$ostype.userns.conf"; > $inc = "$cfgpath/userns.conf" if !-f $inc; > $raw .= "lxc.include = $inc\n"; > - $raw .= "lxc.seccomp.profile = $cfgpath/pve-userns.seccomp\n"; > } > > + my $features = PVE::LXC::Config->parse_features($conf->{features}); > + > + $raw .= make_seccomp_config($conf, $unprivileged || $custom_idmap, > + $features); > + > + $raw .= make_apparmor_config($conf, $unprivileged || $custom_idmap, > + $features);
what is the combination of '$unprivileged || $custom_idmap'`? In the methods called this parameter is called just '$unprivileged', so it's a bit confusing that it gets that also on true if an user/group ID mapping is used. maybe pull that out (with your addition this || construct is used in three places) call it something alike $usernamespaced or $use_userns? Looks OK besides that. > + > # WARNING: DO NOT REMOVE this without making sure that loop device nodes > # cannot be exposed to the container with r/w access (cgroup perms). > # When this is enabled mounts will still remain in the monitor's > namespace > diff --git a/src/PVE/LXC/Config.pm b/src/PVE/LXC/Config.pm > index a2693d7..9f6765e 100644 > --- a/src/PVE/LXC/Config.pm > +++ b/src/PVE/LXC/Config.pm > @@ -272,6 +272,41 @@ > PVE::JSONSchema::register_standard_option('pve-lxc-snapshot-name', { > maxLength => 40, > }); > > +my $features_desc = { > + mount => { > + optional => 1, > + type => 'string', > + description => "Allow mounting file systems of specific types." > + ." This should be a list of file system types as used with the > mount command." > + ." Note that this can have negative effects on the container's > security." > + ." With access to a loop device, mounting a file can circumvent the > mknod" > + ." permission of the devices cgroup, mounting an NFS file system > can" > + ." block the host's I/O completely and prevent it from rebooting, > etc.", > + format_description => 'fstype;fstype;...', > + pattern => qr/[a-zA-Z0-9; ]+/, > + }, > + nesting => { > + optional => 1, > + type => 'boolean', > + default => 0, > + description => "Allow nesting." > + ." Best used with unprivileged containers with additional id > mapping." > + ." Note that this will expose procfs and sysfs contents of the host" > + ." to the guest.", > + }, > + keyctl => { > + optional => 1, > + type => 'boolean', > + default => 0, > + description => "For unprivileged containers only: Allow the use of the > keyctl() system call." > + ." This is required to use docker inside a container." > + ." By default unprivileged containers will see this system call as > non-existent." > + ." This is mostly a workaround for systemd-networkd, as it will > treat it as a fatal" > + ." error when some keyctl() operations are denied by the kernel due > to lacking permissions." > + ." Essentially, you can choose between running systemd-networkd or > docker.", > + }, > +}; > + > my $confdesc = { > lock => { > optional => 1, > @@ -409,6 +444,12 @@ my $confdesc = { > description => "Makes the container run as unprivileged user. (Should > not be modified manually.)", > default => 0, > }, > + features => { > + optional => 1, > + type => 'string', > + format => $features_desc, > + description => "Allow containers access to advanced features.", > + }, > }; > > my $valid_lxc_conf_keys = { > @@ -872,6 +913,9 @@ sub update_pct_config { > } > } elsif ($opt eq 'unprivileged') { > die "unable to delete read-only option: '$opt'\n"; > + } elsif ($opt eq 'features') { > + next if $hotplug_error->($opt); > + delete $conf->{$opt}; > } else { > die "implement me (delete: $opt)" > } > @@ -1025,6 +1069,9 @@ sub update_pct_config { > } elsif ($opt eq 'ostype') { > next if $hotplug_error->($opt); > $conf->{$opt} = $value; > + } elsif ($opt eq 'features') { > + next if $hotplug_error->($opt); > + $conf->{$opt} = $value; > } else { > die "implement me: $opt"; > } > @@ -1176,6 +1223,12 @@ sub parse_lxc_network { > return $res; > } > > +sub parse_features { > + my ($class, $data) = @_; > + return {} if !$data; > + return PVE::JSONSchema::parse_property_string($features_desc, $data); > +} > + > sub option_exists { > my ($class, $name) = @_; > > _______________________________________________ pve-devel mailing list pve-devel@pve.proxmox.com https://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel