Re: [Devel] [PATCH rh7] ploop: use GFP_NOIO in ploop_make_request
On 31.08.2015 16:59, Konstantin Khorenko wrote: > Maxim, please review. > > Do we need the same in PCS6? yes, backport to rh6 is required because of https://bugs.openvz.org/browse/OVZ-6293 1294400.800014] INFO: task jbd2/ploop27926:1692 blocked for more than 120 seconds. [1294400.800121] Not tainted 2.6.32-042stab108.1 #1 [1294400.800177] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [1294400.800285] jbd2/ploop279 D 88081950e400 0 1692 2 0 0x0080 [1294400.800393] 88080dde4fe0 0046 88060dac1000 [1294400.800502] 00051200 88081b2c4ae0 0004991706322912 0002 [1294400.800615] 88080dde4fb8 00014d2b4e18 [1294400.800724] Call Trace: [1294400.800777] [] schedule_timeout+0x215/0x2e0 [1294400.800836] [] ? read_tsc+0x9/0x20 [1294400.800893] [] ? ktime_get_ts+0xb1/0xf0 [1294400.800951] [] ? read_tsc+0x9/0x20 [1294400.801008] [] ? ktime_get_ts+0xb1/0xf0 [1294400.801066] [] io_schedule_timeout+0x7f/0xd0 [1294400.801125] [] wait_for_completion_io+0xe4/0x120 [1294400.801185] [] ? default_wake_function+0x0/0x20 [1294400.801246] [] blkdev_issue_discard+0x216/0x230 [1294400.801305] [] scan_swap_map+0x385/0x640 [1294400.801362] [] get_swap_page+0x9d/0x140 [1294400.801420] [] add_to_swap+0x17/0x90 [1294400.801478] [] shrink_page_list.clone.0+0x2de/0x900 [1294400.801537] [] shrink_inactive_list+0x3be/0xb10 [1294400.801600] [] ? read_tsc+0x9/0x20 [1294400.801660] [] shrink_lruvec+0x430/0x600 [1294400.801718] [] shrink_zone+0x287/0x3d0 [1294400.801775] [] do_try_to_free_pages+0x588/0xa60 [1294400.801834] [] try_to_free_pages+0x8b/0x120 [1294400.801894] [] __alloc_pages_nodemask+0x671/0xb50 [1294400.801954] [] kmem_getpages+0x59/0x140 [1294400.802011] [] fallback_alloc+0x1bb/0x260 [1294400.802069] [] cache_alloc_node+0x99/0x150 [1294400.802128] [] kmem_cache_alloc+0x173/0x1e0 [1294400.802186] [] mempool_alloc_slab+0x15/0x20 [1294400.802244] [] mempool_alloc+0x67/0x170 [1294400.802302] [] ? 
kmem_getpages+0x59/0x140 [1294400.802361] [] bio_alloc_bioset+0x3e/0xf0 [1294400.802419] [] bio_alloc+0x15/0x30 [1294400.802478] [] ploop_make_request+0x5a8/0xa30 [ploop] [1294400.802538] [] generic_make_request+0x240/0x550 [1294400.802599] [] submit_bio+0x83/0x1c0 [1294400.802659] [] ? bio_alloc_bioset+0x5b/0xf0 [1294400.802719] [] submit_bh+0x11d/0x1e0 [1294400.802778] [] jbd2_journal_commit_transaction+0x5a8/0x1500 [jbd2] [1294400.802888] [] ? __switch_to+0x13d/0x320 [1294400.802946] [] ? try_to_del_timer_sync+0x7b/0xe0 [1294400.803008] [] kjournald2+0xb8/0x220 [jbd2] [1294400.803066] [] ? autoremove_wake_function+0x0/0x40 [1294400.803127] [] ? kjournald2+0x0/0x220 [jbd2] [1294400.803185] [] kthread+0x9e/0xc0 [1294400.803242] [] child_rip+0xa/0x20 [1294400.803298] [] ? kthread+0x0/0xc0 [1294400.803355] [] ? child_rip+0x0/0x20 > -- > Best regards, > > Konstantin Khorenko, > Virtuozzo Linux Kernel Team > > On 08/17/2015 04:30 PM, Vladimir Davydov wrote: >> Currently, we use GFP_NOFS, which may result in a dead lock as follows: >> >> filemap_fault >> do_mpage_readpage >>submit_bio >> generic_make_request initializes current->bio_list >> calls make_request_fn >> ploop_make_request >> bio_alloc(GFP_NOFS) >>kmem_cache_alloc >> memcg_charge_kmem >> try_to_free_mem_cgroup_pages >> swap_writepage >>generic_make_request puts bio on current->bio_list >> try_to-free_mem_cgroup_pages >> wait_on_page_writeback >> >> The wait_on_page_writeback will never complete then, because the >> corresponding bio is on current->bio_list and for it to get to the queue >> we must return from ploop_make_request first. 
>> >> The stack trace of a hung task: >> >> [] sleep_on_page+0xe/0x20 >> [] wait_on_page_bit+0x86/0xb0 >> [] shrink_page_list+0x6e2/0xaf0 >> [] shrink_inactive_list+0x1cb/0x610 >> [] shrink_lruvec+0x395/0x790 >> [] shrink_zone+0x181/0x350 >> [] do_try_to_free_pages+0x170/0x530 >> [] try_to_free_mem_cgroup_pages+0xb6/0x140 >> [] __mem_cgroup_try_charge+0x1de/0xd70 >> [] memcg_charge_kmem+0x9b/0x100 >> [] __memcg_charge_slab+0x3b/0x90 >> [] new_slab+0x264/0x3f0 >> [] __slab_alloc+0x315/0x48f >> [] kmem_cache_alloc+0x1cc/0x210 >> [] mempool_alloc_slab+0x15/0x20 >> [] mempool_alloc+0x69/0x170 >> [] bvec_alloc+0x92/0x120 >> [] bio_alloc_bioset+0x1e8/0x2e0 >> [] ploop_make_request+0x2a6/0xac0 [ploop] >> [] generic_make_request+0xe2/0x130 >> [] submit_bio+0x77/0x1c0 >> [] do_mpage_readpage+0x37f/0x6e0 >> [] mpage_readpages+0xeb/0x160 >> [] ext4_readpages+0x3c/0x40 [ext4] >> [] __do_page_cache_readahead+0x1e0/0x260 >> [] ra_submit+0x21/0x30 >> [] filemap_fault+0x321/0x4b0 >> [] __do_fault+0x8a/0x560 >> [] handle_mm_fault+0x3d0/0xd80 >> [] __do_page_fault+0x15e/0x530 >> [] do_page_fault+0x1a/0x70 >> [] page_fault+0x28/0x30 >> >>
Re: [Devel] [PATCH rh7] ploop: use GFP_NOIO in ploop_make_request
Yes, I think so. On 08/31/2015 06:59 AM, Konstantin Khorenko wrote: Maxim, please review. Do we need the same in PCS6? -- Best regards, Konstantin Khorenko, Virtuozzo Linux Kernel Team On 08/17/2015 04:30 PM, Vladimir Davydov wrote: Currently, we use GFP_NOFS, which may result in a dead lock as follows: filemap_fault do_mpage_readpage submit_bio generic_make_request initializes current->bio_list calls make_request_fn ploop_make_request bio_alloc(GFP_NOFS) kmem_cache_alloc memcg_charge_kmem try_to_free_mem_cgroup_pages swap_writepage generic_make_request puts bio on current->bio_list try_to-free_mem_cgroup_pages wait_on_page_writeback The wait_on_page_writeback will never complete then, because the corresponding bio is on current->bio_list and for it to get to the queue we must return from ploop_make_request first. The stack trace of a hung task: [] sleep_on_page+0xe/0x20 [] wait_on_page_bit+0x86/0xb0 [] shrink_page_list+0x6e2/0xaf0 [] shrink_inactive_list+0x1cb/0x610 [] shrink_lruvec+0x395/0x790 [] shrink_zone+0x181/0x350 [] do_try_to_free_pages+0x170/0x530 [] try_to_free_mem_cgroup_pages+0xb6/0x140 [] __mem_cgroup_try_charge+0x1de/0xd70 [] memcg_charge_kmem+0x9b/0x100 [] __memcg_charge_slab+0x3b/0x90 [] new_slab+0x264/0x3f0 [] __slab_alloc+0x315/0x48f [] kmem_cache_alloc+0x1cc/0x210 [] mempool_alloc_slab+0x15/0x20 [] mempool_alloc+0x69/0x170 [] bvec_alloc+0x92/0x120 [] bio_alloc_bioset+0x1e8/0x2e0 [] ploop_make_request+0x2a6/0xac0 [ploop] [] generic_make_request+0xe2/0x130 [] submit_bio+0x77/0x1c0 [] do_mpage_readpage+0x37f/0x6e0 [] mpage_readpages+0xeb/0x160 [] ext4_readpages+0x3c/0x40 [ext4] [] __do_page_cache_readahead+0x1e0/0x260 [] ra_submit+0x21/0x30 [] filemap_fault+0x321/0x4b0 [] __do_fault+0x8a/0x560 [] handle_mm_fault+0x3d0/0xd80 [] __do_page_fault+0x15e/0x530 [] do_page_fault+0x1a/0x70 [] page_fault+0x28/0x30 https://jira.sw.ru/browse/PSBM-38842 Signed-off-by: Vladimir Davydov--- drivers/block/ploop/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 30eb8a7551e5..f37df4dacf8c 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -717,7 +717,7 @@ preallocate_bio(struct bio * orig_bio, struct ploop_device * plo) } if (nbio == NULL) -nbio = bio_alloc(GFP_NOFS, max(orig_bio->bi_max_vecs, block_vecs(plo))); +nbio = bio_alloc(GFP_NOIO, max(orig_bio->bi_max_vecs, block_vecs(plo))); return nbio; } @@ -852,7 +852,7 @@ static void ploop_make_request(struct request_queue *q, struct bio *bio) if (!current->io_context) { struct io_context *ioc; -ioc = get_task_io_context(current, GFP_NOFS, NUMA_NO_NODE); +ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE); if (ioc) put_io_context(ioc); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC rh7 v6] ve/tty: vt -- Implement per VE support for console and terminals
On Mon, Aug 31, 2015 at 10:51:02PM +0300, Cyrill Gorcunov wrote: > On Mon, Aug 31, 2015 at 10:36:25PM +0300, Vladimir Davydov wrote: > > > > Yeah, you're right. But then we diverge from pty ref counting design, > > which isn't good IMO. To fix that, let's increment slave's count twice > > on vtty_install, once for file and once for master reference. Then we > > If we increment it twice we have to carry extra-ref bit (which I've > dropped off) otherwise if no one has opened master peer the tty engine > will complain that the number of fd != count. What's worse is that > if no one ever has opened master we have to decrement it back together > with extra ref to be able to close the peer. I mean > > 1) Create pair inside container via slave > > slave:2 (1 - read fd + extra ref) master:0 > > no one opens master and we're calling tty_release, then in > vtty_close we have to drop the extra ref together with count--, > but if we do so there is still a race window after tty_unlock. I see. Then it's OK. > > > will only have to decrement slave's count right after tty_init_dev, > > which is safe, because we're holding tty_lock there. Sounds OK? > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RFC rh7 v7] ve/tty: vt -- Implement per VE support for console and terminals
Previously in commit 8674c044330fad1458bd59b02f9037fb97e8b7af stubs for virtual terminals have been added, they support writes from kernel side which are simply dropped into the void. In the patch the code has been moved from kernel/ve/console.c to drivers/tty/pty.c to reuse a couple of pty helpers. Now we support up to MAX_NR_VTTY_CONSOLES virtual consoles inside container. For /dev/console we reserve the first virtual terminal. Some details on the driver itself: - The driver carries per-VE tty instances in @vtty_idr map, once VE tries to open a terminal we allocate tty map internally and keep it intact until the VE is destroyed, this allows us to not bind into device namespaces (i.e. not rely on tty_class); - Unlike buffered IO to unix98 driver once the internal port buffer gets full we don't block write operations if there is no reader assigned yet but zap them. This is done intentionally to behave closely to native consoles; - The kernel chooses which VE requests a terminal using get_exec_env helper, but for opening master peer from the node's ve0 it uses vtty_set_context/vtty_get_context/vtty_drop_context to notify tty layer which @vtty_idr to use instead of get_exec_env. 
https://jira.sw.ru/browse/PSBM-34533 https://jira.sw.ru/browse/PSBM-34532 https://jira.sw.ru/browse/PSBM-34107 https://jira.sw.ru/browse/PSBM-32686 https://jira.sw.ru/browse/PSBM-32685 v2: - Rename terminals from vtz to vtty - Merge code into /drivers/tty/pty.c to reuse some of pty functionality - Get rid of two array of indices, use one for master peers and fetch slaves via @link - Drop TTY_VT_OPEN and wait() on it - Add vtty_open_slave helper v3: - Reverse the scheme, the peers opened from inside of container are the slave peers as it were in pcs6 - Add vtty_set_context/vtty_drop_context/vtty_get_context to open needed tty from ve0 context - In vtty_open_master reuse existing vtty_lookup, vtty_open helpers - In ve_vtty_fini zap active tty tracking, such ttys are sitting here because the node has been opening the console and didn't release file descriptors yet with tty associated. The kernel will clean them up once they are closed but the tacking map pointer should be zapped to escape nil dereference v4: - Use lockdep_assert_held in vtty @map operations to make sure we're under @tty_mutex - vtty_install now requests for port memory earlier for vtty_install_peer simplification - Drop tty_vhangup call from vtty_close, as been found it doesn't bring any benefit - Drop TTY_BUFFER_PAGE and fix typo in vtty_write_room - Rework tty counting to be the same as in pcs6: drivers became TTY_DRIVER_TYPE_PTY and @count adjusted accordingly v5: - Treat zero as unused flag in vtty_get_context - vtty_printk helpers are dropped off - Don't test for exit state in lookup procedure: the kernel will do that on its own when slave is opened from inside of a container and for ioctl call we do such test explicitly - When pair is to open from the node and the existing peer is exiting we're allocating new pair early removin old one from per-VE ttys map, this is done to speedup open from the node - vtty_match is no longer exported into the rest of the tty code - When peer is to be closed we 
use own per-VE spinlock to read and modify own and peer counters, this is because the general tty->close routine is called without tty-mutex held and only one peer is locked thus such modifications are unsafe if do them locklessly. In current vanilla kernel there is no need for such lock if Unix ptys are used because master peers are always opened first and always get closed in constrast to the our driver where any peer end may be opened sole v6: - Reworked tty counting: no need for extra reference but make it close to how native Unix98 ptys are working: once master is opened it takes new TTY_PINNED flags and when it getting closed with active slave peer we defer tty destruction until both ends are spare. v7: - Move MAX_NR_VTTY_CONSOLES from header into pty.c - Drop vtty_zap_tty_map - Assign @driver_data in vtty_map_set - Rename vtty_map_del to vtty_map_clear - Merge map cleaning into vtty_map_free - Rename @current_veid to @vtty_context_veid - Rename TTY_PINNED to TTY_PINNED_BY_OTHER - Assing TTY_PINNED_BY_OTHER early in pair creation - Wake both ends of a peer in vtty_close FIXME: Once this is applied need to drop kernel/ve/coinsole.c from the source tree, dropping it immediately ruines my queue series, because there are other patches not yet merged but changing kernel/ve/console.c code. Signed-off-by: Cyrill GorcunovCC: Vladimir Davydov CC: Konstantin Khorenko --- drivers/tty/pty.c| 510 +++ drivers/tty/tty_io.c | 36 +-- include/linux/tty.h |3 include/linux/ve.h | 20 -- kernel/ve/Makefile |
[Devel] [PATCH RHEL7 COMMIT] ve/cgroup: fix mangle root in CT
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit 1518ff8ef0a78d8be1b19774506f355424103e9a Author: Pavel TikhomirovDate: Tue Sep 1 16:13:30 2015 +0400 ve/cgroup: fix mangle root in CT cgroups with depth level more than 2 were not mangled inside a container, that might caused problems with docker, docker were able to see in /proc/self/cgroup paths relative to host. But it is not docker specific: CT-103 /# mkdir /sys/fs/cgroup/devices/test.slice CT-103 /# mkdir /sys/fs/cgroup/devices/test.slice/test.scope CT-103 /# sleep 1000& [1] 578 CT-103 /# echo 578 > /sys/fs/cgroup/devices/test.slice/test.scope/tasks with patch: CT-103 /# cat /proc/578/cgroup 16:ve:/ 15:hugetlb:/ 14:perf_event:/ 12:net_cls:/ 11:freezer:/ 10:devices:/test.slice/test.scope 6:name=systemd:/user-0.slice/session-c109.scope 5:cpuset:/ 4:cpuacct,cpu:/ 3:beancounter:/ 2:memory:/ 1:blkio:/ without: CT-103 /# cat /proc/480/cgroup 16:ve:/ 15:hugetlb:/ 14:perf_event:/ 12:net_cls:/ 11:freezer:/ 10:devices:/103/test.slice/test.scope 6:name=systemd:/user.slice/user-0.slice/session-c2.scope 5:cpuset:/ 4:cpuacct,cpu:/ 3:beancounter:/ 2:memory:/ 1:blkio:/ https://jira.sw.ru/browse/PSBM-38634 Signed-off-by: Pavel Tikhomirov Reviewed-by: Cyrill Gorcunov khorenko@: this fix is quite inflexible, if we move CTs into machine.slice, we have to rework it. But i accept it because we are still not sure with final cgroups "virtualization" implementation => less work right now which can be later dropped. 
--- kernel/cgroup.c | 35 --- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d96176e..a07c4e0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1808,6 +1808,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { int ret = -ENAMETOOLONG; char *start; + struct ve_struct *ve = get_exec_env(); if (!cgrp->parent) { if (strlcpy(buf, "/", buflen) >= buflen) @@ -1815,21 +1816,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } -#ifdef CONFIG_VE - /* -* Containers cgroups are bind-mounted from node -* so they are like '/' from inside, thus we have -* to mangle cgroup path output. -*/ - if (!ve_is_super(get_exec_env())) { - if (cgrp->parent && !cgrp->parent->parent) { - if (strlcpy(buf, "/", buflen) >= buflen) - return -ENAMETOOLONG; - return 0; - } - } -#endif - start = buf + buflen - 1; *start = '\0'; @@ -1838,6 +1824,25 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) const char *name = cgroup_name(cgrp); int len; +#ifdef CONFIG_VE + if (!ve_is_super(ve) && cgrp->parent && !cgrp->parent->parent) { + /* +* Containers cgroups are bind-mounted from node +* so they are like '/' from inside, thus we have +* to mangle cgroup path output. Effectively it is +* enough to remove two topmost cgroups from path. +* e.g. in ct 101: /101/test.slice/test.scope -> +* /test.slice/test.scope +*/ + if (*start != '/') { + if (--start < buf) + goto out; + *start = '/'; + } + break; + } +#endif + len = strlen(name); if ((start -= len) < buf) goto out; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC rh7 v7] ve/tty: vt -- Implement per VE support for console and terminals
On Tue, Sep 01, 2015 at 02:59:06PM +0300, Vladimir Davydov wrote: > > > > Signed-off-by: Cyrill Gorcunov > > CC: Vladimir Davydov > > CC: Konstantin Khorenko > > Reviewed-by: Vladimir Davydov HUGE thanks for all your feedback! ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] mmap: call mmap prep only for regular files
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit 1e596ab0358ff8dde342efb6274e08459d08a711 Author: Vladimir DavydovDate: Tue Sep 1 16:16:59 2015 +0400 mmap: call mmap prep only for regular files Port 2.6.32-x diff-mm-mmap-call-mmap-prep-only-for-regular-files We forgot to port this patch. This results in KP on an attempt to mmap a char device on ext4. = Author: Vladimir Davydov Email: vdavy...@parallels.com Subject: mmap: call mmap prep only for regular files Date: Mon, 17 Feb 2014 12:59:36 +0400 To give FS a chance to clear pfcache csum on shared mmap, we issue ->mmap(vma=NULL) for those FS's that want it (FS_HAS_MMAP_PREP) before taking mmap_sem (we can't do it under mmap_sem due to lockdep, see PSBM-23133). There we haven't checked arguments properly yet. In particular, the file can refer to a device, in which case we will crash, because devices' ->mmap (e.g. /dev/zero) is not supposed to be called with vma=NULL. Fix this by checking if the file refers to a regular file before calling mmap prep for it. https://bugzilla.openvz.org/show_bug.cgi?id=2886 https://jira.sw.ru/browse/PSBM-25031 Signed-off-by: Vladimir Davydov Acked-by: Dmitry Monakhov = Reported-by: Andrew Perepechko Signed-off-by: Vladimir Davydov Cc: Andrew Perepechko Cc: Alex Lyashkov Cc: Igor Seletskiy --- mm/util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/util.c b/mm/util.c index 31cd9d7..e0ac8ae 100644 --- a/mm/util.c +++ b/mm/util.c @@ -367,6 +367,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, if (!ret) { /* Ugly fix for PSBM-23133 vdavydov@ */ if (file && file->f_op && (flag & MAP_TYPE) == MAP_SHARED && + S_ISREG(file_inode(file)->i_mode) && (file_inode(file)->i_sb->s_type->fs_flags & FS_HAS_MMAP_PREP)) file->f_op->mmap(file, NULL); down_write(>mmap_sem); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/userns: Allow PR_CAPBSET_DROP in a user namespace.
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit 038113017ff594bbc49c365d48a0f3ec4f14ea8b Author: Eric W. BiedermanDate: Tue Sep 1 18:50:40 2015 +0400 ms/userns: Allow PR_CAPBSET_DROP in a user namespace. ms commit: 160da84dbb39443fdade7151bc63a88f8e953077 As the capabilites and capability bounding set are per user namespace properties it is safe to allow changing them with just CAP_SETPCAP permission in the user namespace. Acked-by: Serge Hallyn Tested-by: Richard Weinberger Signed-off-by: "Eric W. Biederman" https://jira.sw.ru/browse/PSBM-39077 Signed-off-by: Andrew Vagin --- security/commoncap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/commoncap.c b/security/commoncap.c index 3d7811d..59ff538 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -852,7 +852,7 @@ static int cap_prctl_drop(unsigned long cap) { struct cred *new; - if (!capable(CAP_SETPCAP)) + if (!ns_capable(current_user_ns(), CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/fs: allow to mount devtmpfs in a non-root userns
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit cb03dcae8c9bf4e2d6d39ca82d8ead1b153d9205 Author: Andrew VaginDate: Tue Sep 1 18:55:49 2015 +0400 ve/fs: allow to mount devtmpfs in a non-root userns devtmpfs is virtualized, so it has to be secure. https://jira.sw.ru/browse/PSBM-39077 Signed-off-by: Andrew Vagin Reviewed-by: Vladimir Davydov ` --- drivers/base/devtmpfs.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index daf97ee..9f3809c 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -105,6 +105,9 @@ static struct dentry *ve_dev_mount(struct file_system_type *fs_type, int flags, static struct dentry *dev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { + if (get_exec_env()->init_cred->user_ns != current_user_ns()) + return ERR_PTR(-EPERM); + #ifdef CONFIG_VE if (!ve_is_super(get_exec_env())) return ve_dev_mount(fs_type, flags, dev_name, data); @@ -120,7 +123,7 @@ static struct file_system_type dev_fs_type = { .name = "devtmpfs", .mount = dev_mount, .kill_sb = kill_litter_super, - .fs_flags = FS_VIRTUALIZED, + .fs_flags = FS_VIRTUALIZED | FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, }; #ifdef CONFIG_BLOCK ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] cred: add ve_capable to check capabilities relative to the current VE (v2)
We want to allow a few operations in VE. Currently we use nsown_capable, but it's wrong, because in this case we allow these operations in any user namespace. v2: take ve0->cred if the currect ve isn't running Signed-off-by: Andrew Vagin--- fs/autofs4/root.c |6 ++ fs/ioprio.c|2 +- fs/namei.c |2 +- include/linux/capability.h |1 + kernel/capability.c| 20 kernel/printk.c|5 ++--- net/ipv6/sit.c |2 +- net/netfilter/nf_sockopt.c |2 +- security/commoncap.c |4 ++-- security/device_cgroup.c |4 ++-- 10 files changed, 33 insertions(+), 15 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 68e3edb..1462d8b 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -588,8 +588,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && - !capable(CAP_VE_SYS_ADMIN)) + if (!autofs4_oz_mode(sbi) && !ve_capable(CAP_SYS_ADMIN)) return -EPERM; if (atomic_dec_and_test(>count)) { @@ -837,8 +836,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && - !capable(CAP_VE_SYS_ADMIN)) + if (!autofs4_oz_mode(sbi) && !ve_capable(CAP_SYS_ADMIN)) return -EPERM; switch(cmd) { diff --git a/fs/ioprio.c b/fs/ioprio.c index c876fad..f9d9187 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -75,7 +75,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) switch (class) { case IOPRIO_CLASS_RT: - if (!capable(CAP_VE_ADMIN)) + if (!ve_capable(CAP_SYS_ADMIN)) return -EPERM; class = IOPRIO_CLASS_BE; data = 0; diff --git a/fs/namei.c b/fs/namei.c index 8e29a44..e7d9f54 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3397,7 +3397,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && 
!nsown_capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !ve_capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) diff --git a/include/linux/capability.h b/include/linux/capability.h index 2b77384..b1131e3 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -217,6 +217,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool nsown_capable(int cap); +extern bool ve_capable(int cap); extern bool inode_capable(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); diff --git a/kernel/capability.c b/kernel/capability.c index 0a843d5..4a73381 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * Leveraged for setting/resetting capabilities @@ -396,6 +397,25 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); +#if CONFIG_VE +bool ve_capable(int cap) +{ + struct cred *cred = get_exec_env()->init_cred; + + if (cred == NULL) /* ve isn't running */ + cred = ve0.init_cred; + + return ns_capable(cred->user_ns, cap); +} +#else +bool ve_capable(int cap) +{ + return capable(cap); +} +#endif + +EXPORT_SYMBOL_GPL(ve_capable); + /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check diff --git a/kernel/printk.c b/kernel/printk.c index 44b3783..91766fc 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -468,14 +468,13 @@ static int check_syslog_permissions(int type, bool from_file) return 0; if (syslog_action_restricted(type)) { - if (nsown_capable(CAP_SYSLOG)) + if (ve_capable(CAP_SYSLOG)) return 0; /* * For historical reasons, accept CAP_SYS_ADMIN too, with * a warning. 
*/ - if (nsown_capable(CAP_SYS_ADMIN) || - nsown_capable(CAP_VE_ADMIN)) { + if (ve_capable(CAP_SYS_ADMIN)) { pr_warn_once("%s (%d): Attempt to access syslog with " "CAP_SYS_ADMIN but no CAP_SYSLOG "
[Devel] [PATCH RHEL7 COMMIT] ve/kmod: allow to autoload nf_log_ipv[46]
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit 778f99b62e2e3e68d1d5f2fa4dec3adcbe524c51 Author: Vladimir DavydovDate: Tue Sep 1 17:16:58 2015 +0400 ve/kmod: allow to autoload nf_log_ipv[46] These modules are required for adding the LOG iptables target. In PCS6 the LOG target is provided by ipt_LOG (in case of ipv4) or ip6t_LOG (in case of ipv6) module. In Vz7 it is split between xt_LOG (generic) and nf_log_ipv[46] (ipv[46] specific) with ipt_LOG and ip6t_LOG being aliases for xt_LOG. As in PCS6, in Vz7 we load ip{,6}t_LOG on IP{,6}T_SO_SET_REPLACE setsockopt, but in contrast to PCS6, this actually loads xt_LOG containing only generic implementation. The ipv[46] part is loaded by the xt_target->checkentry virtual method, which is log_tg_check in case of xt_LOG. The log_tg_check function loads the modules by name "nf-logger--". Since the type is 0 for ipv[46] LOG target, we should allow to load the following modules from inside a container: - nf-logger-2-0 for ipv4 (AF_INET) - nf-logger-10-0 for ipv6 (AF_INET6) https://jira.sw.ru/browse/PSBM-38573 Signed-off-by: Vladimir Davydov Acked-by: Kirill Tkhai --- kernel/kmod.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index aa5cb99..d0cdf36 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -294,6 +294,8 @@ static struct { { "nft-expr-2-reject", VE_IP_IPTABLES }, { "nft-expr-10-reject", VE_IP_IPTABLES6 }, + { "nf-logger-2-0", VE_IP_IPTABLES }, + { "nf-logger-10-0", VE_IP_IPTABLES6 }, }; /* ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/sched: Fix double put_prev_task_fair() because of trigger_cpulimit_balance()
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit 315dcacbe9e305082a46c9d2ea585d83576efae9 Author: Kirill TkhaiDate: Tue Sep 1 17:08:03 2015 +0400 ve/sched: Fix double put_prev_task_fair() because of trigger_cpulimit_balance() The scheduller code is written with the assumption, that rq->curr task can't be already put. For example, in sched_move_task() we check for running = task_current(rq, tsk); and call put_prev_task() if "running" is true. When we're unlocking rq->lock in trigger_cpulimit_balance(), the task has already been put, so concurrent cpu_cgroup_attach_task()->sched_move_task() puts it one more time. https://jira.sw.ru/browse/PSBM-35082 Signed-off-by: Kirill Tkhai --- kernel/sched/fair.c | 36 ++-- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 167d0f6..3092f76 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5068,18 +5068,16 @@ static inline void trigger_cpulimit_balance(struct task_struct *p) int this_cpu, cpu, target_cpu = -1; struct sched_domain *sd; - if (!p->se.on_rq) - return; - this_rq = rq_of(cfs_rq_of(>se)); this_cpu = cpu_of(this_rq); + if (!p->se.on_rq || this_rq->active_balance) + return; + cfs_rq = top_cfs_rq_of(>se); if (check_cpulimit_spread(cfs_rq, this_cpu) >= 0) return; - raw_spin_unlock(_rq->lock); - rcu_read_lock(); for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -5096,17 +5094,14 @@ static inline void trigger_cpulimit_balance(struct task_struct *p) unlock: rcu_read_unlock(); - raw_spin_lock(_rq->lock); if (target_cpu >= 0) { - if (!this_rq->active_balance) { - this_rq->active_balance = 1; - this_rq->push_cpu = target_cpu; - raw_spin_unlock(_rq->lock); - stop_one_cpu_nowait(this_cpu, - cpulimit_balance_cpu_stop, this_rq, - _rq->active_balance_work); - raw_spin_lock(_rq->lock); - } + this_rq->active_balance = 1; + 
this_rq->push_cpu = target_cpu; + raw_spin_unlock(_rq->lock); + stop_one_cpu_nowait(this_rq->cpu, + cpulimit_balance_cpu_stop, this_rq, + _rq->active_balance_work); + raw_spin_lock(_rq->lock); } } #else @@ -5127,8 +5122,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) cfs_rq = cfs_rq_of(se); put_prev_entity(cfs_rq, se); } - - trigger_cpulimit_balance(prev); } /* @@ -5787,7 +5780,8 @@ static int cpulimit_balance_cpu_stop(void *data) raw_spin_lock_irq(>lock); - if (unlikely(cpu != smp_processor_id() || !rq->active_balance)) + if (unlikely(cpu != smp_processor_id() || !rq->active_balance || +!cpu_online(target_cpu))) goto out_unlock; if (unlikely(!rq->nr_running)) @@ -7269,6 +7263,11 @@ out_unlock: return 0; } +static void pre_schedule_fair(struct rq *rq, struct task_struct *prev) +{ + trigger_cpulimit_balance(prev); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -8171,6 +8170,7 @@ const struct sched_class fair_sched_class = { .rq_offline = rq_offline_fair, .task_waking= task_waking_fair, + .pre_schedule = pre_schedule_fair, #endif .set_curr_task = set_curr_task_fair, ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/fs/inotify: do not impose limit on the number of instances by default
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit db1716aedfb68f9e479c352b1758447fdd0799c5 Author: Vladimir DavydovDate: Tue Sep 1 17:40:59 2015 +0400 ve/fs/inotify: do not impose limit on the number of instances by default In Vz7 we haven't switched to user ns yet. As a result, all containers use the same user_struct for the same user id. This leads to hitting fs.inotify.max_user_instances sysctl limit quickly (it equals 128 by default) and failing to start a container. This patch sets the default limit to INT_MAX. This is a temporary solution and should be reverted once we start using user ns. In PCS6 there is no such problem, because we actually create a user ns per container there. Although its functionality is basic in comparison to Vz7, it still results in creating a new user_struct for each user inside a container so that the inotify limit is containerized. https://jira.sw.ru/browse/PSBM-39048 Signed-off-by: Vladimir Davydov khorenko@: to be reverted once we support userns in Virtuozzo 7 --- fs/notify/inotify/inotify_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 959815c..95d5ebf 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -844,8 +844,8 @@ static int __init inotify_user_setup(void) event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); inotify_max_queued_events = 16384; - inotify_max_user_instances = 128; - inotify_max_user_watches = 8192; + inotify_max_user_instances = INT_MAX; + inotify_max_user_watches = INT_MAX; return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] cred: add ve_capable to check capabilities relative to the current VE (v2)
On Tue, Sep 01, 2015 at 04:59:59PM +0400, Andrew Vagin wrote: > We want to allow a few operations in VE. Currently we use nsown_capable, > but it's wrong, because in this case we allow these operations in any > user namespace. > > v2: take ve0->cred if the current ve isn't running > > Signed-off-by: Andrew Vagin Reviewed-by: Vladimir Davydov ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/sysfs: propagate access to all sub-directories hierarchically
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit 1a33dd2936a14ae4fbede97c34ac5fbfc3dc8f30 Author: Pavel TikhomirovDate: Tue Sep 1 17:58:42 2015 +0400 ve/sysfs: propagate access to all sub-directories hierarchically Docker test TestGetContainerStats asks docker to get network stats of container. Docker tries to get network devices stats like count of bytes or packets sent/received from: /sys/class/net/[device_name]/statistics/* We showed only statistics directory but not it's content as namespace tag check went up only for one level. So make this check fully hierarchical for docker. https://jira.sw.ru/browse/PSBM-34523 Signed-off-by: Pavel Tikhomirov Reviewed-by: Vladimir Davydov --- fs/sysfs/dir.c | 10 +++--- fs/sysfs/inode.c | 8 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index da73287..b4a7fda 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -77,14 +77,18 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left, static bool sysfs_sd_visible(struct sysfs_dirent *sd, struct super_block *sb) { struct ve_struct *ve = sysfs_info(sb)->ve; + struct sysfs_dirent *tmp_sd = sd; /* Host sees anything */ if (ve_is_super(ve)) return true; - /* Entries with namespace tag always visible */ - if (sd->s_ns || (sd->s_parent && sd->s_parent->s_ns)) - return true; + /* Entries with namespace tag and their sub-entries always visible */ + while (tmp_sd) { + if (tmp_sd->s_ns) + return true; + tmp_sd = tmp_sd->s_parent; + } /* Symlinks are visible if target sd is visible */ if (sysfs_type(sd) == SYSFS_KOBJ_LINK) diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 45f5212..7e54859 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -345,13 +345,17 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha static int sysfs_sd_permission(struct sysfs_dirent *sd, int mask) { struct 
ve_struct *ve = get_exec_env(); + struct sysfs_dirent *tmp_sd = sd; int perm; if (ve_is_super(ve)) return 0; - if (sd->s_ns || (sd->s_parent && sd->s_parent->s_ns)) - return 0; + while (tmp_sd) { + if (tmp_sd->s_ns) + return 0; + tmp_sd = tmp_sd->s_parent; + } if (sysfs_type(sd) == SYSFS_KOBJ_LINK) sd = sd->s_symlink.target_sd; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/sched: Fix schedule_tail() to disable preemption
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit b1d861eea2c7217a8bd6162b93c5a4356d822844 Author: Kirill TkhaiDate: Tue Sep 1 17:08:00 2015 +0400 ms/sched: Fix schedule_tail() to disable preemption Porting ms commit 1a43a14a5bd9c32dbd7af35e35a5afa703944bcb by Oleg Nesterov: finish_task_switch() enables preemption, so post_schedule(rq) can be called on the wrong (and even dead) CPU. Afaics, nothing really bad can happen, but in this case we can wrongly clear rq->post_schedule on that CPU. And this simply looks wrong in any case. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Steven Rostedt Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20141008193644.ga32...@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Kirill Tkhai --- kernel/sched/core.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bbb6fc3..30f39a25 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2245,20 +2245,18 @@ static inline void post_schedule(struct rq *rq) asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); + struct rq *rq; +#ifndef __ARCH_WANT_UNLOCKED_CTXSW + /* finish_task_switch() drops rq->lock and enables preemtion */ + preempt_disable(); +#endif + rq = this_rq(); finish_task_switch(rq, prev); - /* -* FIXME: do we need to worry about rq being invalidated by the -* task_switch? -*/ post_schedule(rq); - -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); -#endif + if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/cgroup: do not virtualize output of cgroup_path
The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.6.5 --> commit 607a3ac1497d796f267f72c4af27dbfd9d0cc3f0 Author: Vladimir Davydov Date: Tue Sep 1 17:38:00 2015 +0400 ve/cgroup: do not virtualize output of cgroup_path When cgroup_path() is called from inside a container, its output is "virtualized", i.e. cgroup /CTID/A/B is reported as /A/B. This was done for userspace tools to not get confused by the output of some proc files (namely, /proc/PID/{cgroup,cpuset}). However, it is wrong to virtualize cgroup_path() anytime it is called by a container. For instance, it is called from inside a container on OOM in order to dump memcg info to the system log, in which case mangling its output would be incorrect. Therefore this patch makes cgroup_path() always return an absolute path. To get a container-relative path, one should now use cgroup_path_ve(). Currently, cgroup_path_ve() is only used for /proc files output (it seems to be enough for now).
https://jira.sw.ru/browse/PSBM-34852 Signed-off-by: Vladimir Davydov Acked-by: Cyrill Gorcunov --- include/linux/cgroup.h | 1 + kernel/cgroup.c| 17 + kernel/cpuset.c| 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 44b64c9..ed5e6ac 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -561,6 +561,7 @@ int cgroup_is_removed(const struct cgroup *cgrp); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); +int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a07c4e0..ad61c97 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1804,11 +1804,10 @@ static struct kobject *cgroup_kobj; * inode's i_mutex, while on the other hand cgroup_path() can be called * with some irq-safe spinlocks held. */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) +int __cgroup_path(const struct cgroup *cgrp, char *buf, int buflen, bool virt) { int ret = -ENAMETOOLONG; char *start; - struct ve_struct *ve = get_exec_env(); if (!cgrp->parent) { if (strlcpy(buf, "/", buflen) >= buflen) @@ -1825,7 +1824,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) int len; #ifdef CONFIG_VE - if (!ve_is_super(ve) && cgrp->parent && !cgrp->parent->parent) { + if (virt && cgrp->parent && !cgrp->parent->parent) { /* * Containers cgroups are bind-mounted from node * so they are like '/' from inside, thus we have @@ -1860,8 +1859,18 @@ out: rcu_read_unlock(); return ret; } + +int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) +{ + return __cgroup_path(cgrp, buf, buflen, false); +} EXPORT_SYMBOL_GPL(cgroup_path); +int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen) +{ + return __cgroup_path(cgrp, buf, buflen, !ve_is_super(get_exec_env())); +} + /* * 
Control Group taskset */ @@ -4927,7 +4936,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); + retval = cgroup_path_ve(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; seq_puts(m, buf); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2400c4e..81030b3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2697,7 +2697,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) rcu_read_lock(); css = task_subsys_state(tsk, cpuset_subsys_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + retval = cgroup_path_ve(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) goto out_put_task; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel