[PATCH 2/4] cgroup: remove redundant code in cgroup_rmdir()

2014-09-17 Thread Li Zefan
We no longer clear kn->priv in cgroup_rmdir(), so we don't need
to get an extra refcnt.

Signed-off-by: Zefan Li 
---
 kernel/cgroup.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0ce9d9e..26b8cb9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4841,13 +4841,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
cgrp = cgroup_kn_lock_live(kn);
if (!cgrp)
return 0;
-   cgroup_get(cgrp);   /* for @kn->priv clearing */
 
ret = cgroup_destroy_locked(cgrp);
 
cgroup_kn_unlock(kn);
-
-   cgroup_put(cgrp);
return ret;
 }
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/4] cgroup: reuse css->destroy_work for release agent

2014-09-17 Thread Li Zefan
Currently we use a global work to schedule release agent on removable
cgroups. We can change to reuse css->destroy_work to do this, which
saves a few lines of code.

Signed-off-by: Zefan Li 
---
 include/linux/cgroup.h |   7 
 kernel/cgroup.c| 108 ++---
 2 files changed, 39 insertions(+), 76 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f7898e0..97da407 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -234,13 +234,6 @@ struct cgroup {
struct list_head e_csets[CGROUP_SUBSYS_COUNT];
 
/*
-* Linked list running through all cgroups that can
-* potentially be reaped by the release agent. Protected by
-* release_list_lock
-*/
-   struct list_head release_list;
-
-   /*
 * list of pidlists, up to two for each namespace (one for procs, one
 * for tasks); created on demand.
 */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1abb554..5b6566c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -392,12 +392,7 @@ static int notify_on_release(const struct cgroup *cgrp)
;   \
else
 
-/* the list of cgroups eligible for automatic release. Protected by
- * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
 static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
 /*
@@ -1577,7 +1572,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
-   INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
@@ -1587,6 +1581,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
init_waitqueue_head(&cgrp->offline_waitq);
+   INIT_WORK(&cgrp->self.destroy_work, cgroup_release_agent);
 }
 
 static void init_cgroup_root(struct cgroup_root *root,
@@ -4804,12 +4799,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
for_each_css(css, ssid, cgrp)
kill_css(css);
 
-   /* CSS_ONLINE is clear, remove from ->release_list for the last time */
-   raw_spin_lock(&release_list_lock);
-   if (!list_empty(&cgrp->release_list))
-   list_del_init(&cgrp->release_list);
-   raw_spin_unlock(&release_list_lock);
-
/*
 * Remove @cgrp directory along with the base files.  @cgrp has an
 * extra ref on its kn.
@@ -5274,21 +5263,14 @@ static void check_for_release(struct cgroup *cgrp)
if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
!css_has_online_children(&cgrp->self)) {
/*
-* Control Group is currently removeable. If it's not
-* already queued for a userspace notification, queue
-* it now
+* get a reference, so the cgroup can only be freed
+* after the release work is done.
 */
-   int need_schedule_work = 0;
+   if (!cgroup_tryget(cgrp))
+   return;
 
-   raw_spin_lock(&release_list_lock);
-   if (!cgroup_is_dead(cgrp) &&
-   list_empty(&cgrp->release_list)) {
-   list_add(&cgrp->release_list, &release_list);
-   need_schedule_work = 1;
-   }
-   raw_spin_unlock(&release_list_lock);
-   if (need_schedule_work)
-   schedule_work(&release_agent_work);
+   if (!queue_work(cgroup_destroy_wq, &cgrp->self.destroy_work))
+   cgroup_put(cgrp);
}
 }
 
@@ -5317,52 +5299,40 @@ static void check_for_release(struct cgroup *cgrp)
  */
 static void cgroup_release_agent(struct work_struct *work)
 {
-   BUG_ON(work != &release_agent_work);
+   struct cgroup_subsys_state *css =
+   container_of(work, struct cgroup_subsys_state, destroy_work);
+   struct cgroup *cgrp = css->cgroup;
+   char *pathbuf = NULL, *agentbuf = NULL, *path;
+   char *argv[3], *envp[3];
+
mutex_lock(&cgroup_mutex);
-   raw_spin_lock(&release_list_lock);
-   while (!list_empty(&release_list)) {
-   char *argv[3], *envp[3];
-   int i;
-   char *pathbuf = NULL, *agentbuf = NULL, *path;
-   struct cgroup *cgrp = list_entry(release_list.next,
-   struct cgroup,
-   release_list);
-   list_del_init(&cgrp

[PATCH 3/4] cgroup: remove bogus comments

2014-09-17 Thread Li Zefan
We never grab cgroup mutex in fork and exit paths no matter whether
notify_on_release is set or not.

Signed-off-by: Zefan Li 
---
 kernel/cgroup.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 26b8cb9..1abb554 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -967,14 +967,6 @@ static struct cgroup *task_cgroup_from_root(struct 
task_struct *task,
  * knows that the cgroup won't be removed, as cgroup_rmdir()
  * needs that mutex.
  *
- * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
- * (usually) take cgroup_mutex.  These are the two most performance
- * critical pieces of code here.  The exception occurs on cgroup_exit(),
- * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
- * is taken, and if the cgroup count is zero, a usermode call made
- * to the release agent with the name of the cgroup (path relative to
- * the root of cgroup file system) as the argument.
- *
  * A cgroup can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cgroups is empty.  Since all
  * tasks in the system use _some_ cgroup, and since there is always at
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] cgroup: remove some useless forward declarations

2014-09-17 Thread Li Zefan

Signed-off-by: Zefan Li 
---
 include/linux/cgroup.h | 1 -
 kernel/cgroup.c| 2 --
 2 files changed, 3 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b5223c5..f7898e0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -27,7 +27,6 @@
 
 struct cgroup_root;
 struct cgroup_subsys;
-struct inode;
 struct cgroup;
 
 extern int cgroup_init_early(void);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 940aced..0ce9d9e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
-static void cgroup_put(struct cgroup *cgrp);
 static int rebind_subsystems(struct cgroup_root *dst_root,
 unsigned int ss_mask);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
 static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
  bool is_add);
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /* IDR wrappers which synchronize using cgroup_idr_lock */
 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Kernel crash in cgroup_pidlist_destroy_work_fn()

2014-09-17 Thread Li Zefan
On 2014/9/17 13:29, Li Zefan wrote:
> On 2014/9/17 7:56, Cong Wang wrote:
>> Hi, Tejun
>>
>>
>> We saw some kernel null pointer dereference in
>> cgroup_pidlist_destroy_work_fn(), more precisely at
>> __mutex_lock_slowpath(), on 3.14. I can show you the full stack trace
>> on request.
>>
> 
> Yes, please.
> 
>> Looking at the code, it seems flush_workqueue() doesn't care about new
>> incoming works, it only processes currently pending ones, if this is
>> correct, then we could have the following race condition:
>>
>> cgroup_pidlist_destroy_all():
>> //...
>> mutex_lock(&cgrp->pidlist_mutex);
>> list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
>> mod_delayed_work(cgroup_pidlist_destroy_wq,
>> &l->destroy_dwork, 0);
>> mutex_unlock(&cgrp->pidlist_mutex);
>>
>> // <--- another process calls cgroup_pidlist_start() here
>> since mutex is released
>>
>> flush_workqueue(cgroup_pidlist_destroy_wq); // <--- another
>> process adds new pidlist and queue work in parallel
>> BUG_ON(!list_empty(&cgrp->pidlists)); // <--- This check is
>> passed, list_add() could happen after this
>>
> 
> Did you confirm this is what happened when the bug was triggered?
> 
> I don't think the race condition you described exists. In 3.14 kernel,
> cgroup_diput() won't be called if there is any thread running
> cgroup_pidlist_start(). This is guaranteed by vfs.
> 
> But newer kernels are different. Looks like the bug exists in those
> kernels.
> 

Newer kernels should be also fine.

If cgroup_pidlist_destroy_all() is called, it means kernfs has already
removed the tasks file, and even if you still have it opened, when
you try to read it, it will immediately return an errno.

fd = open(cgrp/tasks)
cgroup_rmdir(cgrp)
  cgroup_destroy_locked(c)
kernfs_remove()
  ...
css_free_work_fn()
  cgroup_pidlist_destroy_all()
   read(fd of cgrp/tasks)
 return -ENODEV

So cgroup_pidlist_destroy_all() won't race with cgroup_pidlist_start().

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Kernel crash in cgroup_pidlist_destroy_work_fn()

2014-09-16 Thread Li Zefan
On 2014/9/17 7:56, Cong Wang wrote:
> Hi, Tejun
> 
> 
> We saw some kernel null pointer dereference in
> cgroup_pidlist_destroy_work_fn(), more precisely at
> __mutex_lock_slowpath(), on 3.14. I can show you the full stack trace
> on request.
> 

Yes, please.

> Looking at the code, it seems flush_workqueue() doesn't care about new
> incoming works, it only processes currently pending ones, if this is
> correct, then we could have the following race condition:
> 
> cgroup_pidlist_destroy_all():
> //...
> mutex_lock(&cgrp->pidlist_mutex);
> list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
> mod_delayed_work(cgroup_pidlist_destroy_wq,
> &l->destroy_dwork, 0);
> mutex_unlock(&cgrp->pidlist_mutex);
> 
> // <--- another process calls cgroup_pidlist_start() here
> since mutex is released
> 
> flush_workqueue(cgroup_pidlist_destroy_wq); // <--- another
> process adds new pidlist and queue work in parallel
> BUG_ON(!list_empty(&cgrp->pidlists)); // <--- This check is
> passed, list_add() could happen after this
> 

Did you confirm this is what happened when the bug was triggered?

I don't think the race condition you described exists. In 3.14 kernel,
cgroup_diput() won't be called if there is any thread running
cgroup_pidlist_start(). This is guaranteed by vfs.

But newer kernels are different. Looks like the bug exists in those
kernels.

> 
> Therefore, the newly added pidlist will point to a freed cgroup, and
> when it is freed in the delayed work we will crash.
> 
> The attached patch (compile test ONLY) could be a possible fix, since
> it will check and hold a refcount on this cgroup in
> cgroup_pidlist_start(). But I could very easily miss something here
> since there are many cgroup changes after 3.14 and I don't follow
> cgroup development.
> 
> What do you think?
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: cgroups/netfilter : kernel NULL pointer BUG at 00000038

2014-09-14 Thread Li Zefan
I think this is the same bug as the one you reported recently, which
has been fixed in mainline.

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a4189487da1b4f8260c6006b9dc47c3c4107a5ae

On 2014/9/13 0:19, Toralf Förster wrote:
> Today I observed within a 32 bit KVM machine (stable Gentoo x86 Linux) the 
> following :
> 
> Sep 12 18:14:37 n22kvmclone kernel: [   37.964900] ip_tables: (C) 2000-2006 
> Netfilter Core Team
> Sep 12 18:14:38 n22kvmclone kernel: [   38.412110] nf_conntrack version 0.5.0 
> (16384 buckets, 65536 max)
> Sep 12 18:14:38 n22kvmclone kernel: [   39.032978] [ cut here 
> ]
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033009] WARNING: CPU: 0 PID: 1632 
> at kernel/cgroup.c:1034 cgroup_get+0x91/0xb0()
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033013] Modules linked in: 
> xt_NFLOG xt_limit ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_recent 
> xt_conntrack nf_conntrack iptable_filter ip_tables af_packet dm_crypt dm_mod 
> usbhid mousedev uhci_hcd ehci_pci microcode psmouse ehci_hcd evdev usbcore 
> atkbd usb_common virtio_console processor button
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033104] CPU: 0 PID: 1632 Comm: 
> runscript.sh Not tainted 3.17.0-rc4 #18
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033107] Hardware name: QEMU 
> Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033110]    
> f4b33e54 cc04b292  f4b33e84 cbc46d64 cc18f11c
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033119]   0660 
> cc19bcc5 040a cbcb5841 cbcb5841 f56de400 f4963688
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033128]  f4983cb8 f4b33e94 
> cbc46da2 0009  f4b33eb0 cbcb5841 cbe59246
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033137] Call Trace:
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033163]  [] 
> dump_stack+0x41/0x52
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033179]  [] 
> warn_slowpath_common+0x84/0xa0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033186]  [] ? 
> cgroup_get+0x91/0xb0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033191]  [] ? 
> cgroup_get+0x91/0xb0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033197]  [] 
> warn_slowpath_null+0x22/0x30
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033202]  [] 
> cgroup_get+0x91/0xb0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033223]  [] ? 
> kstrtoll+0x16/0x70
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033230]  [] 
> cgroup_kn_lock_live+0x2d/0x70
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033235]  [] 
> __cgroup_procs_write.isra.26+0x56/0x240
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033240]  [] ? 
> __cgroup_procs_write.isra.26+0x240/0x240
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033245]  [] 
> cgroup_tasks_write+0x17/0x20
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033250]  [] 
> cgroup_file_write+0x45/0x140
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033256]  [] ? 
> kill_css+0xd0/0xd0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033275]  [] 
> kernfs_fop_write+0xd1/0x160
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033281]  [] ? 
> kernfs_vma_page_mkwrite+0x90/0x90
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033291]  [] 
> vfs_write+0x9d/0x1e0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033297]  [] ? 
> kernfs_vma_page_mkwrite+0x90/0x90
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033306]  [] ? 
> __fdget+0x12/0x20
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033311]  [] 
> SyS_write+0x52/0xa0
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033321]  [] 
> sysenter_do_call+0x12/0x12
> Sep 12 18:14:38 n22kvmclone kernel: [   39.033325] ---[ end trace 
> f3513225d53cf0f3 ]---
> Sep 12 18:14:38 n22kvmclone kernel: [   39.036277] BUG: unable to handle 
> kernel NULL pointer dereference at 00000038
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] IP: [] 
> cgroup_put+0xc/0x90
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] *pde = 
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] Oops:  [#1] SMP
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] Modules linked in: 
> xt_NFLOG xt_limit ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_recent 
> xt_conntrack nf_conntrack iptable_filter ip_tables af_packet dm_crypt dm_mod 
> usbhid mousedev uhci_hcd ehci_pci microcode psmouse ehci_hcd evdev usbcore 
> atkbd usb_common virtio_console processor button
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] CPU: 0 PID: 1632 Comm: 
> runscript.sh Tainted: GW  3.17.0-rc4 #18
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] Hardware name: QEMU 
> Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] task: f6216390 ti: 
> f4b32000 task.ti: f4b32000
> Sep 12 18:14:38 n22kvmclone kernel: [   39.037026] EIP: 0060:[] 
> EFLAGS: 00010282 CPU: 0
> Sep 12 18:14:38 n22kvmclone ker

Re: [kernel.org PATCH] Li Zefan is now the 3.4 stable maintainer

2014-09-09 Thread Li Zefan
On 2014/9/5 21:58, Guenter Roeck wrote:
> On 09/05/2014 12:55 AM, Li Zefan wrote:
>>>>> Li,
>>>>>
>>>>> it would be great if you can send me information about your -stable queue,
>>>>> ie how you maintain it and where it is located. This will enable me to
>>>>> continue testing the stable queue for the 3.4 kernel.
>>>>>
>>>>
>>>> Thanks for testing LTS kernels!
>>>>
>>>> This is my 3.4.y git tree:
>>>>
>>>> https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y.git/
>>>>
>>>> And this is the patch queue:
>>>>
>>>> https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y-queue.git/
>>>>
>>>> I use quilt. When I've added some patches to 3.4.y, I'll update this
>>>> queue. The patches and series file are under /patches. Currently there's
>>>> already a patch in the queue.
>>>>
>>>> When I release a new version, I'll clean up the queue by removing all
>>>> the files under /patches.
>>>>
>>>> Hope this is all the information you need. Please tell me if you need
>>>> me to slightly adjust my workflow so it's easier for you.
>>>
>>> It almost works.
>>>
>>> Problem is that the pending patch got converted to use <CR><LF> instead of
>>> just <LF> as common in Linux. When I try to apply it with "git quiltimport",
>>> it bails out with "trailing whitespace" errors. "git am" with the individual
>>> patch works fine for some reason, though.
>>>
>>> I can try to find a workaround, but it would be better to have the file in
>>> linux file format to start with. Would that be possible ?
>>>
>>
>> Yeah, I've fixed it. I'll run dos2unix for every patch file.
>>
> 
> Yes, it now works. Wonder how it comes that the patch in Greg's patch tree
> didn't have the problem. Any idea ?
> 

I saved the patch from my email client, and turned out the lines ended
with CRLF.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [kernel.org PATCH] Li Zefan is now the 3.4 stable maintainer

2014-09-05 Thread Li Zefan
>>> Li,
>>>
>>> it would be great if you can send me information about your -stable queue,
>>> ie how you maintain it and where it is located. This will enable me to
>>> continue testing the stable queue for the 3.4 kernel.
>>>
>>
>> Thanks for testing LTS kernels!
>>
>> This is my 3.4.y git tree:
>>
>> https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y.git/
>>
>> And this is the patch queue:
>>
>> https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y-queue.git/
>>
>> I use quilt. When I've added some patches to 3.4.y, I'll update this
>> queue. The patches and series file are under /patches. Currently there's
>> already a patch in the queue.
>>
>> When I release a new version, I'll clean up the queue by removing all
>> the files under /patches.
>>
>> Hope this is all the information you need. Please tell me if you need
>> me to slightly adjust my workflow so it's easier for you.
> 
> It almost works.
> 
> Problem is that the pending patch got converted to use <CR><LF> instead of
> just <LF> as common in Linux. When I try to apply it with "git quiltimport",
> it bails out with "trailing whitespace" errors. "git am" with the individual
> patch works fine for some reason, though.
> 
> I can try to find a workaround, but it would be better to have the file in
> linux file format to start with. Would that be possible ?
> 

Yeah, I've fixed it. I'll run dos2unix for every patch file.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [kernel.org PATCH] Li Zefan is now the 3.4 stable maintainer

2014-09-04 Thread Li Zefan
Hi Guenter,

Sorry for my late reply.

On 2014/8/27 12:59, Guenter Roeck wrote:
> On Tue, Aug 26, 2014 at 04:08:58PM -0700, Greg KH wrote:
>> Li has agreed to continue to support the 3.4 stable kernel tree until
>> September 2016.  Update the releases.html page on kernel.org to reflect
>> this.
>>
> Li,
> 
> it would be great if you can send me information about your -stable queue,
> ie how you maintain it and where it is located. This will enable me to
> continue testing the stable queue for the 3.4 kernel.
> 

Thanks for testing LTS kernels!

This is my 3.4.y git tree:

https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y.git/

And this is the patch queue:

https://git.kernel.org/cgit/linux/kernel/git/lizf/linux-3.4.y-queue.git/

I use quilt. When I've added some patches to 3.4.y, I'll update this
queue. The patches and series file are under /patches. Currently there's
already a patch in the queue.

When I release a new version, I'll clean up the queue by removing all
the files under /patches.

Hope this is all the information you need. Please tell me if you need
me to slightly adjust my workflow so it's easier for you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/2] cgroup: check cgroup liveliness before unbreaking kernfs

2014-09-03 Thread Li Zefan
When cgroup_kn_lock_live() is called through some kernfs operation and
another thread is calling cgroup_rmdir(), we'll trigger the warning in
cgroup_get().

------------[ cut here ]------------
WARNING: CPU: 1 PID: 1228 at kernel/cgroup.c:1034 cgroup_get+0x89/0xa0()
...
Call Trace:
 [] dump_stack+0x41/0x52
 [] warn_slowpath_common+0x7f/0xa0
 [] warn_slowpath_null+0x1d/0x20
 [] cgroup_get+0x89/0xa0
 [] cgroup_kn_lock_live+0x28/0x70
 [] __cgroup_procs_write.isra.26+0x51/0x230
 [] cgroup_tasks_write+0x12/0x20
 [] cgroup_file_write+0x40/0x130
 [] kernfs_fop_write+0xd1/0x160
 [] vfs_write+0x98/0x1e0
 [] SyS_write+0x4d/0xa0
 [] sysenter_do_call+0x12/0x12
---[ end trace 6f2e0c38c2108a74 ]---

Fix this by calling css_tryget() instead of cgroup_get().

v2:
- move cgroup_tryget() right below cgroup_get() definition. (Tejun)

Cc: <stable@vger.kernel.org> # 3.15+
Reported-by: Toralf Förster 
Signed-off-by: Zefan Li 
---
 kernel/cgroup.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 709a6a0..51dd46e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1031,6 +1031,11 @@ static void cgroup_get(struct cgroup *cgrp)
css_get(&cgrp->self);
 }
 
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+   return css_tryget(&cgrp->self);
+}
+
 static void cgroup_put(struct cgroup *cgrp)
 {
css_put(&cgrp->self);
@@ -1091,7 +1096,8 @@ static struct cgroup *cgroup_kn_lock_live(struct 
kernfs_node *kn)
 * protection against removal.  Ensure @cgrp stays accessible and
 * break the active_ref protection.
 */
-   cgroup_get(cgrp);
+   if (!cgroup_tryget(cgrp))
+   return NULL;
kernfs_break_active_protection(kn);
 
mutex_lock(&cgroup_mutex);
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/2] cgroup: delay the clearing of cgrp->kn->priv

2014-09-03 Thread Li Zefan
Run these two scripts concurrently:

for ((; ;))
{
mkdir /cgroup/sub
rmdir /cgroup/sub
}

for ((; ;))
{
echo $$ > /cgroup/sub/cgroup.procs
echo $$ > /cgroup/cgroup.procs
}

A kernel bug will be triggered:

BUG: unable to handle kernel NULL pointer dereference at 00000038
IP: [] cgroup_put+0x9/0x80
...
Call Trace:
 [] cgroup_kn_unlock+0x39/0x50
 [] cgroup_kn_lock_live+0x61/0x70
 [] __cgroup_procs_write.isra.26+0x51/0x230
 [] cgroup_tasks_write+0x12/0x20
 [] cgroup_file_write+0x40/0x130
 [] kernfs_fop_write+0xd1/0x160
 [] vfs_write+0x98/0x1e0
 [] SyS_write+0x4d/0xa0
 [] sysenter_do_call+0x12/0x12

We clear cgrp->kn->priv in the end of cgroup_rmdir(), but another
concurrent thread can access kn->priv after the clearing.

We should move the clearing to css_release_work_fn(). At that time
no one is holding reference to the cgroup and no one can gain a new
reference to access it.

v2:
- move RCU_INIT_POINTER() into the else block. (Tejun)
- remove the cgroup_parent() check. (Tejun)
- update the comment in css_tryget_online_from_dir().

Cc: <stable@vger.kernel.org> # 3.15+
Reported-by: Toralf Förster 
Signed-off-by: Zefan Li 
---
 kernel/cgroup.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c56924..205f793 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4181,6 +4181,15 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
+
+   /*
+* There are two control paths which try to determine
+* cgroup from dentry without going through kernfs -
+* cgroupstats_build() and css_tryget_online_from_dir().
+* Those are supported by RCU protecting clearing of
+* cgrp->kn->priv backpointer.
+*/
+   RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
}
 
mutex_unlock(&cgroup_mutex);
@@ -4601,16 +4610,6 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
cgroup_kn_unlock(kn);
 
-   /*
-* There are two control paths which try to determine cgroup from
-* dentry without going through kernfs - cgroupstats_build() and
-* css_tryget_online_from_dir().  Those are supported by RCU
-* protecting clearing of cgrp->kn->priv backpointer, which should
-* happen after all files under it have been removed.
-*/
-   if (!ret)
-   RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
cgroup_put(cgrp);
return ret;
 }
@@ -5175,7 +5174,7 @@ struct cgroup_subsys_state 
*css_tryget_online_from_dir(struct dentry *dentry,
/*
 * This path doesn't originate from kernfs and @kn could already
 * have been or be removed at any point.  @kn->priv is RCU
-* protected for this access.  See cgroup_rmdir() for details.
+* protected for this access.  See css_release_work_fn() for details.
 */
cgrp = rcu_dereference(kn->priv);
if (cgrp)
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] cgroup: Delay the clearing of cgrp->kn->priv

2014-09-03 Thread Li Zefan
On 2014/9/2 23:33, Tejun Heo wrote:
> Hello, Li.
> 
> On Tue, Sep 02, 2014 at 06:56:58PM +0800, Li Zefan wrote:
>> for ((; ;))
>> {
>> echo $$ > /cgroup/sub/cgroup.procs
>> ech $$ > /cgce 6f2e0c38c2108a74 ]---
>   
>   copy & paste error?
> ...

oops

>> Reported-by: Toralf Förster 
>> Signed-off-by: Li Zefan 
>> ---
>>
>> Toralf, Thanks for reporting the bug. I'm not able to reply to your email,
>> because I was kicked out of the cgroup mailing list so didn't receive
>> emails from mailing list for a week.
>>
>> ---
>>  kernel/cgroup.c | 19 +--
>>  1 file changed, 9 insertions(+), 10 deletions(-)
>>
>> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> index 1c56924..e03fc62 100644
>> --- a/kernel/cgroup.c
>> +++ b/kernel/cgroup.c
>> @@ -4185,6 +4185,15 @@ static void css_release_work_fn(struct work_struct 
>> *work)
>>  
>>  mutex_unlock(&cgroup_mutex);
>>  
>> +/*
>> + * There are two control paths which try to determine cgroup from
>> + * dentry without going through kernfs - cgroupstats_build() and
>> + * css_tryget_online_from_dir().  Those are supported by RCU
>> + * protecting clearing of cgrp->kn->priv backpointer.
>> + */
>> +if (!ss && cgroup_parent(cgrp))
>> +RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
> 
> Can we move the above into the preceding else block?  I don't think
> holding cgroup_mutex or not makes any difference here. 

> Also, why do
> we need the cgroup_parent() check?  Do we deref root's kn->priv in the
> destruction path?  If so, can you please note that in the comment?
> 

I think the check is not necessary. I was trying to make smaller difference
than the original code, and RCU_INIT_POINTER() is in cgroup_rmdir() which
won't be called on root cgroup.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] cgroup: check cgroup liveliness before unbreaking kernfs protection

2014-09-02 Thread Li Zefan
When cgroup_kn_lock_live() is called through some kernfs operation and
another thread is calling cgroup_rmdir(), we may trigger the warning in
cgroup_get().

------------[ cut here ]------------
WARNING: CPU: 1 PID: 1228 at kernel/cgroup.c:1034 cgroup_get+0x89/0xa0()
...
Call Trace:
 [] dump_stack+0x41/0x52
 [] warn_slowpath_common+0x7f/0xa0
 [] warn_slowpath_null+0x1d/0x20
 [] cgroup_get+0x89/0xa0
 [] cgroup_kn_lock_live+0x28/0x70
 [] __cgroup_procs_write.isra.26+0x51/0x230
 [] cgroup_tasks_write+0x12/0x20
 [] cgroup_file_write+0x40/0x130
 [] kernfs_fop_write+0xd1/0x160
 [] vfs_write+0x98/0x1e0
 [] SyS_write+0x4d/0xa0
 [] sysenter_do_call+0x12/0x12
---[ end trace 6f2e0c38c2108a74 ]---

Fix this by calling css_tryget() instead of cgroup_get().

Reported-by: Toralf Förster 
Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e03fc62..c8d07e5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1025,6 +1025,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
return mode;
 }
 
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+   return css_tryget(&cgrp->self);
+}
+
 static void cgroup_get(struct cgroup *cgrp)
 {
WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -1091,7 +1096,8 @@ static struct cgroup *cgroup_kn_lock_live(struct 
kernfs_node *kn)
 * protection against removal.  Ensure @cgrp stays accessible and
 * break the active_ref protection.
 */
-   cgroup_get(cgrp);
+   if (!cgroup_tryget(cgrp))
+   return NULL;
kernfs_break_active_protection(kn);
 
mutex_lock(&cgroup_mutex);
-- 
1.8.0.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] cgroup: Delay the clearing of cgrp->kn->priv

2014-09-02 Thread Li Zefan
Run these two scripts concurrently:

for ((; ;))
{
mkdir /cgroup/sub
rmdir /cgroup/sub
}

for ((; ;))
{
echo $$ > /cgroup/sub/cgroup.procs
echo $$ > /cgroup/cgroup.procs
}

A kernel bug will be triggered:

BUG: unable to handle kernel NULL pointer dereference at 0038
IP: [] cgroup_put+0x9/0x80
...
Call Trace:
 [] cgroup_kn_unlock+0x39/0x50
 [] cgroup_kn_lock_live+0x61/0x70
 [] __cgroup_procs_write.isra.26+0x51/0x230
 [] cgroup_tasks_write+0x12/0x20
 [] cgroup_file_write+0x40/0x130
 [] kernfs_fop_write+0xd1/0x160
 [] vfs_write+0x98/0x1e0
 [] SyS_write+0x4d/0xa0
 [] sysenter_do_call+0x12/0x12

We clear cgrp->kn->priv at the end of cgroup_rmdir(), but another
concurrent thread can access kn->priv after the clearing.

We should move the clearing to css_release_work_fn(). At that time
no one is holding reference to the cgroup and no one can gain a new
reference to access it.

Reported-by: Toralf Förster 
Signed-off-by: Li Zefan 
---

Toralf, Thanks for reporting the bug. I'm not able to reply to your email,
because I was kicked out of the cgroup mailing list so didn't receive
emails from mailing list for a week.

---
 kernel/cgroup.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c56924..e03fc62 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4185,6 +4185,15 @@ static void css_release_work_fn(struct work_struct *work)
 
mutex_unlock(&cgroup_mutex);
 
+   /*
+* There are two control paths which try to determine cgroup from
+* dentry without going through kernfs - cgroupstats_build() and
+* css_tryget_online_from_dir().  Those are supported by RCU
+* protecting clearing of cgrp->kn->priv backpointer.
+*/
+   if (!ss && cgroup_parent(cgrp))
+   RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+
call_rcu(&css->rcu_head, css_free_rcu_fn);
 }
 
@@ -4601,16 +4610,6 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
cgroup_kn_unlock(kn);
 
-   /*
-* There are two control paths which try to determine cgroup from
-* dentry without going through kernfs - cgroupstats_build() and
-* css_tryget_online_from_dir().  Those are supported by RCU
-* protecting clearing of cgrp->kn->priv backpointer, which should
-* happen after all files under it have been removed.
-*/
-   if (!ret)
-   RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
cgroup_put(cgrp);
return ret;
 }
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] cgroup: Introduce cgroup_detach_task().

2014-08-25 Thread Li Zefan
On 2014/8/25 23:00, Dongsheng Yang wrote:
> On Mon, Aug 25, 2014 at 10:47 PM, Tejun Heo  wrote:
>> On Mon, Aug 25, 2014 at 10:46:03PM +0800, Dongsheng Yang wrote:
>>> My point here is that attaching and detaching are a pair of operations.
>>
>> There is no detaching from a cgroup.  A task is always attached to a
>> cgroup whether that's a root or non-root cgroup.
> 
> Okey, I should not think it as attaching and detaching. Just treat them as
> a move between root and non-root cgroup.
> 
> It sounds reasonable to me now.
> 

I from time to time have to explain this to other people.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH -mm] slab: fix cpuset check in fallback_alloc

2014-08-14 Thread Li Zefan
On 2014/8/12 5:05, David Rientjes wrote:
> On Mon, 11 Aug 2014, Vladimir Davydov wrote:
> 
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1963,7 +1963,7 @@ zonelist_scan:
>>>  
>>> /*
>>>  * Scan zonelist, looking for a zone with enough free.
>>> -* See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
>>> +* See __cpuset_node_allowed() comment in kernel/cpuset.c.
>>>  */
>>> for_each_zone_zonelist_nodemask(zone, z, zonelist,
>>> high_zoneidx, nodemask) {
>>> @@ -1974,7 +1974,7 @@ zonelist_scan:
>>> continue;
>>> if (cpusets_enabled() &&
>>> (alloc_flags & ALLOC_CPUSET) &&
>>> -   !cpuset_zone_allowed_softwall(zone, gfp_mask))
>>> +   !cpuset_zone_allowed(zone, gfp_mask))
>>> continue;
>>
>> So, this is get_page_from_freelist. It's called from
>> __alloc_pages_nodemask with alloc_flags always having ALLOC_CPUSET bit
>> set and from __alloc_pages_slowpath with alloc_flags having ALLOC_CPUSET
>> bit set only for __GFP_WAIT allocations. That said, w/o your patch we
>> try to respect cpusets for all allocations, including atomic, and only
>> ignore cpusets if tight on memory (freelist's empty) for !__GFP_WAIT
>> allocations, while with your patch we always ignore cpusets for
>> !__GFP_WAIT allocations. Not sure if it really matters though, because
>> usually one uses cpuset.mems in conjunction with cpuset.cpus and it
>> won't make any difference then. It also doesn't conflict with any cpuset
>> documentation.
>>
> 
> Yeah, that's why I'm asking Li, the cpuset maintainer, if we can do this.  

I'm not quite sure. That code has been there before I got involved in cpuset.

> The only thing that we get by falling back to the page allocator slowpath 
> is that kswapd gets woken up before the allocation is attempted without 
> ALLOC_CPUSET.  It seems pointless to wakeup kswapd when the allocation can 
> succeed on any node.  Even with the patch, if the allocation fails because 
> all nodes are below their min watermark, then we still fallback to the 
> slowpath and wake up kswapd but there's nothing much else we can do 
> because it's !__GFP_WAIT.
> .

But I tend to agree with you. But if we want to do this, we should split this
change from the cleanup.

Regarding the cleanup, I found there used to be a single cpuset_node_allowed(),
and your cleanup is exactly a revert of that ancient commit:

commit 02a0e53d8227aff5e62e0433f82c12c1c2805fd6
Author: Paul Jackson 
Date:   Wed Dec 13 00:34:25 2006 -0800

[PATCH] cpuset: rework cpuset_zone_allowed api

Seems the major intention was to avoid accidental sleep-in-atomic bugs, because
callback_mutex might be held.

I don't see there's any reason callback_mutex can't be a spinlock. I thought
about this when Gu Zhen fixed the bug that callback_mutex is nested inside
rcu_read_lock().

--
 kernel/cpuset.c | 81 ++---
 1 file changed, 49 insertions(+), 32 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index baa155c..9d9e239 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
@@ -848,6 +848,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
+   unsigned long flags;
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
@@ -875,9 +876,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
continue;
rcu_read_unlock();
 
-   mutex_lock(&callback_mutex);
+   spin_lock_irqsave(&callback_lock, flags);
cpumask_copy(cp->effective_cpus, new_cpus);
-   mutex_unlock(&callback_mutex);
+   spin_unlock_irqrestore(&callback_lock, flags);
 
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -910,6 +911,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  const char *buf)
 {
+   unsigned long flags;
int retval;
 
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -942,9 +944,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset 
*trialcs,
if (retval < 0)
return retval;
 
-   mutex_lock(&callback_mutex);
+   spin_lock_irqsave(&callback_lock, flags);
   

[PATCH] cpuset: fix the WARN_ON() in update_nodemasks_hier()

2014-07-30 Thread Li Zefan
The WARN_ON() is used to check if we break the legal hierarchy, on
which the effective mems should be equal to configured mems.

Reported-by: Mike Qiu 
Tested-by: Mike Qiu 
Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 53a9bbf..baa155c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1136,7 +1136,7 @@ static void update_nodemasks_hier(struct cpuset *cs, 
nodemask_t *new_mems)
mutex_unlock(&callback_mutex);
 
WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
-   nodes_equal(cp->mems_allowed, cp->effective_mems));
+   !nodes_equal(cp->mems_allowed, cp->effective_mems));
 
update_tasks_nodemask(cp);
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: WARNING: at kernel/cpuset.c:1139

2014-07-29 Thread Li Zefan
On 2014/7/29 3:20, Tejun Heo wrote:
> On Thu, Jul 24, 2014 at 08:27:40AM +0800, Li Zefan wrote:
>> On 2014/7/23 23:12, Tejun Heo wrote:
>>> On Wed, Jul 23, 2014 at 10:50:29AM +0800, Mike Qiu wrote:
>>>> commit 734d45130cb ("cpuset: update cs->effective_{cpus, mems} when config
>>>> changes") introduce the below warning in my server.
>>>>
>>>> [   35.652137] [ cut here ]
>>>> [   35.652141] WARNING: at kernel/cpuset.c:1139
>>>
>>> Hah, can you reproduce it?  If so, can you detail how?
>>>
>>
>> It's a typo.
>>
>> WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
>>  nodes_equal(cp->mems_allowed, cp->effective_mems));
>>
>> should be
>>
>> WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
>>  !nodes_equal(cp->mems_allowed, cp->effective_mems));
> 
> Care to post a patch?
> 

Sorry for the delay. I had been out of the office for the last two weeks.
I'll do this tomorrow.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: WARNING: at kernel/cpuset.c:1139

2014-07-23 Thread Li Zefan
On 2014/7/23 23:12, Tejun Heo wrote:
> On Wed, Jul 23, 2014 at 10:50:29AM +0800, Mike Qiu wrote:
>> commit 734d45130cb ("cpuset: update cs->effective_{cpus, mems} when config
>> changes") introduce the below warning in my server.
>>
>> [   35.652137] [ cut here ]
>> [   35.652141] WARNING: at kernel/cpuset.c:1139
> 
> Hah, can you reproduce it?  If so, can you detail how?
> 

It's a typo.

WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
nodes_equal(cp->mems_allowed, cp->effective_mems));

should be

WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET v2 cgroup/for-3.17] cgroup: distinguish the default and legacy hierarchies when handling cftypes

2014-07-15 Thread Li Zefan
于 2014/7/14 23:44, Tejun Heo 写道:
> Hello,
> 
> This is v2 of dfl_files patchset.  Changes from the last version [1]
> are
> 
> * Rebased on top of cgroup/for-3.17.
> 
> * 0004 updated so that CFTYPE_ONLY_ON_DFL and CFTYPE_INSANE are
>   cleared when cfts are removed as suggested by Li.
> 
> Until now, cftype arrays carried files for both the default and legacy
> hierarchies and the files which needed to be used on only one of them
> were flagged with either CFTYPE_ONLY_ON_DFL or CFTYPE_INSANE.  This
> gets confusing very quickly and we may end up exposing interface files
> to the default hierarchy without thinking it through.
> 
> This patchset makes cgroup core provide separate sets of interfaces
> for cftype handling so that the cftypes for the default and legacy
> hierarchies are clearly distinguished.  This makes all the existing
> subsystem interface files legacy-only by default and all subsystems
> will have no interface file created when enabled on the default
> hierarchy.  Each subsystem should explicitly review and compose the
> interface for the default hierarchy.
> 
> This patchset contains the following six patches.
> 
>  0001-cgroup-split-cgroup_base_files-into-cgroup_-dfl-lega.patch
>  0002-cgroup-rename-cgroup_subsys-base_cftypes-to-legacy_c.patch
>  0003-cgroup-replace-cgroup_add_cftypes-with-cgroup_add_le.patch
>  0004-cgroup-distinguish-the-default-and-legacy-hierarchie.patch
>  0005-cgroup-make-CFTYPE_ONLY_ON_DFL-and-CFTYPE_NO_-intern.patch
>  0006-cgroup-initialize-cgrp_dfl_root_inhibit_ss_mask-from.patch
> 
> This patchset is on top of afd1a8b3e0bc ("cpuset: export effective
> masks to userspace")
> 
> and available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-dfl_files-v2
> 
> diffstat follows.  Thanks.
> 
>  Documentation/cgroups/unified-hierarchy.txt |   18 ++-
>  block/blk-cgroup.c  |5
>  include/linux/cgroup.h  |   17 ++
>  kernel/cgroup.c |  160 
> +---
>  kernel/cgroup_freezer.c |2
>  kernel/cpuset.c |2
>  kernel/sched/core.c |2
>  kernel/sched/cpuacct.c  |2
>  mm/hugetlb_cgroup.c |5
>  mm/memcontrol.c |6 -
>  net/core/netclassid_cgroup.c|2
>  net/core/netprio_cgroup.c   |2
>  net/ipv4/tcp_memcontrol.c   |2
>  security/device_cgroup.c|2
>  14 files changed, 160 insertions(+), 67 deletions(-)
> 

Acked-by: Li Zefan 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 4/5] cgroup: distinguish the default and legacy hierarchies when handling cftypes

2014-07-13 Thread Li Zefan
> @@ -3085,8 +3091,37 @@ static int cgroup_add_cftypes(struct cgroup_subsys 
> *ss, struct cftype *cfts)
>   return ret;
>  }
>  
> +/**
> + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
> + * @ss: target cgroup subsystem
> + * @cfts: zero-length name terminated array of cftypes
> + *
> + * Similar to cgroup_add_cftypes() but the added files are only used for
> + * the default hierarchy.
> + */
> +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
> +{
> + struct cftype *cft;
> +
> + for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
> + cft->flags |= CFTYPE_ONLY_ON_DFL;

I think we should remove this flag in cgroup_rm_cftypes_locked(). Otherwise
if we call cgroup_add_dfl_cftypes() and then cgroup_rm_cftypes() and then
cgroup_add_legacy_cftypes() for the same @cfts, both CFTYPE_ONLY_ON_DFL and
CFTYPE_INSANE are set.

> + return cgroup_add_cftypes(ss, cfts);
> +}
> +
> +/**
> + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
> + * @ss: target cgroup subsystem
> + * @cfts: zero-length name terminated array of cftypes
> + *
> + * Similar to cgroup_add_cftypes() but the added files are only used for
> + * the legacy hierarchies.
> + */
>  int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
>  {
> + struct cftype *cft;
> +
> + for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
> + cft->flags |= CFTYPE_INSANE;
>   return cgroup_add_cftypes(ss, cfts);
>  }

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 09/12] cpuset: refactor cpuset_hotplug_update_tasks()

2014-07-09 Thread Li Zefan
We mix the handling for both default hierarchy and legacy hierarchy in
the same function, and it's quite messy, so split into two functions.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 121 ++--
 1 file changed, 66 insertions(+), 55 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4b409d2..41822e2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2080,6 +2080,65 @@ static void remove_tasks_in_empty_cpuset(struct cpuset 
*cs)
}
 }
 
+static void hotplug_update_tasks_legacy(struct cpuset *cs,
+   struct cpumask *off_cpus,
+   nodemask_t *off_mems)
+{
+   bool is_empty;
+
+   mutex_lock(&callback_mutex);
+   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, off_cpus);
+   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
+   nodes_andnot(cs->mems_allowed, cs->mems_allowed, *off_mems);
+   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   mutex_unlock(&callback_mutex);
+
+   /*
+* Don't call update_tasks_cpumask() if the cpuset becomes empty,
+* as the tasks will be migratecd to an ancestor.
+*/
+   if (!cpumask_empty(off_cpus) && !cpumask_empty(cs->cpus_allowed))
+   update_tasks_cpumask(cs);
+   if (!nodes_empty(*off_mems) && !nodes_empty(cs->mems_allowed))
+   update_tasks_nodemask(cs);
+
+   is_empty = cpumask_empty(cs->cpus_allowed) ||
+  nodes_empty(cs->mems_allowed);
+
+   mutex_unlock(&cpuset_mutex);
+
+   /*
+* Move tasks to the nearest ancestor with execution resources,
+* This is full cgroup operation which will also call back into
+* cpuset. Should be done outside any lock.
+*/
+   if (is_empty)
+   remove_tasks_in_empty_cpuset(cs);
+
+   mutex_lock(&cpuset_mutex);
+}
+
+static void hotplug_update_tasks(struct cpuset *cs,
+struct cpumask *off_cpus,
+nodemask_t *off_mems)
+{
+   mutex_lock(&callback_mutex);
+   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
+   if (cpumask_empty(cs->effective_cpus))
+   cpumask_copy(cs->effective_cpus,
+parent_cs(cs)->effective_cpus);
+
+   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   if (nodes_empty(cs->effective_mems))
+   cs->effective_mems = parent_cs(cs)->effective_mems;
+   mutex_unlock(&callback_mutex);
+
+   if (!cpumask_empty(off_cpus))
+   update_tasks_cpumask(cs);
+   if (!nodes_empty(*off_mems))
+   update_tasks_nodemask(cs);
+}
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
@@ -2092,9 +2151,6 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
static cpumask_t off_cpus;
static nodemask_t off_mems;
-   bool is_empty;
-   bool on_dfl = cgroup_on_dfl(cs->css.cgroup);
-
 retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2109,61 +2165,16 @@ retry:
goto retry;
}
 
-   cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
-   nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
-   mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-
-   /* Inherit the effective mask of the parent, if it becomes empty. */
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, &off_cpus);
-   if (on_dfl && cpumask_empty(cs->effective_cpus))
-   cpumask_copy(cs->effective_cpus, parent_cs(cs)->effective_cpus);
-   mutex_unlock(&callback_mutex);
-
-   /*
-* If on_dfl, we need to update tasks' cpumask for empty cpuset to
-* take on ancestor's cpumask. Otherwise, don't call
-* update_tasks_cpumask() if the cpuset becomes empty, as the tasks
-* in it will be migrated to an ancestor.
-*/
-   if ((on_dfl && cpumask_empty(cs->cpus_allowed)) ||
-   (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
-   update_tasks_cpumask(cs);
-
-   mutex_lock(&callback_mutex);
-   nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+   cpumask_andnot(&off_cpus, cs->effective_cpus,
+  top_cpuset.effective_cpus);
+   nodes_andnot(off_mems, cs->effective_mems, top_cpuset.effective_mems);
 
-   /* Inherit the effective mask of the parent, if it becomes empty */
-   nodes_andnot(cs->effective_mem

[PATCH v3 10/12] cpuset: enable onlined cpu/node in effective masks

2014-07-09 Thread Li Zefan
Firstly offline cpu1:

  # echo 0-1 > cpuset.cpus
  # echo 0 > /sys/devices/system/cpu/cpu1/online
  # cat cpuset.cpus
  0-1
  # cat cpuset.effective_cpus
  0

Then online it:

  # echo 1 > /sys/devices/system/cpu/cpu1/online
  # cat cpuset.cpus
  0-1
  # cat cpuset.effective_cpus
  0-1

And cpuset will bring it back to the effective mask.

The implementation is quite straightforward. Instead of calculating the
offlined cpus/mems and doing updates, we just set the new effective_mask
to online_mask & configured_mask.

This is a behavior change for default hierarchy, so legacy hierarchy
won't be affected.

v2:
- make refactoring of cpuset_hotplug_update_tasks() a separate patch,
  suggested by Tejun.
- make hotplug_update_tasks_insane() use @new_cpus and @new_mems as
  hotplug_update_tasks_sane() does.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 65 -
 1 file changed, 36 insertions(+), 29 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 41822e2..c47cb94 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2080,26 +2080,27 @@ static void remove_tasks_in_empty_cpuset(struct cpuset 
*cs)
}
 }
 
-static void hotplug_update_tasks_legacy(struct cpuset *cs,
-   struct cpumask *off_cpus,
-   nodemask_t *off_mems)
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+   struct cpumask *new_cpus, nodemask_t *new_mems,
+   bool cpus_updated, bool mems_updated)
 {
bool is_empty;
 
mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, off_cpus);
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
-   nodes_andnot(cs->mems_allowed, cs->mems_allowed, *off_mems);
-   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
+   cpumask_copy(cs->cpus_allowed, new_cpus);
+   cpumask_copy(cs->effective_cpus, new_cpus);
+   cs->mems_allowed = *new_mems;
+   cs->effective_mems = *new_mems;
mutex_unlock(&callback_mutex);
 
/*
 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
 * as the tasks will be migratecd to an ancestor.
 */
-   if (!cpumask_empty(off_cpus) && !cpumask_empty(cs->cpus_allowed))
+   if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
-   if (!nodes_empty(*off_mems) && !nodes_empty(cs->mems_allowed))
+   if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
 
is_empty = cpumask_empty(cs->cpus_allowed) ||
@@ -2118,24 +2119,24 @@ static void hotplug_update_tasks_legacy(struct cpuset 
*cs,
mutex_lock(&cpuset_mutex);
 }
 
-static void hotplug_update_tasks(struct cpuset *cs,
-struct cpumask *off_cpus,
-nodemask_t *off_mems)
+static void
+hotplug_update_tasks(struct cpuset *cs,
+struct cpumask *new_cpus, nodemask_t *new_mems,
+bool cpus_updated, bool mems_updated)
 {
+   if (cpumask_empty(new_cpus))
+   cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+   if (nodes_empty(*new_mems))
+   *new_mems = parent_cs(cs)->effective_mems;
+
mutex_lock(&callback_mutex);
-   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, off_cpus);
-   if (cpumask_empty(cs->effective_cpus))
-   cpumask_copy(cs->effective_cpus,
-parent_cs(cs)->effective_cpus);
-
-   nodes_andnot(cs->effective_mems, cs->effective_mems, *off_mems);
-   if (nodes_empty(cs->effective_mems))
-   cs->effective_mems = parent_cs(cs)->effective_mems;
+   cpumask_copy(cs->effective_cpus, new_cpus);
+   cs->effective_mems = *new_mems;
mutex_unlock(&callback_mutex);
 
-   if (!cpumask_empty(off_cpus))
+   if (cpus_updated)
update_tasks_cpumask(cs);
-   if (!nodes_empty(*off_mems))
+   if (mems_updated)
update_tasks_nodemask(cs);
 }
 
@@ -2149,8 +2150,10 @@ static void hotplug_update_tasks(struct cpuset *cs,
  */
 static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
-   static cpumask_t off_cpus;
-   static nodemask_t off_mems;
+   static cpumask_t new_cpus;
+   static nodemask_t new_mems;
+   bool cpus_updated;
+   bool mems_updated;
 retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2165,14 +2168,18 @@ retry:
goto retry;
}
 
-   cpumask_andnot(&off_cpus, cs->effective_cpus,
-  top_cpuset.effective_cpus);
-   node

[PATCH v3 12/12] cpuset: export effective masks to userspace

2014-07-09 Thread Li Zefan
cpuset.cpus and cpuset.mems are the configured masks, and we need
to export effective masks to userspace, so users know the real
cpus_allowed and mems_allowed that apply to the tasks in a cpuset.

v2:
- export those masks unconditionally, suggested by Tejun.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 65878a7..53a9bbf 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1535,6 +1535,8 @@ typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+   FILE_EFFECTIVE_CPULIST,
+   FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -1701,6 +1703,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, 
void *v)
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
+   case FILE_EFFECTIVE_CPULIST:
+   s += cpulist_scnprintf(s, count, cs->effective_cpus);
+   break;
+   case FILE_EFFECTIVE_MEMLIST:
+   s += nodelist_scnprintf(s, count, cs->effective_mems);
+   break;
default:
ret = -EINVAL;
goto out_unlock;
@@ -1786,6 +1794,18 @@ static struct cftype files[] = {
},
 
{
+   .name = "effective_cpus",
+   .seq_show = cpuset_common_seq_show,
+   .private = FILE_EFFECTIVE_CPULIST,
+   },
+
+   {
+   .name = "effective_mems",
+   .seq_show = cpuset_common_seq_show,
+   .private = FILE_EFFECTIVE_MEMLIST,
+   },
+
+   {
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 11/12] cpuset: allow writing offlined masks to cpuset.cpus/mems

2014-07-09 Thread Li Zefan
As the configured masks won't be limited by its parent, and the top
cpuset's masks won't change when hotplug happens, it's natural to
allow writing offlined masks to the configured masks.

If on default hierarchy:

# echo 0 > /sys/devices/system/cpu/cpu1/online
# mkdir /cpuset/sub
# echo 1 > /cpuset/sub/cpuset.cpus
# cat /cpuset/sub/cpuset.cpus
1

If on legacy hierarchy:

# echo 0 > /sys/devices/system/cpu/cpu1/online
# mkdir /cpuset/sub
# echo 1 > /cpuset/sub/cpuset.cpus
-bash: echo: write error: Invalid argument

Note the checks don't need to be gated by cgroup_on_dfl, because we've
initialized top_cpuset.{cpus,mems}_allowed accordingly in cpuset_bind().

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c47cb94..65878a7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -929,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset 
*trialcs,
if (retval < 0)
return retval;
 
-   if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+   if (!cpumask_subset(trialcs->cpus_allowed,
+   top_cpuset.cpus_allowed))
return -EINVAL;
}
 
@@ -1186,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct 
cpuset *trialcs,
goto done;
 
if (!nodes_subset(trialcs->mems_allowed,
-   node_states[N_MEMORY])) {
-   retval =  -EINVAL;
+ top_cpuset.mems_allowed)) {
+   retval = -EINVAL;
goto done;
}
}
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 07/12] cpuset: apply cs->effective_{cpus,mems}

2014-07-09 Thread Li Zefan
Now we can use cs->effective_{cpus,mems} as effective masks. It's
used whenever:

- we update tasks' cpus_allowed/mems_allowed,
- we want to retrieve task_cs(tsk)'s cpus_allowed/mems_allowed.

They actually replace effective_{cpu,node}mask_cpuset().

effective_mask == configured_mask & parent effective_mask except when
the result is empty, in which case it inherits parent effective_mask.
The result equals the mask computed from effective_{cpu,node}mask_cpuset().

This won't affect the original legacy hierarchy, because in this case we
make sure the effective masks are always the same as user-configured
masks.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 83 ++---
 1 file changed, 14 insertions(+), 69 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e4c31e6..820870a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -313,9 +313,9 @@ static struct file_system_type cpuset_fs_type = {
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
-   while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+   while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
-   cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+   cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 }
 
 /*
@@ -331,9 +331,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct 
cpumask *pmask)
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
-   while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+   while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
-   nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+   nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 }
 
 /*
@@ -795,45 +795,6 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
 }
 
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- *   if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
-   while (cpumask_empty(cs->cpus_allowed))
-   cs = parent_cs(cs);
-   return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- *   if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
-   while (nodes_empty(cs->mems_allowed))
-   cs = parent_cs(cs);
-   return cs;
-}
-
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -844,13 +805,12 @@ static struct cpuset *effective_nodemask_cpuset(struct 
cpuset *cs)
  */
 static void update_tasks_cpumask(struct cpuset *cs)
 {
-   struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
 
css_task_iter_start(&cs->css, &it);
while ((task = css_task_iter_next(&it)))
-   set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+   set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
 }
 
@@ -988,15 +948,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const 
nodemask_t *from,
const nodemask_t *to)
 {
struct task_struct *tsk = current;
-   struct cpuset *mems_cs;
 
tsk->mems_allowed = *to;
 
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
rcu_read_lock();
-   mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-   guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+   guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
 }
 
@@ -1065,13 +1023,12 @@ static void *cpuset_being_rebound;
 static void update_tasks_nodemask(struct cpuset *cs)
 {
static nodemask_t newmems;  /* protected by cpuset_mutex */
-   struc

[PATCH v3 08/12] cpuset: make cs->{cpus,mems}_allowed as user-configured masks

2014-07-09 Thread Li Zefan
Now that we use effective masks to enforce hierarchical behavior,
we can use cs->{cpus,mems}_allowed as the configured masks.

Configured masks can be changed by writing cpuset.cpus and cpuset.mems
only. The new behaviors are:

- They won't be changed by hotplug anymore.
- They won't be limited by its parent's masks.

This is a behavior change, but it won't take effect unless mounted with
sane_behavior.

v2:
- Add comments to explain the differences between configured masks and
effective masks.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 35 +--
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 820870a..4b409d2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -77,6 +77,26 @@ struct cpuset {
 
unsigned long flags;/* "unsigned long" so bitops work */
 
+   /*
+* On default hierarchy:
+*
+* The user-configured masks can only be changed by writing to
+* cpuset.cpus and cpuset.mems, and won't be limited by the
+* parent masks.
+*
+* The effective masks is the real masks that apply to the tasks
+* in the cpuset. They may be changed if the configured masks are
+* changed or hotplug happens.
+*
+* effective_mask == configured_mask & parent's effective_mask,
+* and if it ends up empty, it will inherit the parent's mask.
+*
+*
+* On legacy hierachy:
+*
+* The user-configured masks are always the same with effective masks.
+*/
+
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
nodemask_t mems_allowed;
@@ -450,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct 
cpuset *trial)
 
par = parent_cs(cur);
 
-   /* We must be a subset of our parent cpuset */
+   /* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
-   if (!is_cpuset_subset(trial, par))
+   if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
goto out;
 
/*
@@ -2167,6 +2187,7 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
+   bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
 
mutex_lock(&cpuset_mutex);
 
@@ -2174,13 +2195,14 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
 
-   cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-   mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+   cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+   mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
mutex_lock(&callback_mutex);
-   cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+   if (!on_dfl)
+   cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2189,7 +2211,8 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
mutex_lock(&callback_mutex);
-   top_cpuset.mems_allowed = new_mems;
+   if (!on_dfl)
+   top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset);
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 03/12] cpuset: update cs->effective_{cpus,mems} when config changes

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

To make cs->effective_{cpus,mems} to be effective masks, we need to
  - update the effective masks at hotplug
  - update the effective masks at config change
  - take on ancestor's mask when the effective mask is empty

The second item is done here. We don't need to treat root_cs specially
in update_cpumasks_hier().

This won't introduce behavior change.

v3:
- add a WARN_ON() to check if effective masks are the same with configured
  masks on legacy hierarchy.
- pass trialcs->cpus_allowed to update_cpumasks_hier() and add a comment for
  it. Similar change for update_nodemasks_hier(). Suggested by Tejun.

v2:
- revise the comment in update_{cpu,node}masks_hier(), suggested by Tejun.
- fix to use @cp instead of @cs in these two functions.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 88 +++--
 1 file changed, 54 insertions(+), 34 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 94f651d..da766c3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -855,36 +855,45 @@ static void update_tasks_cpumask(struct cpuset *cs)
 }
 
 /*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
  *
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
  *
  * Called with cpuset_mutex held
  */
-static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
 
rcu_read_lock();
-   cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-   if (cp == root_cs) {
-   if (!update_root)
-   continue;
-   } else {
-   /* skip the whole subtree if @cp have some CPU */
-   if (!cpumask_empty(cp->cpus_allowed)) {
-   pos_css = css_rightmost_descendant(pos_css);
-   continue;
-   }
+   cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+   struct cpuset *parent = parent_cs(cp);
+
+   cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+   /* Skip the whole subtree if the cpumask remains the same. */
+   if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+   pos_css = css_rightmost_descendant(pos_css);
+   continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
 
+   mutex_lock(&callback_mutex);
+   cpumask_copy(cp->effective_cpus, new_cpus);
+   mutex_unlock(&callback_mutex);
+
+   WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+   !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
update_tasks_cpumask(cp);
 
rcu_read_lock();
@@ -940,7 +949,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset 
*trialcs,
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
 
-   update_tasks_cpumask_hier(cs, true);
+   /* use trialcs->cpus_allowed as a temp variable */
+   update_cpumasks_hier(cs, trialcs->cpus_allowed);
 
if (is_load_balanced)
rebuild_sched_domains_locked();
@@ -1091,36 +1101,45 @@ static void update_tasks_nodemask(struct cpuset *cs)
 }
 
 /*
- * update_tasks_nodemask_hier - Update the nodemasks of 

[PATCH v3 06/12] cpuset: initialize top_cpuset's configured masks at mount

2014-07-09 Thread Li Zefan
Since we now have to support different behaviors for the default hierarchy
and legacy hierarchies, top_cpuset's configured masks need to be initialized
accordingly.

Suppose we've offlined cpu1.

On default hierarchy:

# mount -t cgroup -o __DEVEL__sane_behavior xxx /cpuset
# cat /cpuset/cpuset.cpus
0-15

On legacy hierarchy:

# mount -t cgroup xxx /cpuset
# cat /cpuset/cpuset.cpus
0,2-15

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 37 -
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 60577cc..e4c31e6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2015,16 +2015,35 @@ static void cpuset_css_free(struct cgroup_subsys_state 
*css)
kfree(cs);
 }
 
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+   mutex_lock(&cpuset_mutex);
+   mutex_lock(&callback_mutex);
+
+   if (cgroup_on_dfl(root_css->cgroup)) {
+   cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+   top_cpuset.mems_allowed = node_possible_map;
+   } else {
+   cpumask_copy(top_cpuset.cpus_allowed,
+top_cpuset.effective_cpus);
+   top_cpuset.mems_allowed = top_cpuset.effective_mems;
+   }
+
+   mutex_unlock(&callback_mutex);
+   mutex_unlock(&cpuset_mutex);
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
-   .css_alloc = cpuset_css_alloc,
-   .css_online = cpuset_css_online,
-   .css_offline = cpuset_css_offline,
-   .css_free = cpuset_css_free,
-   .can_attach = cpuset_can_attach,
-   .cancel_attach = cpuset_cancel_attach,
-   .attach = cpuset_attach,
-   .base_cftypes = files,
-   .early_init = 1,
+   .css_alloc  = cpuset_css_alloc,
+   .css_online = cpuset_css_online,
+   .css_offline= cpuset_css_offline,
+   .css_free   = cpuset_css_free,
+   .can_attach = cpuset_can_attach,
+   .cancel_attach  = cpuset_cancel_attach,
+   .attach = cpuset_attach,
+   .bind   = cpuset_bind,
+   .base_cftypes   = files,
+   .early_init = 1,
 };
 
 /**
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 04/12] cpuset: inherit ancestor's masks if effective_{cpus,mems} becomes empty

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

To make cs->effective_{cpus,mems} to be effective masks, we need to
  - update the effective masks at hotplug
  - update the effective masks at config change
  - take on ancestor's mask when the effective mask is empty

The last item is done here.

This won't introduce behavior change.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index da766c3..f834002 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -877,6 +877,13 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
 
cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
 
+   /*
+* If it becomes empty, inherit the effective mask of the
+* parent, which is guaranteed to have some CPUs.
+*/
+   if (cpumask_empty(new_cpus))
+   cpumask_copy(new_cpus, parent->effective_cpus);
+
/* Skip the whole subtree if the cpumask remains the same. */
if (cpumask_equal(new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
@@ -1123,6 +1130,13 @@ static void update_nodemasks_hier(struct cpuset *cs, 
nodemask_t *new_mems)
 
nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
 
+   /*
+* If it becomes empty, inherit the effective mask of the
+* parent, which is guaranteed to have some MEMs.
+*/
+   if (nodes_empty(*new_mems))
+   *new_mems = parent->effective_mems;
+
/* Skip the whole subtree if the nodemask remains the same. */
if (nodes_equal(*new_mems, cp->effective_mems)) {
pos_css = css_rightmost_descendant(pos_css);
@@ -2102,7 +2116,11 @@ retry:
 
mutex_lock(&callback_mutex);
cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+
+   /* Inherit the effective mask of the parent, if it becomes empty. */
cpumask_andnot(cs->effective_cpus, cs->effective_cpus, &off_cpus);
+   if (on_dfl && cpumask_empty(cs->effective_cpus))
+   cpumask_copy(cs->effective_cpus, parent_cs(cs)->effective_cpus);
mutex_unlock(&callback_mutex);
 
/*
@@ -2117,7 +2135,11 @@ retry:
 
mutex_lock(&callback_mutex);
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+
+   /* Inherit the effective mask of the parent, if it becomes empty */
nodes_andnot(cs->effective_mems, cs->effective_mems, off_mems);
+   if (on_dfl && nodes_empty(cs->effective_mems))
+   cs->effective_mems = parent_cs(cs)->effective_mems;
mutex_unlock(&callback_mutex);
 
/*
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 05/12] cpuset: use effective cpumask to build sched domains

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

We should partition sched domains according to effective_cpus, which
is the real cpulist that takes effects on tasks in the cpuset.

This won't introduce behavior change.

v2:
- Add a comment for the call of rebuild_sched_domains(), suggested
by Tejun.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f834002..60577cc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -494,11 +494,11 @@ out:
 #ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
  */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
-   return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+   return cpumask_intersects(a->effective_cpus, b->effective_cpus);
 }
 
 static void
@@ -615,7 +615,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
-   cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+   cpumask_copy(doms[0], top_cpuset.effective_cpus);
 
goto done;
}
@@ -719,7 +719,7 @@ restart:
struct cpuset *b = csa[j];
 
if (apn == b->pn) {
-   cpumask_or(dp, dp, b->cpus_allowed);
+   cpumask_or(dp, dp, b->effective_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, 
b);
 
@@ -771,7 +771,7 @@ static void rebuild_sched_domains_locked(void)
 * passing doms with offlined cpu to partition_sched_domains().
 * Anyways, hotplug work item will rebuild sched domains.
 */
-   if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+   if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
 
/* Generate domain masks and attrs */
@@ -870,6 +870,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
 {
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+   bool need_rebuild_sched_domains = false;
 
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
@@ -903,10 +904,21 @@ static void update_cpumasks_hier(struct cpuset *cs, 
struct cpumask *new_cpus)
 
update_tasks_cpumask(cp);
 
+   /*
+* If the effective cpumask of any non-empty cpuset is changed,
+* we need to rebuild sched domains.
+*/
+   if (!cpumask_empty(cp->cpus_allowed) &&
+   is_sched_load_balance(cp))
+   need_rebuild_sched_domains = true;
+
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
+
+   if (need_rebuild_sched_domains)
+   rebuild_sched_domains_locked();
 }
 
 /**
@@ -919,7 +931,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset 
*trialcs,
  const char *buf)
 {
int retval;
-   int is_load_balanced;
 
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -950,17 +961,12 @@ static int update_cpumask(struct cpuset *cs, struct 
cpuset *trialcs,
if (retval < 0)
return retval;
 
-   is_load_balanced = is_sched_load_balance(trialcs);
-
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
 
/* use trialcs->cpus_allowed as a temp variable */
update_cpumasks_hier(cs, trialcs->cpus_allowed);
-
-   if (is_load_balanced)
-   rebuild_sched_domains_locked();
return 0;
 }
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the b

[PATCH v3 01/12] cpuset: add cs->effective_cpus and cs->effective_mems

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

This patch adds the effective masks to struct cpuset and initializes
them. The effective masks of the top cpuset are the same as its configured
masks, and a child cpuset inherits its parent's effective masks.

This won't introduce behavior change.

v2:
- s/real_{mems,cpus}_allowed/effective_{mems,cpus}, suggested by Tejun.
- don't init effective masks in cpuset_css_online() if !cgroup_on_dfl.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 59 ++---
 1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f9d4807..ef0974c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,14 @@ struct cpuset {
struct cgroup_subsys_state css;
 
unsigned long flags;/* "unsigned long" so bitops work */
-   cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
-   nodemask_t mems_allowed;/* Memory Nodes allowed to tasks */
+
+   /* user-configured CPUs and Memory Nodes allow to tasks */
+   cpumask_var_t cpus_allowed;
+   nodemask_t mems_allowed;
+
+   /* effective CPUs and Memory Nodes allow to tasks */
+   cpumask_var_t effective_cpus;
+   nodemask_t effective_mems;
 
/*
 * This is old Memory Nodes tasks took on.
@@ -376,13 +382,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset 
*cs)
if (!trial)
return NULL;
 
-   if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
-   kfree(trial);
-   return NULL;
-   }
-   cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+   if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+   goto free_cs;
+   if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+   goto free_cpus;
 
+   cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+   cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
+
+free_cpus:
+   free_cpumask_var(trial->cpus_allowed);
+free_cs:
+   kfree(trial);
+   return NULL;
 }
 
 /**
@@ -391,6 +404,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
  */
 static void free_trial_cpuset(struct cpuset *trial)
 {
+   free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
 }
@@ -1848,18 +1862,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
-   if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
-   kfree(cs);
-   return ERR_PTR(-ENOMEM);
-   }
+   if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+   goto free_cs;
+   if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+   goto free_cpus;
 
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+   cpumask_clear(cs->effective_cpus);
+   nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
 
return &cs->css;
+
+free_cpus:
+   free_cpumask_var(cs->cpus_allowed);
+free_cs:
+   kfree(cs);
+   return ERR_PTR(-ENOMEM);
 }
 
 static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1882,6 +1904,13 @@ static int cpuset_css_online(struct cgroup_subsys_state 
*css)
 
cpuset_inc();
 
+   mutex_lock(&callback_mutex);
+   if (cgroup_on_dfl(cs->css.cgroup)) {
+   cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+   cs->effective_mems = parent->effective_mems;
+   }
+   mutex_unlock(&callback_mutex);
+
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
 
@@ -1941,6 +1970,7 @@ static void cpuset_css_free(struct cgroup_subsys_state 
*css)
 {
struct cpuset *cs = css_cs(css);
 
+   free_cpumask_var(c

[PATCH v3 02/12] cpuset: update cpuset->effective_{cpus,mems} at hotplug

2014-07-09 Thread Li Zefan
We're going to have separate user-configured masks and effective ones.

Eventually configured masks can only be changed by writing cpuset.cpus
and cpuset.mems, and they won't be restricted by parent cpuset. While
effective masks reflect cpu/memory hotplug and hierarchical restriction,
and these are the real masks that apply to the tasks in the cpuset.

We calculate effective mask this way:
  - top cpuset's effective_mask == online_mask, otherwise
  - cpuset's effective_mask == configured_mask & parent effective_mask,
if the result is empty, it inherits parent effective mask.

Those behavior changes are for default hierarchy only. For legacy
hierarchy, effective_mask and configured_mask are the same, so we won't
break old interfaces.

To make cs->effective_{cpus,mems} to be effective masks, we need to
  - update the effective masks at hotplug
  - update the effective masks at config change
  - take on ancestor's mask when the effective mask is empty

The first item is done here.

This won't introduce behavior change.

Signed-off-by: Li Zefan 
---
 kernel/cpuset.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ef0974c..94f651d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2082,6 +2082,7 @@ retry:
 
mutex_lock(&callback_mutex);
cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+   cpumask_andnot(cs->effective_cpus, cs->effective_cpus, &off_cpus);
mutex_unlock(&callback_mutex);
 
/*
@@ -2096,6 +2097,7 @@ retry:
 
mutex_lock(&callback_mutex);
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+   nodes_andnot(cs->effective_mems, cs->effective_mems, off_mems);
mutex_unlock(&callback_mutex);
 
/*
@@ -2159,6 +2161,7 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
if (cpus_updated) {
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+   cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
@@ -2167,6 +2170,7 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
if (mems_updated) {
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = new_mems;
+   top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset);
}
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 00/12] cpuset: separate configured masks and effective masks

2014-07-09 Thread Li Zefan
This patchset introduces behavior changes, but only for the default hierarchy:

- We introduce new interfaces cpuset.effective_cpus and cpuset.effective_mems,
  while cpuset.cpus and cpuset.mems will be configured masks.

- The configured masks can be changed by writing cpuset.cpus/mems only. They
  won't be changed when hotplug happens.

- Users can config cpus and mems without restrictions from the parent cpuset.
  effective masks will enforce the hierarchical behavior.

- Users can also config cpus and mems to have already offlined CPU/nodes.

- When a CPU/node is onlined, it will be brought back to the effective masks
  if it's in the configured masks.

- We build sched domains based on effective cpumask but not configured cpumask.

v3:
- rebased against "cgroup: remove sane_behavior support on non-default 
hierarchies"
- addressed previous review comments
- adjusted some code, comment and changelog slightly

v2:
- fixed two bugs
- made changelogs more verbose
- added more comments
- changed cs->real_{mems,cpus}_allowed to cs->effective_{mems, cpus}
- split "cpuset: enable onlined cpu/node in effective masks" into 2 patches
- exported cpuset.effective_{cpus,mems} unconditionally


Li Zefan (12):
  cpuset: add cs->effective_cpus and cs->effective_mems
  cpuset: update cpuset->effective_{cpus,mems} at hotplug
  cpuset: update cs->effective_{cpus,mems} when config changes
  cpuset: inherit ancestor's masks if effective_{cpus,mems} becomes
empty
  cpuset: use effective cpumask to build sched domains
  cpuset: initialize top_cpuset's configured masks at mount
  cpuset: apply cs->effective_{cpus,mems}
  cpuset: make cs->{cpus,mems}_allowed as user-configured masks
  cpuset: refactor cpuset_hotplug_update_tasks()
  cpuset: enable onlined cpu/node in effective masks
  cpuset: allow writing offlined masks to cpuset.cpus/mems
  cpuset: export effective masks to userspace

 kernel/cpuset.c | 493 ++--
 1 file changed, 304 insertions(+), 189 deletions(-)

-- 
1.8.0.2


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.17] cgroup: remove sane_behavior support on non-default hierarchies

2014-07-09 Thread Li Zefan
On 2014/7/3 7:45, Tejun Heo wrote:
> Hello,
> 
> sane_behavior has been used as a development vehicle for the default
> unified hierarchy.  Now that the default hierarchy is in place, the
> flag became redundant and confusing as its usage is allowed on all
> hierarchies.  There are gonna be either the default hierarchy or
> legacy ones.  Let's make that clear by removing sane_behavior support
> on non-default hierarchies.
> 
> This patchset contains the following four patches.
> 
>  0001-cgroup-remove-CGRP_ROOT_OPTION_MASK.patch
>  0002-cgroup-make-interface-file-cgroup.sane_behavior-lega.patch
>  0003-cgroup-remove-sane_behavior-support-on-non-default-h.patch
>  0004-cgroup-clean-up-sane_behavior-handling.patch
> 
> 0001 is a trivial cleanup.
> 
> 0002 removes "cgroup.sane_behavior" from the default hierarchy.
> 
> 0003 removes sane_behavior support on non-default hierarchies.
> 
> 0004 cleans up sane_behavior handling.
> 
> This patchset is on top of a497c3ba1d97 ("Linux 3.16-rc2") and
> available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-dfl-instead-of-sane
> 
> diffstat follows.  Thanks.
> 
>  block/blk-throttle.c   |6 +-
>  include/linux/cgroup.h |  128 
> -
>  kernel/cgroup.c|   96 +++-
>  kernel/cpuset.c    |   33 +---
>  mm/memcontrol.c|7 +-
>  5 files changed, 117 insertions(+), 153 deletions(-)
> 

Acked-by: Li Zefan 

I'm rebasing my cpuset patchset against this.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] sched/rt: overrun could happen in start_hrtick_dl

2014-07-07 Thread Li Zefan
On 2014/7/8 9:10, xiaofeng.yan wrote:
> On 2014/7/7 16:41, Peter Zijlstra wrote:
>> On Fri, Jul 04, 2014 at 12:02:21PM +, xiaofeng.yan wrote:
>>> It could be wrong for the precision of runtime and deadline
>>> when the precision is within microsecond level. For example:
>>> Task runtime deadline period
>>>   P1   200us   500us   500us
>>>
>>> This case need enbale HRTICK feature by the next command
>>> PC#echo "HRTICK" > /sys/kernel/debug/sched_features
>>> PC#./schedtool -E -t 20:50 -e ./test&
>>> PC#trace-cmd record -e sched_switch
>> Are you actually using HRTICK ?
> yes, If HRTICK is close , then all of runtime and deadline will be wrong.

I think what peter meant is, do you use HRTICK in products or
just use it for testing/experiment?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] MAINTAINERS:ARM:hisi: add Hisilicon SoC family

2014-07-04 Thread Li Zefan
On 2014/7/4 15:11, xuwei wrote:
> 
> Introduce a new mach-hisi that will support Hisilicon SoCs based on ARMv7
> and I am taking maintainership for it.
> 
> Signed-off-by: Wei Xu 
> ---
>  MAINTAINERS | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 134483f..c11c89b 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -972,6 +972,14 @@ F:   arch/arm/mach-pxa/hx4700.c
>  F:   arch/arm/mach-pxa/include/mach/hx4700.h
>  F:   sound/soc/pxa/hx4700.c
>  
> +ARM/Hisilicon SoC support
> +M:   Wei Xu 
> +L:   linux-arm-ker...@lists.infradead.org (moderated for non-subscribers)
> +W:   www.hisilicon.com
> +S:   Maintained

S:  Supported ?

Supported:   Someone is actually paid to look after this.
Maintained:  Someone actually looks after it.

> +T:   git git://github.com/hisilicon/linux-hisi.git
> +F:   arch/arm/mach-hisi/
> +
>  ARM/HP JORNADA 7XX MACHINE SUPPORT
>  M:   Kristoffer Ericson 
>  W:   www.jlime.com
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.17] cgroup, blkcg, memcg: make blkcg depend on memcg on unified hierarchy

2014-07-03 Thread Li Zefan
Hi Tejun,

On 2014/6/28 9:03, Tejun Heo wrote:
> Hello, guys.
> 
> Currently, the blkio subsystem attributes all of writeback IOs to the
> root.  One of the issues is that there's no way to tell who originated
> a writeback IO from block layer.  Those IOs are usually issued
> asynchronously from a task which didn't have anything to do with
> actually generating the dirty pages.  The memory subsystem, when
> enabled, already keeps track of the ownership of each dirty page and
> it's desirable for blkio to piggyback instead of adding its own
> per-page tag.

It's great to see this being worked on!

> 
> This can be achieved on the unified hierarchy without too much
> difficulty.  This patchset implements a dependency mechanism in the
> cgroup such that a subsystem can depend on other subsystems.  If
> available, the depended-upon subsystems are enabled together
> implicitly when the subsystem is turned on.  Implicitly enabled
> subsystems are invisible and the dependencies are transparent to
> userland.
> 
> This patchset implements the dependency mechanism in cgroup core and
> make blkcg depend on memcg.  This doesn't actually solve the writeback
> problem yet but is an important step.
> 
> This patchset contains the following six patches.
> 
>  0001-cgroup-reorganize-cgroup_subtree_control_write.patch
>  0002-cgroup-introduce-cgroup-subtree_control.patch
>  0003-cgroup-make-interface-files-visible-iff-enabled-on-c.patch
>  0004-cgroup-implement-cgroup_subsys-css_reset.patch
>  0005-cgroup-implement-cgroup_subsys-depends_on.patch
>  0006-blkcg-memcg-make-blkcg-depend-on-memcg-on-the-defaul.patch
> 
> 0001-0005 gradually implement the dependency mechanism.
> 
> 0006 makes blkcg depend on memcg.
> 
> This patchset is on top of a497c3ba1d97 ("Linux 3.16-rc2") and
> available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-cgroup-dependency
> 
> diffstat follows.  Thanks.
> 
>  Documentation/cgroups/cgroups.txt   |   14 +
>  Documentation/cgroups/unified-hierarchy.txt |   23 ++-
>  block/blk-cgroup.c  |7
>  include/linux/cgroup.h  |   20 ++
>  kernel/cgroup.c     |  201 
> ++--
>  mm/memcontrol.c |   24 +++
>  6 files changed, 243 insertions(+), 46 deletions(-)
> 

Acked-by: Li Zefan 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH cgroup/for-3.16-fixes] cpuset: break kernfs active protection in cpuset_write_resmask()

2014-06-30 Thread Li Zefan
On 2014/7/1 3:47, Tejun Heo wrote:
> Hey, Li.
> 
> Can you please test this patch and ack it?
> 
...
> Signed-off-by: Tejun Heo 
> Reported-by: Li Zefan 

Tested-by: Li Zefan 

Thanks!

> ---
>  kernel/cpuset.c |   12 
>  1 file changed, 12 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 3/3] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-29 Thread Li Zefan
We've converted cgroup to kernfs so cgroup won't be intertwined with
vfs objects and locking, but there are dark areas.

Run two instances of this script concurrently:

for ((; ;))
{
mount -t cgroup -o cpuacct xxx /cgroup
umount /cgroup
}

After a while, I saw two mount processes were stuck at retrying, because
they were waiting for a subsystem to become free, but the root associated
with this subsystem never got freed.

This can happen, if thread A is in the process of killing superblock but
hasn't called percpu_ref_kill(), and at this time thread B is mounting
the same cgroup root and finds the root in the root list and performs
percpu_ref_try_get().

To fix this, we try to increase both the refcnt of the superblock and the
percpu refcnt of cgroup root.

v2:
- we should try to get both the superblock refcnt and cgroup_root refcnt,
  because cgroup_root may have no superblock associated with it.
- adjust/add comments.

Cc:  # 3.15
Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 28 ++--
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d3662ac..11e40cf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1655,6 +1655,7 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
int ret;
int i;
bool new_sb;
+   struct super_block *sb = NULL;
 
/*
 * The first time anyone tries to mount a cgroup, enable the list
@@ -1739,14 +1740,18 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
 
/*
 * A root's lifetime is governed by its root cgroup.
-* tryget_live failure indicate that the root is being
-* destroyed.  Wait for destruction to complete so that the
-* subsystems are free.  We can use wait_queue for the wait
-* but this path is super cold.  Let's just sleep for a bit
-* and retry.
+* pin_sb and tryget_live failure indicate that the root is
+* being destroyed.  Wait for destruction to complete so that
+* the subsystems are free.  We can use wait_queue for the
+* wait but this path is super cold.  Let's just sleep for
+* a bit and retry.
 */
-   if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+   sb = kernfs_pin_sb(root->kf_root, NULL);
+   if (IS_ERR(sb) ||
+   !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
+   if (!IS_ERR_OR_NULL(sb))
+   deactivate_super(sb);
msleep(10);
ret = restart_syscall();
goto out_free;
@@ -1790,6 +1795,17 @@ out_free:
dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+
+   if (sb) {
+   /*
+* On success kernfs_mount() returns with sb->s_umount held,
+* but kernfs_mount() also increases the superblock's refcnt,
+* so calling deactivate_super() to drop the refcnt we got when
+* looking up cgroup root won't acquire sb->s_umount again.
+*/
+   WARN_ON(new_sb);
+   deactivate_super(sb);
+   }
return dentry;
 }
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 2/3] kernfs: introduce kernfs_pin_sb()

2014-06-29 Thread Li Zefan
kernfs_pin_sb() tries to get a refcnt of the superblock.

This will be used by cgroupfs.

v2:
- make kernfs_pin_sb() return the superblock.
- drop kernfs_drop_sb().

[ This is a prerequisite for a bugfix. ]
Cc:  # 3.15
Acked-by: Greg Kroah-Hartman 
Signed-off-by: Li Zefan 
---
 fs/kernfs/mount.c  | 27 +++
 include/linux/kernfs.h |  1 +
 2 files changed, 28 insertions(+)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f25a7c0..616c5c4 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -210,6 +210,33 @@ void kernfs_kill_sb(struct super_block *sb)
kernfs_put(root_kn);
 }
 
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations. Return NULL if there's no superblock associated to this
+ * kernfs_root, or -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+   struct kernfs_super_info *info;
+   struct super_block *sb = NULL;
+
+   mutex_lock(&kernfs_mutex);
+   list_for_each_entry(info, &root->supers, node) {
+   if (info->ns == ns) {
+   sb = info->sb;
+   if (!atomic_inc_not_zero(&info->sb->s_active))
+   sb = ERR_PTR(-EINVAL);
+   break;
+   }
+   }
+   mutex_unlock(&kernfs_mutex);
+   return sb;
+}
+
 void __init kernfs_init(void)
 {
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 589318b..9096296 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -287,6 +287,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type 
*fs_type, int flags,
   struct kernfs_root *root, bool *new_sb_created,
   const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 
 void kernfs_init(void);
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/3] cgroup: fix mount failure in a corner case

2014-06-29 Thread Li Zefan
  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /cgroup busy
  mount: according to mtab, xxx is already mounted on /cgroup

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount for this case.

v3:
- put the refcnt immediately after getting it. (Tejun)

v2:
- use percpu_ref_tryget_live() rather than introducing
  percpu_ref_alive(). (Tejun)
- adjust comment.

Cc:  # 3.15
Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..d3662ac 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i;
bool new_sb;
 
/*
@@ -1677,6 +1679,25 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* Destruction of cgroup root is asynchronous, so we may fail to
+* mount a cgroupfs if it immediately follows a umount. Let's wait
+* a little bit and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)) ||
+   ss->root == &cgrp_dfl_root)
+   continue;
+
+   if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall();
+   goto out_free;
+   }
+   cgroup_put(&ss->root->cgrp);
+   }
+
for_each_root(root) {
bool name_match = false;
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[BUG] cpuset: lockdep warning

2014-06-29 Thread Li Zefan
Hi Tejun,

In this lockdep warning kernfs and workqueue are involved, so I'm not sure
what's happening here.

This was triggered when tasks were being moved to parent cpuset due to hotplug.
The kernel is 3.16-rc1, with no modification.

localhost:/ # mount -t cgroup -o cpuset xxx /cpuset
localhost:/ # mkdir /cpuset/tmp
localhost:/ # echo 1 > /cpuset/tmp/cpuset.cpus
localhost:/ # echo 0 > cpuset/tmp/cpuset.mems
localhost:/ # echo $$ > /cpuset/tmp/tasks
localhost:/ # echo 0 > /sys/devices/system/cpu/cpu1/online


[ 1810.292243] ==
[ 1810.292251] [ INFO: possible circular locking dependency detected ]
[ 1810.292259] 3.16.0-rc1-0.1-default+ #7 Not tainted
[ 1810.292266] ---
[ 1810.292273] kworker/1:0/32649 is trying to acquire lock:
[ 1810.292280]  (cgroup_mutex){+.+.+.}, at: [] 
cgroup_transfer_tasks+0x37/0x150
[ 1810.292300]
[ 1810.292300] but task is already holding lock:
[ 1810.292309]  (cpuset_hotplug_work){+.+...}, at: [] 
process_one_work+0x192/0x520
[ 1810.292327]
[ 1810.292327] which lock already depends on the new lock.
[ 1810.292327]
[ 1810.292339]
[ 1810.292339] the existing dependency chain (in reverse order) is:
[ 1810.292348]
[ 1810.292348] -> #2 (cpuset_hotplug_work){+.+...}:
[ 1810.292360][] validate_chain+0x656/0x7c0
[ 1810.292371][] __lock_acquire+0x382/0x660
[ 1810.292380][] lock_acquire+0xf9/0x170
[ 1810.292389][] flush_work+0x39/0x90
[ 1810.292398][] cpuset_write_resmask+0x51/0x120
[ 1810.292409][] cgroup_file_write+0x49/0x1f0
[ 1810.292419][] kernfs_fop_write+0xfd/0x190
[ 1810.292431][] vfs_write+0xe5/0x190
[ 1810.292443][] SyS_write+0x5c/0xc0
[ 1810.292452][] system_call_fastpath+0x16/0x1b
[ 1810.292464]
[ 1810.292464] -> #1 (s_active#175){.+}:
[ 1810.292476][] validate_chain+0x656/0x7c0
[ 1810.292486][] __lock_acquire+0x382/0x660
[ 1810.292495][] lock_acquire+0xf9/0x170
[ 1810.292504][] kernfs_drain+0x13b/0x1c0
[ 1810.292513][] __kernfs_remove+0xc8/0x220
[ 1810.292523][] kernfs_remove_by_name_ns+0x50/0xb0
[ 1810.292533][] cgroup_addrm_files+0x16e/0x290
[ 1810.292543][] cgroup_clear_dir+0x6d/0xa0
[ 1810.292552][] rebind_subsystems+0x10f/0x350
[ 1810.292562][] cgroup_setup_root+0x1bf/0x290
[ 1810.292571][] cgroup_mount+0x123/0x3d0
[ 1810.292581][] mount_fs+0x4d/0x1a0
[ 1810.292591][] vfs_kern_mount+0x79/0x160
[ 1810.292602][] do_new_mount+0xd9/0x200
[ 1810.292611][] do_mount+0x1dc/0x220
[ 1810.292621][] SyS_mount+0xbc/0xe0
[ 1810.292630][] system_call_fastpath+0x16/0x1b
[ 1810.292640]
[ 1810.292640] -> #0 (cgroup_mutex){+.+.+.}:
[ 1810.292651][] check_prev_add+0x43e/0x4b0
[ 1810.292660][] validate_chain+0x656/0x7c0
[ 1810.292669][] __lock_acquire+0x382/0x660
[ 1810.292678][] lock_acquire+0xf9/0x170
[ 1810.292687][] mutex_lock_nested+0x6f/0x380
[ 1810.292697][] cgroup_transfer_tasks+0x37/0x150
[ 1810.292707][] 
hotplug_update_tasks_insane+0x110/0x1d0
[ 1810.292718][] 
cpuset_hotplug_update_tasks+0x13d/0x180
[ 1810.292729][] cpuset_hotplug_workfn+0x18c/0x630
[ 1810.292739][] process_one_work+0x254/0x520
[ 1810.292748][] worker_thread+0x13d/0x3d0
[ 1810.292758][] kthread+0xf8/0x100
[ 1810.292768][] ret_from_fork+0x7c/0xb0
[ 1810.292778]
[ 1810.292778] other info that might help us debug this:
[ 1810.292778]
[ 1810.292789] Chain exists of:
[ 1810.292789]   cgroup_mutex --> s_active#175 --> cpuset_hotplug_work
[ 1810.292789]
[ 1810.292807]  Possible unsafe locking scenario:
[ 1810.292807]
[ 1810.292816]CPU0CPU1
[ 1810.292822]
[ 1810.292827]   lock(cpuset_hotplug_work);
[ 1810.292835]lock(s_active#175);
[ 1810.292845]lock(cpuset_hotplug_work);
[ 1810.292855]   lock(cgroup_mutex);
[ 1810.292862]
[ 1810.292862]  *** DEADLOCK ***
[ 1810.292862]
[ 1810.292872] 2 locks held by kworker/1:0/32649:
[ 1810.292878]  #0:  ("events"){.+.+.+}, at: [] 
process_one_work+0x192/0x520
[ 1810.292895]  #1:  (cpuset_hotplug_work){+.+...}, at: [] 
process_one_work+0x192/0x520
[ 1810.292911]
[ 1810.292911] stack backtrace:
[ 1810.292920] CPU: 1 PID: 32649 Comm: kworker/1:0 Not tainted 
3.16.0-rc1-0.1-default+ #7
[ 1810.292929] Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285
  /BC11BTSA  , BIOS CTSAV036 04/27/2011
[ 1810.292943] Workqueue: events cpuset_hotplug_workfn
[ 1810.292951]  824b01e0 8800afdd3918 815a5f78 
8800afdd3958
[ 1810.292964]  810c263f 1d1fa490 8800afdd3978 
88061d1fa490
[ 1810.292976]   88061d1fad08 88061d1fad40 
8800afdd39f8
[ 1810.292989] Call Tra

Re: [PATCH v2 1/3] cgroup: fix mount failure in a corner case

2014-06-29 Thread Li Zefan
On 2014/6/28 19:58, Tejun Heo wrote:
> Hello, Li.
> 
> On Fri, Jun 27, 2014 at 05:13:12PM +0800, Li Zefan wrote:
>> +for_each_subsys(ss, i) {
>> +if (!(opts.subsys_mask & (1 << i)) ||
>> +ss->root == &cgrp_dfl_root)
>> +continue;
>> +
>> +if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
>> +mutex_unlock(&cgroup_mutex);
>> +msleep(10);
>> +ret = restart_syscall();
>> +goto out_free;
>> +}
> 
> Why not just put it immediately?  We know that it's not gonna be
> destroyed while holding cgroup_mutex.  It may look a bit weird but
> this is a pretty special case anyway and deferring put doesn't buy
> anything.
> 

Yeah, this is better. :)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/3] cgroup: fix mount failure in a corner case

2014-06-27 Thread Li Zefan
Made a mistake again.. :(


==

From: Li Zefan 
Subject: [PATCH 1/3] cgroup: fix mount failure in a corner case

  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /cgroup busy
  mount: according to mtab, xxx is already mounted on /cgroup

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount for this case.

v2:
- use percpu_ref_tryget_live() rather than introducing
  percpu_ref_alive(). (Tejun)
- adjust comment.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..b94449f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i, j = -1;
bool new_sb;
 
/*
@@ -1677,6 +1679,25 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* Destruction of cgroup root is asynchronous, so we may fail to
+* mount a cgroupfs if it immediately follows a umount. Let's wait
+* a little bit and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)) ||
+   ss->root == &cgrp_dfl_root)
+   continue;
+
+   if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall();
+   goto out_free;
+   }
+   j = i;
+   }
+
for_each_root(root) {
bool name_match = false;
 
@@ -1763,6 +1784,14 @@ out_free:
kfree(opts.release_agent);
kfree(opts.name);
 
+   for_each_subsys(ss, i) {
+   if (i > j)
+   break;
+   if (!(opts.subsys_mask & (1 << i)))
+   continue;
+   cgroup_put(&ss->root->cgrp);
+   }
+
if (ret)
return ERR_PTR(ret);
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/3] cgroup: fix mount failure in a corner case

2014-06-27 Thread Li Zefan
Oh sorry the cut&paste was incomplete. Here's the complete one:

====

From: Li Zefan 
Date: Thu, 12 Jun 2014 09:11:00 +0800
Subject: [PATCH v2 1/3] cgroup: fix mount failure in a corner case

  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /cgroup busy
  mount: according to mtab, xxx is already mounted on /cgroup

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount for this case.

v2:
- use percpu_ref_tryget_live() rather than introducing
  percpu_ref_alive(). (Tejun)
- adjust comment.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..6826227 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i, j = -1;
bool new_sb;
 
/*
@@ -1677,6 +1679,25 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* Destruction of cgroup root is asynchronous, so we may fail to
+* mount a cgroupfs if it immediately follows a umount. Let's wait
+* a little bit and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)) ||
+   ss->root == &cgrp_dfl_root)
+   continue;
+
+   if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall();
+   goto out_free;
+   }
+   j = i;
+   }
+
for_each_root(root) {
bool name_match = false;
 
@@ -1763,6 +1784,14 @@ out_free:
kfree(opts.release_agent);
kfree(opts.name);
 
+   for_each_subsys(ss, i) {
+   if (i > j)
+   break;
+   if (!(opts.subsys_mask & (1 << i)))
+   continue;
+   cgroup_put(&ss->root->cgrp);
+   }
+
if (ret)
return ERR_PTR(ret);
 
-- 
1.8.0.2


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/3] kernfs: introduce kernfs_pin_sb()

2014-06-27 Thread Li Zefan
kernfs_pin_sb() tries to get a refcnt of the superblock.

This will be used by cgroupfs.

v2:
- make kernfs_pin_sb() return pointer to the superblock.
- drop kernfs_drop_sb().

Signed-off-by: Li Zefan 
---
 fs/kernfs/mount.c  | 27 +++
 include/linux/kernfs.h |  1 +
 2 files changed, 28 insertions(+)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f25a7c0..616c5c4 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -210,6 +210,33 @@ void kernfs_kill_sb(struct super_block *sb)
kernfs_put(root_kn);
 }
 
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations. Return NULL if there's no superblock associated to this
+ * kernfs_root, or -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+   struct kernfs_super_info *info;
+   struct super_block *sb = NULL;
+
+   mutex_lock(&kernfs_mutex);
+   list_for_each_entry(info, &root->supers, node) {
+   if (info->ns == ns) {
+   sb = info->sb;
+   if (!atomic_inc_not_zero(&info->sb->s_active))
+   sb = ERR_PTR(-EINVAL);
+   break;
+   }
+   }
+   mutex_unlock(&kernfs_mutex);
+   return sb;
+}
+
 void __init kernfs_init(void)
 {
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 589318b..9096296 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -287,6 +287,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type 
*fs_type, int flags,
   struct kernfs_root *root, bool *new_sb_created,
   const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 
 void kernfs_init(void);
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/3] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-27 Thread Li Zefan
We've converted cgroup to kernfs so cgroup won't be intertwined with
vfs objects and locking, but there are dark areas.

Run two instances of this script concurrently:

for ((; ;))
{
mount -t cgroup -o cpuacct xxx /cgroup
umount /cgroup
}

After a while, I saw two mount processes were stuck at retrying, because
they were waiting for a subsystem to become free, but the root associated
with this subsystem never got freed.

This can happen, if thread A is in the process of killing superblock but
hasn't called percpu_ref_kill(), and at this time thread B is mounting
the same cgroup root and finds the root in the root list and performs
percpu_ref_try_get().

To fix this, we try to increase both the refcnt of the superblock and the
percpu refcnt of cgroup root.

v2:
- we should try to get both the superblock refcnt and cgroup_root refcnt,
  because cgroup_root may have no superblock associated with it.
- adjust/add comments.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 28 ++--
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ae2b382..111b7c3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1655,6 +1655,7 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
int ret;
int i, j = -1;
bool new_sb;
+   struct super_block *sb = NULL;
 
/*
 * The first time anyone tries to mount a cgroup, enable the list
@@ -1737,14 +1738,18 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
 
/*
 * A root's lifetime is governed by its root cgroup.
-* tryget_live failure indicate that the root is being
-* destroyed.  Wait for destruction to complete so that the
-* subsystems are free.  We can use wait_queue for the wait
-* but this path is super cold.  Let's just sleep for a bit
-* and retry.
+* pin_sb and tryget_live failure indicate that the root is
+* being destroyed.  Wait for destruction to complete so that
+* the subsystems are free.  We can use wait_queue for the
+* wait but this path is super cold.  Let's just sleep for
+* a bit and retry.
 */
-   if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+   sb = kernfs_pin_sb(root->kf_root, NULL);
+   if (IS_ERR(sb) ||
+   !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
+   if (!IS_ERR_OR_NULL(sb))
+   deactivate_super(sb);
msleep(10);
ret = restart_syscall();
goto out_free;
@@ -1796,6 +1801,17 @@ out_free:
dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+
+   if (sb) {
+   /*
+* On success kernfs_mount() returns with sb->s_umount held,
+* but kernfs_mount() also increases the superblock's refcnt,
+* so calling deactivate_super() to drop the refcnt we got when
+* looking up cgroup root won't acquire sb->s_umount again.
+*/
+   WARN_ON(new_sb);
+   deactivate_super(sb);
+   }
return dentry;
 }
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/3] cgroup: fix mount failure in a corner case

2014-06-27 Thread Li Zefan
  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /mnt busy
  mount: according to mtab, xxx is already mounted on /mnt

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount for this case.

v2:
- use percpu_ref_tryget_live() rather than introducing
  percpu_ref_alive(). (Tejun)
- adjust comment.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..ae2b382 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i, j = -1;
bool new_sb;
 
/*
@@ -1677,6 +1679,23 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* Destruction of cgroup root is asynchronous, so we may fail to
+* mount a cgroupfs if it immediately follows a umount. Let's wait
+* a little bit and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)))
+   continue;
+   if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/5] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-26 Thread Li Zefan
On 2014/6/25 23:00, Tejun Heo wrote:
> Hey,
> 
> On Wed, Jun 25, 2014 at 09:56:31AM +0800, Li Zefan wrote:
>>> Hmmm?  Why does that matter?  The only region in cgroup_mount() which
>>> needs to be put inside such mutex would be root lookup, no?
>>
>> unfortunately that won't help. I think what you suggest is:
>>
>> cgroup_mount()
>> {
>>  mutex_lock();
>>  lookup_cgroup_root();
>>  mutex_unlock();
>>  kernfs_mount();
>> }
>>
>> cgroup_kill_sb()
>> {
>>  mutex_lock();
>>  percpu_ref_kill();
>>  mutex_Unlock();
>>  kernfs_kill_sb();
>> }
>>
>> See, we may still be destroying the superblock after we've succeeded
>> in getting the refcnt of cgroup root.
> 
> Sure, but now the decision to kill is synchronized so the other side
> can interlock with it.  e.g.
> 
> cgroup_mount()
> {
>   mutex_lock();
>   lookup_cgroup_root();
>   if (root isn't killed yet)
>   root->this_better_stay_alive++;
>   mutex_unlock();
>   kernfs_mount();
> }
> 
> cgroup_kill_sb()
> {
>   mutex_lock();
>   if (check whether root can be killed)
>   percpu_ref_kill();
>   mutex_unlock();
>   if (the above condition was true)
>   kernfs_kill_sb();
> }
> 

This looks nasty, and I don't think it's correct. If we skip the call
to kernfs_kill_sb(), kernfs_super_info won't be freed but super_block
will, so we will end up either leaking memory or accessing invalid
memory. There are other problems like returning with sb->s_umount still
held.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] mm/mempolicy: fix sleeping function called from invalid context

2014-06-25 Thread Li Zefan
On 2014/6/25 9:57, Gu Zheng wrote:
> When running with the kernel (3.15-rc7+), the following bug occurs:
> [ 9969.258987] BUG: sleeping function called from invalid context at 
> kernel/locking/mutex.c:586
> [ 9969.359906] in_atomic(): 1, irqs_disabled(): 0, pid: 160655, name: python
> [ 9969.441175] INFO: lockdep is turned off.
> [ 9969.488184] CPU: 26 PID: 160655 Comm: python Tainted: G   A  
> 3.15.0-rc7+ #85
> [ 9969.581032] Hardware name: FUJITSU-SV PRIMEQUEST 1800E/SB, BIOS PRIMEQUEST 
> 1000 Series BIOS Version 1.39 11/16/2012
> [ 9969.706052]  81a20e60 8803e941fbd0 8162f523 
> 8803e941fd18
> [ 9969.795323]  8803e941fbe0 8109995a 8803e941fc58 
> 81633e6c
> [ 9969.884710]  811ba5dc 880405c6b480 88041fdd90a0 
> 2000
> [ 9969.974071] Call Trace:
> [ 9970.003403]  [] dump_stack+0x4d/0x66
> [ 9970.065074]  [] __might_sleep+0xfa/0x130
> [ 9970.130743]  [] mutex_lock_nested+0x3c/0x4f0
> [ 9970.200638]  [] ? kmem_cache_alloc+0x1bc/0x210
> [ 9970.272610]  [] cpuset_mems_allowed+0x27/0x140
> [ 9970.344584]  [] ? __mpol_dup+0x63/0x150
> [ 9970.409282]  [] __mpol_dup+0xe5/0x150
> [ 9970.471897]  [] ? __mpol_dup+0x63/0x150
> [ 9970.536585]  [] ? copy_process.part.23+0x606/0x1d40
> [ 9970.613763]  [] ? trace_hardirqs_on+0xd/0x10
> [ 9970.683660]  [] ? monotonic_to_bootbased+0x2f/0x50
> [ 9970.759795]  [] copy_process.part.23+0x670/0x1d40
> [ 9970.834885]  [] do_fork+0xd8/0x380
> [ 9970.894375]  [] ? __audit_syscall_entry+0x9c/0xf0
> [ 9970.969470]  [] SyS_clone+0x16/0x20
> [ 9971.030011]  [] stub_clone+0x69/0x90
> [ 9971.091573]  [] ? system_call_fastpath+0x16/0x1b
> 
> The cause is that cpuset_mems_allowed() tries to take
> mutex_lock(&callback_mutex) under the rcu_read_lock (which is held in
> __mpol_dup()). In cpuset_mems_allowed(), the access to cpuset is under
> rcu_read_lock, so in __mpol_dup() we can reduce the rcu_read_lock
> protection region to protect only the access to cpuset in
> current_cpuset_is_being_rebound(), so that we can avoid this bug.
> This patch is a temporary solution that just addresses the bug mentioned 
> above,
> can not fix the long-standing issue about cpuset.mems rebinding on fork():
> "
> When the forker's task_struct is duplicated (which includes ->mems_allowed)
> and it races with an update to cpuset_being_rebound in update_tasks_nodemask()
> then the task's mems_allowed doesn't get updated. And the child task's
> mems_allowed can be wrong if the cpuset's nodemask changes before the
> child has been added to the cgroup's tasklist.
> "
> 
> Signed-off-by: Gu Zheng 
> Cc: stable 

Acked-by: Li Zefan 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cgroup: fix a typo in Documentation/cgroups/cgroups.txt

2014-06-24 Thread Li Zefan
On 2014/6/25 11:30, Chen Hanxiao wrote:
> s/iff/if
> 

This is not a typo. iff == if and only if.

> Signed-off-by: Chen Hanxiao 
> ---
>  Documentation/cgroups/cgroups.txt | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/Documentation/cgroups/cgroups.txt 
> b/Documentation/cgroups/cgroups.txt
> index 821de56..b614f42 100644
> --- a/Documentation/cgroups/cgroups.txt
> +++ b/Documentation/cgroups/cgroups.txt
> @@ -562,7 +562,7 @@ cgroup_for_each_descendant_pre() for details.
>  void css_offline(struct cgroup *cgrp);
>  (cgroup_mutex held by caller)
>  
> -This is the counterpart of css_online() and called iff css_online()
> +This is the counterpart of css_online() and called if css_online()
>  has succeeded on @cgrp. This signifies the beginning of the end of
>  @cgrp. @cgrp is being removed and the subsystem should start dropping
>  all references it's holding on @cgrp. When all references are dropped,
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/5] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-24 Thread Li Zefan
On 2014/6/25 5:01, Tejun Heo wrote:
> Hello, Li.
> 
> On Tue, Jun 24, 2014 at 09:22:00AM +0800, Li Zefan wrote:
>>> Ah, right.  Gees, I'm really hating the fact that we have ->mount but
>>> not ->umount.  However, can't we make it a bit simpler by just
>>> introducing a mutex protecting looking up and refing up an existing
>>> root and a sb going away?  The only problem is that the refcnt being
>>> killed isn't atomic w.r.t. new live ref coming up, right?  Why not
>>> just add a mutex around them so that they can't race?
>>
>> Well, kill_sb() is called with sb->s_umount held, while kernfs_mount()
>> returned with sb->s_umount held, so adding a mutex will lead to ABBA
>> deadlock.
> 
> Hmmm?  Why does that matter?  The only region in cgroup_mount() which
> needs to be put inside such mutex would be root lookup, no?
> 

unfortunately that won't help. I think what you suggest is:

cgroup_mount()
{
mutex_lock();
lookup_cgroup_root();
mutex_unlock();
kernfs_mount();
}

cgroup_kill_sb()
{
mutex_lock();
percpu_ref_kill();
mutex_Unlock();
kernfs_kill_sb();
}

See, we may still be destroying the superblock after we've succeeded
in getting the refcnt of cgroup root.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm/mempolicy: fix sleeping function called from invalid context

2014-06-23 Thread Li Zefan
On 2014/6/21 5:01, Tejun Heo wrote:
> Hello, Li.
> 
> Sorry about the long delay.
> 
> On Tue, Jun 10, 2014 at 10:58:45AM +0800, Li Zefan wrote:
>> Yes, this is a long-standing issue. Besides the race you described, the child
>> task's mems_allowed can be wrong if the cpuset's nodemask changes before the
>> child has been added to the cgroup's tasklist.
>>
>> I remember Tejun once said he wanted to disallow task migration between
>> cgroups during fork, and that should fix this problem.
> 
> I'm having trouble remembering but yeah enforcing stricter behavior
> across fork could be beneficial.  Hmmm... the problem with making
> forks exclusive against migrations is that we'll end up adding more
> locking to the fork path which isn't too nice.
> 
> Hmmm... other controllers (cgroup_freezer) can reliably synchronize
> the child's state to the cgroup it belongs to.  Why can't cpuset?  Is
> there something fundamentally missing in the cgroup API?
> 

cgroup_freezer uses the fork callback. We can also do this for cpuset as
suggested by David, which adds a little bit of overhead to the fork path.

David, care to send out a patch?

>>> It needs to be slightly rewritten to work properly without negatively 
>>> impacting the latency of fork().  Do you have the cycles to do it?
>>>
>>
>> Sounds you have other idea?
> 
> I don't think the suggested patch breaks anything more than it was
> broken before and we should probably apply it for the time being.  Li?
> 

Yeah, we should apply Gu Zheng's patch anyway.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/5] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-23 Thread Li Zefan
On 2014/6/21 3:35, Tejun Heo wrote:
> Hello, Li.
> 
> Sorry about the long delay.
> 
> On Thu, Jun 12, 2014 at 02:33:05PM +0800, Li Zefan wrote:
>> We've converted cgroup to kernfs so cgroup won't be intertwined with
>> vfs objects and locking, but there are dark areas.
>>
>> Run two instances of this script concurrently:
>>
>> for ((; ;))
>> {
>>  mount -t cgroup -o cpuacct xxx /cgroup
>>  umount /cgroup
>> }
>>
>> After a while, I saw two mount processes were stuck at retrying, because
>> they were waiting for a subsystem to become free, but the root associated
>> with this subsystem never got freed.
>>
>> This can happen, if thread A is in the process of killing superblock but
>> hasn't called percpu_ref_kill(), and at this time thread B is mounting
>> the same cgroup root and finds the root in the root list and performs
>> percpu_ref_try_get().
>>
>> To fix this, we increase the refcnt of the superblock instead of increasing
>> the percpu refcnt of cgroup root.
> 
> Ah, right.  Gees, I'm really hating the fact that we have ->mount but
> not ->umount.  However, can't we make it a bit simpler by just
> introducing a mutex protecting looking up and refing up an existing
> root and a sb going away?  The only problem is that the refcnt being
> killed isn't atomic w.r.t. new live ref coming up, right?  Why not
> just add a mutex around them so that they can't race?
> 

Well, kill_sb() is called with sb->s_umount held, while kernfs_mount()
returned with sb->s_umount held, so adding a mutex will lead to ABBA
deadlock.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/5] cgroup: fix mount failure in a corner case

2014-06-23 Thread Li Zefan
On 2014/6/21 3:10, Tejun Heo wrote:
> On Thu, Jun 12, 2014 at 02:32:13PM +0800, Li Zefan wrote:
>> @@ -1677,6 +1679,22 @@ static struct dentry *cgroup_mount(struct 
>> file_system_type *fs_type,
>>  goto out_unlock;
>>  }
>>  
>> +/*
>> + * If some subsystems have been bound to existing cgroup hierarchies,
>> + * but those hierachies are being destroyed, let's wait a little bit
>> + * and retry.
>> + */
>> +for_each_subsys(ss, i) {
>> +if (!(opts.subsys_mask & (1 << i)))
>> +continue;
>> +if (!percpu_ref_alive(&ss->root->cgrp.self.refcnt)) {
> 
> Can't we just do tryget_live() instead and then put before retrying?
> It's not exactly a hot path and the operations are dirt cheap anyway.
> 

Not much difference, though it would be a bit more code. I can do that.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/5] cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()

2014-06-11 Thread Li Zefan
We've converted cgroup to kernfs so cgroup won't be intertwined with
vfs objects and locking, but there are dark areas.

Run two instances of this script concurrently:

for ((; ;))
{
mount -t cgroup -o cpuacct xxx /cgroup
umount /cgroup
}

After a while, I saw two mount processes were stuck at retrying, because
they were waiting for a subsystem to become free, but the root associated
with this subsystem never got freed.

This can happen, if thread A is in the process of killing superblock but
hasn't called percpu_ref_kill(), and at this time thread B is mounting
the same cgroup root and finds the root in the root list and performs
percpu_ref_tryget_live().

To fix this, we increase the refcnt of the superblock instead of increasing
the percpu refcnt of cgroup root.

Signed-off-by: Li Zefan 
---

A better fix is welcome!

---
 kernel/cgroup.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bd37e8d..94e1814 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1654,7 +1654,7 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
struct dentry *dentry;
int ret;
int i;
-   bool new_sb;
+   bool sb_pinned = false;
 
/*
 * The first time anyone tries to mount a cgroup, enable the list
@@ -1735,19 +1735,21 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
}
 
/*
-* A root's lifetime is governed by its root cgroup.
-* tryget_live failure indicate that the root is being
-* destroyed.  Wait for destruction to complete so that the
-* subsystems are free.  We can use wait_queue for the wait
-* but this path is super cold.  Let's just sleep for a bit
-* and retry.
+* This may fail for two reasons:
+* - A concurrent mount is in progress. We wait for that mount
+*   to complete.
+* - The superblock is being destroyed. We wait for the
+*   destruction to complete so that the subsystems are free.
+* We can use wait_queue for the wait but this path is super
+* cold.  Let's just sleep for a bit and retry.
 */
-   if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+   if (!kernfs_pin_sb(root->kf_root, NULL)) {
mutex_unlock(&cgroup_mutex);
msleep(10);
ret = restart_syscall();
goto out_free;
}
+   sb_pinned = true;
 
ret = 0;
goto out_unlock;
@@ -1784,8 +1786,10 @@ out_free:
if (ret)
return ERR_PTR(ret);
 
-   dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
-   if (IS_ERR(dentry) || !new_sb)
+   dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
+   if (sb_pinned)
+   kernfs_drop_sb(root->kf_root, NULL);
+   if (!sb_pinned && IS_ERR(dentry))
cgroup_put(&root->cgrp);
return dentry;
 }
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/5] cgroup: fix mount failure in a corner case

2014-06-11 Thread Li Zefan
  # cat test.sh
  #! /bin/bash

  mount -t cgroup -o cpu xxx /cgroup
  umount /cgroup

  mount -t cgroup -o cpu,cpuacct xxx /cgroup
  umount /cgroup
  # ./test.sh
  mount: xxx already mounted or /cgroup busy
  mount: according to mtab, xxx is already mounted on /cgroup

It's because the cgroupfs_root of the first mount was under destruction
asynchronously.

Fix this by delaying and then retrying mount in this case.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..bd37e8d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
 int flags, const char *unused_dev_name,
 void *data)
 {
+   struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+   int i;
bool new_sb;
 
/*
@@ -1677,6 +1679,22 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
goto out_unlock;
}
 
+   /*
+* If some subsystems have been bound to existing cgroup hierarchies,
+* but those hierarchies are being destroyed, let's wait a little bit
+* and retry.
+*/
+   for_each_subsys(ss, i) {
+   if (!(opts.subsys_mask & (1 << i)))
+   continue;
+   if (!percpu_ref_alive(&ss->root->cgrp.self.refcnt)) {
+   mutex_unlock(&cgroup_mutex);
+   msleep(10);
+   ret = restart_syscall();
+   goto out_free;
+   }
+   }
+
for_each_root(root) {
bool name_match = false;
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/5] kernfs: introduce kernfs_pin_sb() and kernfs_drop_sb()

2014-06-11 Thread Li Zefan

kernfs_pin_sb() tries to get a refcnt of the superblock, while
kernfs_drop_sb() drops this refcnt.

This will be used by cgroupfs.

Signed-off-by: Li Zefan 
---
 fs/kernfs/mount.c  | 45 +
 include/linux/kernfs.h |  3 +++
 2 files changed, 48 insertions(+)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f25a7c0..4f924e0 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -210,6 +210,51 @@ void kernfs_kill_sb(struct super_block *sb)
kernfs_put(root_kn);
 }
 
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations.
+ */
+bool kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+   struct kernfs_super_info *info;
+   int ret = false;
+
+   mutex_lock(&kernfs_mutex);
+   list_for_each_entry(info, &root->supers, node) {
+   if (info->ns == ns) {
+   ret = atomic_inc_not_zero(&info->sb->s_active);
+   break;
+   }
+   }
+   mutex_unlock(&kernfs_mutex);
+   return ret;
+}
+
+/**
+ * kernfs_drop_sb: drop the refcnt that we got by kernfs_pin_sb()
+ * @root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * This must be paired with kernfs_pin_sb(). It will acquire sb->s_umount
+ * if the refcnt reaches zero.
+ */
+void kernfs_drop_sb(struct kernfs_root *root, const void *ns)
+{
+   struct kernfs_super_info *info;
+
+   mutex_lock(&kernfs_mutex);
+   list_for_each_entry(info, &root->supers, node) {
+   if (info->ns == ns)
+   break;
+   }
+   mutex_unlock(&kernfs_mutex);
+   deactivate_super(info->sb);
+}
+
 void __init kernfs_init(void)
 {
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 589318b..1958017 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -288,6 +288,9 @@ struct dentry *kernfs_mount_ns(struct file_system_type 
*fs_type, int flags,
   const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
 
+bool kernfs_pin_sb(struct kernfs_root *root, const void *ns);
+void kernfs_drop_sb(struct kernfs_root *root, const void *ns);
+
 void kernfs_init(void);
 
 #else  /* CONFIG_KERNFS */
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/5] percpu-ref: introduce percpu_ref_alive()

2014-06-11 Thread Li Zefan
This is used to check if the percpu_ref has been killed.

Signed-off-by: Li Zefan 
---
 include/linux/percpu-refcount.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index dba35c4..1d5f2b3 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -96,6 +96,17 @@ static inline void percpu_ref_kill(struct percpu_ref *ref)
 #define REF_STATUS(count)  (((unsigned long) count) & PCPU_STATUS_MASK)
 
 /**
+ * percpu_ref_alive - check if the ref has been killed
+ * @ref: percpu_ref to check
+ *
+ * Return false if percpu_ref_kill() has been called to drop the initial ref.
+ */
+static inline bool percpu_ref_alive(struct percpu_ref *ref)
+{
+   return !(REF_STATUS(ref->pcpu_count) == PCPU_REF_DEAD);
+}
+
+/**
  * percpu_ref_get - increment a percpu refcount
  * @ref: percpu_ref to get
  *
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/5] cgroup: fix broken css_has_online_children()

2014-06-11 Thread Li Zefan
After running:

  # mount -t cgroup -o cpu xxx /cgroup && mkdir /cgroup/sub && \
rmdir /cgroup/sub && umount /cgroup

I found the cgroup root still existed:

  # cat /proc/cgroups
  #subsys_name    hierarchy   num_cgroups enabled
  cpuset  0   1   1
  cpu 1   1   1
  ...

It turned out css_has_online_children() is broken.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 05b8ca4..1c65f24 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3327,7 +3327,7 @@ bool css_has_online_children(struct cgroup_subsys_state 
*css)
 
rcu_read_lock();
css_for_each_child(child, css) {
-   if (css->flags & CSS_ONLINE) {
+   if (child->flags & CSS_ONLINE) {
ret = true;
break;
}
-- 
1.8.0.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm/mempolicy: fix sleeping function called from invalid context

2014-06-09 Thread Li Zefan
On 2014/6/9 17:13, David Rientjes wrote:
> On Mon, 9 Jun 2014, Gu Zheng wrote:
> 
>>> I think your patch addresses the problem that you're reporting but misses 
>>> the larger problem with cpuset.mems rebinding on fork().  When the 
>>> forker's task_struct is duplicated (which includes ->mems_allowed) and it 
>>> races with an update to cpuset_being_rebound in update_tasks_nodemask() 
>>> then the task's mems_allowed doesn't get updated.
>>
>> Yes, you are right, this patch just wants to address the bug reported above.
>> The race condition you mentioned above inherently exists there, but it is yet
>> another issue, the rcu lock here makes no sense to it, and I think we need
>> additional sync-mechanisms if want to fix it.
> 
> Yes, the rcu lock is not providing protection for any critical section 
> here that requires (1) the forker's cpuset to be stored in 
> cpuset_being_rebound or (2) the forked thread's cpuset to be rebound by 
> the cpuset nodemask update, and no race involving the two.
>

Yes, this is a long-standing issue. Besides the race you described, the child
task's mems_allowed can be wrong if the cpuset's nodemask changes before the
child has been added to the cgroup's tasklist.

I remember Tejun once said he wanted to disallow task migration between
cgroups during fork, and that should fix this problem.
 
>> But thinking more, though the current implementation has flaw, but I worry
>> about the negative effect if we really want to fix it. Or maybe the fear
>> is unnecessary.:) 
>>
> 
> It needs to be slightly rewritten to work properly without negatively 
> impacting the latency of fork().  Do you have the cycles to do it?
> 

Sounds like you have another idea?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] cgroup: disallow disabled controllers on the default hierarchy

2014-06-05 Thread Li Zefan
After booting with cgroup_disable=memory, I still saw memcg files
in the default hierarchy, and I can write to them, though it won't
take effect.

  # dmesg
  ...
  Disabling memory control group subsystem
  ...
  # mount -t cgroup -o __DEVEL__sane_behavior xxx /cgroup
  # ls /cgroup
  ...
  memory.failcnt   memory.move_charge_at_immigrate
  memory.force_empty   memory.numa_stat
  memory.limit_in_bytes   memory.oom_control
  ...
  # cat /cgroup/memory.usage_in_bytes
  0

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d5032d2..57b647a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3070,6 +3070,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct 
cftype *cfts)
 {
int ret;
 
+   if (ss->disabled)
+   return 0;
+
if (!cfts || cfts[0].name[0] == '\0')
return 0;
 
@@ -4679,8 +4682,6 @@ static void __init cgroup_init_subsys(struct 
cgroup_subsys *ss, bool early)
 
BUG_ON(online_css(css));
 
-   cgrp_dfl_root.subsys_mask |= 1 << ss->id;
-
mutex_unlock(&cgroup_mutex);
 }
 
@@ -4760,10 +4761,13 @@ int __init cgroup_init(void)
 
/*
 * cftype registration needs kmalloc and can't be done
-* during early_init.  Register base cftypes separately.
+* during early_init, and the disable flag is set after
+* early_init.  Register base cftypes separately.
 */
-   if (ss->base_cftypes)
+   if (!ss->disabled) {
+   cgrp_dfl_root.subsys_mask |= 1 << ss->id;
WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+   }
}
 
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] cgroup: make the default root invisible when it's umounted

2014-06-04 Thread Li Zefan
On 2014/6/5 9:20, Tejun Heo wrote:
> Hello,
> 
> On Wed, Jun 04, 2014 at 04:59:59PM +0800, Li Zefan wrote:
>> The example I gave is the same result if sane_behavior is not specified,
>> so this is a behavioural change for the old interface?
> 
> Hmmm?  Either the userland knows about unified hierarchy or not and
> there's no point in hiding it if we know that userland knows.  It's
> really a system-wide thing which happens once.
> 

Yeah, it's reasonable.

I thought "mount -t cgroup xxx /cgroup" will mount the default hierarchy,
but I was wrong.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] cgroup: make the default root invisible when it's umounted

2014-06-04 Thread Li Zefan
On 2014/6/3 21:01, Tejun Heo wrote:
> On Tue, Jun 03, 2014 at 12:05:22PM +0800, Li Zefan wrote:
>> Before this patch (in a fresh system):
>>
>># cat /proc/$$/cgroup
>># mount -t cgroup -o __DEVEL__sane_behavior xxx /cgroup
>># umount /cgroup
>># cat /proc/$$/cgroup
>>
>> 0:cpuset,cpu,cpuacct,memory,devices,freezer,net_cls,blkio,perf_event,net_prio,hugetlb:/
>>
>> After this patch (in a fresh system):
>>
>># cat ...
>># mount ...
>># umount ...
>># cat /proc/$$/cgroup
>>#
>>
>> You won't see the default root after it's umounted.
> 
> H... I intentionally left it visible tho.  The only reason we gate
> its visibility is avoid disturbing userland which doesn't know about
> and won't use the unified hierarchy.  If the userland starts making
> use of it, there's no reason to hide it again especially as that's
> consistent with how other hierarchies behave too - they keep showing
> up if they have lingering refs.
> 

The example I gave is the same result if sane_behavior is not specified,
so this is a behavioural change for the old interface?

Do we need a fix like this?

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3f46165..6f10cff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1789,6 +1789,9 @@ static void cgroup_kill_sb(struct super_block *sb)
else
percpu_ref_kill(&root->cgrp.self.refcnt);

+   if (root == &cgrp_dfl_root && !cgroup_sane_behavior(&root->cgrp))
+   cgrp_dfl_root_visible = false;
+
kernfs_kill_sb(sb);
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] cgroup: don't destroy the default root

2014-06-04 Thread Li Zefan
The default root is allocated and initialized at boot phase, so we
shouldn't destroy the default root when it's umounted, otherwise
it will lead to disaster.

Just try to mount and then umount the default root, and the kernel will
crash immediately.

v2:
- No need to check for CSS_NO_REF in cgroup_get/put(). (Tejun)
- Better call cgroup_put() for the default root in kill_sb(). (Tejun)
- Add a comment.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5f75ac..3f46165 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1780,8 +1780,11 @@ static void cgroup_kill_sb(struct super_block *sb)
 * If @root doesn't have any mounts or children, start killing it.
 * This prevents new mounts by disabling percpu_ref_tryget_live().
 * cgroup_mount() may wait for @root's release.
+*
+* And don't kill the default root.
 */
-   if (css_has_online_children(&root->cgrp.self))
+   if (css_has_online_children(&root->cgrp.self) ||
+   root == &cgrp_dfl_root)
cgroup_put(&root->cgrp);
else
percpu_ref_kill(&root->cgrp.self.refcnt);
-- 
1.8.0.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] cgroup: don't destroy the default root

2014-06-04 Thread Li Zefan
On 2014/6/3 20:57, Tejun Heo wrote:
> Hello, Li.
> 
> On Tue, Jun 03, 2014 at 12:04:38PM +0800, Li Zefan wrote:
>>  static void cgroup_get(struct cgroup *cgrp)
>>  {
>>  WARN_ON_ONCE(cgroup_is_dead(cgrp));
>> -css_get(&cgrp->self);
>> +if (!(cgrp->self.flags & CSS_NO_REF))
>> +css_get(&cgrp->self);
> 
> Hmmm?  The same condition is tested by css_get().  Why should it be
> tested again here?
> 

Oh, I completely ignored that.

>>  static void cgroup_put(struct cgroup *cgrp)
>>  {
>> -css_put(&cgrp->self);
>> +if (!(cgrp->self.flags & CSS_NO_REF))
>> +css_put(&cgrp->self);
> 
> Ditto.
> 
>> @@ -1781,10 +1783,12 @@ static void cgroup_kill_sb(struct super_block *sb)
>>   * This prevents new mounts by disabling percpu_ref_tryget_live().
>>   * cgroup_mount() may wait for @root's release.
>>   */
>> -if (css_has_online_children(&root->cgrp.self))
>> +if (css_has_online_children(&root->cgrp.self)) {
>>  cgroup_put(&root->cgrp);
>> -else
>> -percpu_ref_kill(&root->cgrp.self.refcnt);
>> +} else {
>> +if (root != &cgrp_dfl_root)
>> +percpu_ref_kill(&root->cgrp.self.refcnt);
>> +}
> 
> As conceptually percpu_ref_kill() just puts the base ref and the
> dfl_root's refcnt never reaches zero, it won't actually trigger.

Yes it will, just try mount && umount.

I think it's because cgroup_get() is a no-op for CSS_NO_REF, so it has
only the base ref, so percpu_ref_kill() will actually schedule the
call to css_release().

> Hmmm wouldn't the above leak a ref each time the default hierarchy
> is unmounted tho?  Shouldn't it be like the following?
> 

cgroup_get() is a no-op for root cgroup of the default root, so there's
no leak, but still better to call cgroup_put().

I'll send an updated patch.

>   if (root == &cgrp_dfl_root || css_has_online_children(...))
>   cgroup_put(&root->cgrp);
>   else
>   percpu_ref_kill(...);
> 
> Thanks.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/3] cgroup: set visible flag only after we've mounted the default root

2014-06-02 Thread Li Zefan
This fixes the failure path, so we won't set the visible flag when
the mount has failed.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dabc486..0b6b44e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1671,7 +1671,6 @@ static struct dentry *cgroup_mount(struct 
file_system_type *fs_type,
 
/* look for a matching existing root */
if (!opts.subsys_mask && !opts.none && !opts.name) {
-   cgrp_dfl_root_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
ret = 0;
@@ -1770,6 +1769,9 @@ out_free:
dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+   else if (root == &cgrp_dfl_root)
+   cgrp_dfl_root_visible = true;
+
return dentry;
 }
 
-- 
1.8.0.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/3] cgroup: make the default root invisible when it's umounted

2014-06-02 Thread Li Zefan
Before this patch (in a fresh system):

   # cat /proc/$$/cgroup
   # mount -t cgroup -o __DEVEL__sane_behavior xxx /cgroup
   # umount /cgroup
   # cat /proc/$$/cgroup
   
0:cpuset,cpu,cpuacct,memory,devices,freezer,net_cls,blkio,perf_event,net_prio,hugetlb:/

After this patch (in a fresh system):

   # cat ...
   # mount ...
   # umount ...
   # cat /proc/$$/cgroup
   #

You won't see the default root after it's umounted.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f73fe48..dabc486 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1788,6 +1788,8 @@ static void cgroup_kill_sb(struct super_block *sb)
} else {
if (root != &cgrp_dfl_root)
percpu_ref_kill(&root->cgrp.self.refcnt);
+   else
+   cgrp_dfl_root_visible = false;
}
 
kernfs_kill_sb(sb);
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/3] cgroup: don't destroy the default root

2014-06-02 Thread Li Zefan
The default root is allocated and initialized at boot, so we
shouldn't destroy the default root when it's umounted, otherwise
it will lead to disaster.

Signed-off-by: Li Zefan 
---
 kernel/cgroup.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5f75ac..f73fe48 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1027,12 +1027,14 @@ static umode_t cgroup_file_mode(const struct cftype 
*cft)
 static void cgroup_get(struct cgroup *cgrp)
 {
WARN_ON_ONCE(cgroup_is_dead(cgrp));
-   css_get(&cgrp->self);
+   if (!(cgrp->self.flags & CSS_NO_REF))
+   css_get(&cgrp->self);
 }
 
 static void cgroup_put(struct cgroup *cgrp)
 {
-   css_put(&cgrp->self);
+   if (!(cgrp->self.flags & CSS_NO_REF))
+   css_put(&cgrp->self);
 }
 
 /**
@@ -1781,10 +1783,12 @@ static void cgroup_kill_sb(struct super_block *sb)
 * This prevents new mounts by disabling percpu_ref_tryget_live().
 * cgroup_mount() may wait for @root's release.
 */
-   if (css_has_online_children(&root->cgrp.self))
+   if (css_has_online_children(&root->cgrp.self)) {
cgroup_put(&root->cgrp);
-   else
-   percpu_ref_kill(&root->cgrp.self.refcnt);
+   } else {
+   if (root != &cgrp_dfl_root)
+   percpu_ref_kill(&root->cgrp.self.refcnt);
+   }
 
kernfs_kill_sb(sb);
 }
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 3.15 regression: wrong cgroup magic

2014-06-02 Thread Li Zefan
Cc: Greg
Cc: Jianyu Zhan

On 2014/6/3 8:56, Andy Lutomirski wrote:
> Sorry I didn't notice this earlier.  Linux 3.15 breaks my production

But 3.15 hasn't been released. :)

> system :(  The cause appears to be:
> 
> commit 2bd59d48ebfb3df41ee56938946ca0dd30887312
> Author: Tejun Heo 
> Date:   Tue Feb 11 11:52:49 2014 -0500
> 
> cgroup: convert to kernfs
> 
> In particular, this piece:
> 
> -   sb->s_magic = CGROUP_SUPER_MAGIC;
> 
> The result is that cgroup shows up with the wrong magic number, so my
> code goes "oh crap, cgroupfs isn't mounted" and fails.
> 
> I can change my code to hack around this, but I can imagine other
> things getting tripped up.  Is there still time to fix this?
> 

This should be fixed by "kernfs: move the last knowledge of sysfs out from 
kernfs".

It's in driver-core-next.

https://git.kernel.org/cgit/linux/kernel/git/gregkh/driver-core.git/commit/?h=driver-core-next&id=26fc9cd200ec839e0b3095e05ae018f27314e7aa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] MIPS: change type of asid_cache to unsigned long

2014-05-29 Thread Li Zefan
On 2014/5/29 4:09, Aaro Koskinen wrote:
> Hi,
> 
> On Tue, May 27, 2014 at 12:16:30PM +0800, Li Zefan wrote:
>> On 2014/5/21 13:36, Yong Zhang wrote:
>>> asid_cache must be unsigned long otherwise on 64bit system
>>> it will become 0 if the value in get_new_mmu_context()
> >>> reaches 0xffffffff and in the end the assumption of
>>> ASID_FIRST_VERSION is not true anymore thus leads to
>>> more dangerous things.
>>
>> We should describe what problem this bug can lead to, which
>> will help people who encounter the same problem and google it.
> 
> Please describe it, then. Even if the patch is already committed,
> googling would probably still find this e-mail thread.
> 

I don't think Ralf has committed it, so we'll send out a fix
with detailed changelog.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH cgroup/for-3.16] cgroup: clean up MAINTAINERS entries

2014-05-28 Thread Li Zefan
On 2014/5/20 4:33, Tejun Heo wrote:
> On Tue, May 13, 2014 at 03:49:58PM -0400, Tejun Heo wrote:
>> There are currently three cgroup related entries in MAINTAINERS.  Make
>> the following updates.
>>
>> * Make the names - both cgroup and cpuset - singular.  We're mixing
>>   singular and plural all over the place for no good reason.
>>
>> * Drop contain...@lists.linux-foundation.org from CGROUP.  That list
>>   doesn't have much to do with cgroup per-se.
>>
>> * Add Documentation field to CGROUP.
>>
>> * Drop mm/*cgroup* from CGROUP.  memcg has separate maintainers.
>>
>> * Prefix the controller-specific ones with "CONTROL CGROUP -" and
>>   collect cgroup related entries under the core one.
>>
>> * Add (MEMCG) abbreviation to MEMCG entry.
>>
>> * Drop Balbir Singh and KAMEZAWA Hiroyuki from memcg maintainers.  It
>>   has been quite a while since both actually worked on memcg.
>>
>> Signed-off-by: Tejun Heo 
> 
> Applied to cgroup/for-3.16.
> 

Late ack for the cgroup and cpuset parts.

Don't know why I overlooked this and the debug controller patch...

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] page_alloc: skip cpuset enforcement for lower zone allocations (v2)

2014-05-28 Thread Li Zefan
On 2014/5/27 2:53, Marcelo Tosatti wrote:
> 
> Zone specific allocations, such as GFP_DMA32, should not be restricted
> to cpusets allowed node list: the zones which such allocations demand
> might be contained in particular nodes outside the cpuset node list.
> 
> The alternative would be to not perform such allocations from
> applications which are cpuset restricted, which is unrealistic.
> 
> Fixes KVM's alloc_page(gfp_mask=GFP_DMA32) with cpuset as explained.
> 

Could you add the use case that you described in a previous email to
the changelog?

> Signed-off-by: Marcelo Tosatti 
> 
> v2: fix slowpath as well (David Rientjes)
> 
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 3d54c41..b70a336 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -2392,6 +2392,10 @@ int __cpuset_node_allowed_softwall(int node, gfp_t 
> gfp_mask)
>  

Add a comment accordingly?

 *  in_interrupt - any node ok (current task context irrelevant)
 *  GFP_ATOMIC   - any node ok
 *  TIF_MEMDIE   - any node ok
 *  GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *  GFP_USER - only nodes in current tasks mems allowed ok.

>   if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
>   return 1;
> +#ifdef CONFIG_NUMA
> + if (gfp_zone(gfp_mask) < policy_zone)
> + return 1;
> +#endif
>   might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
>   if (node_isset(node, current->mems_allowed))
>   return 1;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5dba293..dfea3dc 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2698,6 +2698,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int 
> order,
>   unsigned int cpuset_mems_cookie;
>   int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
>   struct mem_cgroup *memcg = NULL;
> + nodemask_t *cpuset_mems_allowed = &cpuset_current_mems_allowed;
>  
>   gfp_mask &= gfp_allowed_mask;
>  
> @@ -2726,9 +2727,14 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int 
> order,
>  retry_cpuset:
>   cpuset_mems_cookie = read_mems_allowed_begin();
>  
> +#ifdef CONFIG_NUMA
> + if (gfp_zone(gfp_mask) < policy_zone)
> + cpuset_mems_allowed = NULL;
> +#endif
> +
>   /* The preferred zone is used for statistics later */
>   first_zones_zonelist(zonelist, high_zoneidx,
> - nodemask ? : &cpuset_current_mems_allowed,
> + nodemask ? : cpuset_mems_allowed,
>   &preferred_zone);
>   if (!preferred_zone)
>   goto out;
> .
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] MIPS: change type of asid_cache to unsigned long

2014-05-26 Thread Li Zefan
On 2014/5/27 13:23, Yong Zhang wrote:
> On Tue, May 27, 2014 at 01:07:20PM +0800, Li Zefan wrote:
>> On 2014/5/27 12:50, Yong Zhang wrote:
> >>> BTW, I really don't care who credits the patch and Ralf said that
> >>> he will apply the one which moves the place of udelay_val.
>>>
>>> Anyway, if your company pays you more money if you contribute to
>>> the community, just take it and talk about it with Ralf ;-)
>>>
>>
> >> We don't do contribution for money, and I don't think you do,
> >> but crediting properly is one of the reasons that our kernel
> >> community has stayed prosperous for so many years, and that's one
> >> of the reasons we introduced Reported-by and Tested-by tags.
> 
> I'll reply this email for the last time.
> 
> To me your action is just like Reported-by, but I admit that
> you also do analysis. If you don't the way change it to whatever
> you want.
> 

Sorry if I sounded offensive. I want Li Bin to get the credit,
because he's supposed to, and I want him to be encouraged in
contributing to the mainline kernel.

The decision is on Ralf, whether to accept your patch or let
us send our fix with detailed changelog.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] MIPS: change type of asid_cache to unsigned long

2014-05-26 Thread Li Zefan
On 2014/5/27 12:50, Yong Zhang wrote:
> BTW, I really don't care who credits the patch and Ralf said that
> he will apply the one which moves the place of udelay_val.
> 
> Anyway, if your company pays you more money if you contribute to
> the community, just take it and talk about it with Ralf ;-)
> 

We don't do contribution for money, and I don't think you do,
but crediting properly is one of the reasons that our kernel
community has stayed prosperous for so many years, and that's one
of the reasons we introduced Reported-by and Tested-by tags.

> Thanks,
> Yong

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] MIPS: change type of asid_cache to unsigned long

2014-05-26 Thread Li Zefan
On 2014/5/27 12:34, Yong Zhang wrote:
> On Tue, May 27, 2014 at 12:16:30PM +0800, Li Zefan wrote:
> >> I'm not quite happy about what happened here. There's a story behind
>> this patch.
>>
>> One of our Huawei product encountered a bug, and they're using WindRiver4,
>> so the kernel is 2.6.34.
>>
> >> Because they bought your licence, they asked for your help, but
>> you were reluctant on this issue, and the problem remained there
>> for about one month.
>>
>> At last they turned to us for help. We're the kernel department in
>> Huawei, but maintaining this product kernel isn't our job. Still
>> Li Bin devoted his time to analyzing this bug, and he did a great
>> job.
>>
>> Li Bin told the product team what was wrong and was about to send
>> a fix for upstream kernel.
> 
> You have time to do that but you didn't.
> 

Hah yeah, we do have time. We spent lots of time analyzing the bug,
and we were taking our time to write a good changelog. As I've pointed
out, your changelog isn't informative.

>> They told you our analysis for further
>> confirmation,
> 
> So you realy didn't make the patch, right? Because you are not
> sure the right fix.
> 

We're confident about our analysis and we know how to fix it.

It's the product team that wasn't sure about this, and they weren't
able to contact Li Bin for confirmation at that time, so they
asked you.

>> and you were so reluctant to help but so quick to
>> send the fix.
> 
> We have responsed to you.
> 

You responded to us but you did nothing to help, that's why the
product team found us.

>>
>> Li Bin never reported this bug, but he fixed it. It's a shame that
>> you took the credit from us.
> 
> I just saw a bug report and analysis. And I agreed and confirmed it's
> a bug.
> 

And that's our work and our credit, and I don't think you're going
to deny it.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] MIPS: change type of asid_cache to unsigned long

2014-05-26 Thread Li Zefan
I'm not quite happy about what happened here. There's a story behind
this patch.

One of our Huawei product encountered a bug, and they're using WindRiver4,
so the kernel is 2.6.34.

Because they bought your licence, they asked for your help, but
you were reluctant on this issue, and the problem remained there
for about one month.

At last they turned to us for help. We're the kernel department in
Huawei, but maintaining this product kernel isn't our job. Still
Li Bin devoted his time to analyzing this bug, and he did a great
job.

Li Bin told the product team what was wrong and was about to send
a fix for upstream kernel. They told you our analysis for further
confirmation, and you were so reluctant to help but so quick to
send the fix.

Li Bin never reported this bug, but he fixed it. It's a shame that
you took the credit from us.

On 2014/5/21 13:36, Yong Zhang wrote:
> asid_cache must be unsigned long otherwise on 64bit system
> it will become 0 if the value in get_new_mmu_context()
> reaches 0xffffffff and in the end the assumption of
> ASID_FIRST_VERSION is not true anymore thus leads to
> more dangerous things.
> 

We should describe what problem this bug can lead to, which
will help people who encounter the same problem and google it.

> Reported-by: libin 
> Signed-off-by: Yong Zhang 

Should mark the patch for stable trees. Though 2.6.34 is EOL,
the fix should be backported to other kernels.

> ---
> 
> V2<-V1: Add the reporter.
> 
>  arch/mips/include/asm/cpu-info.h |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/mips/include/asm/cpu-info.h 
> b/arch/mips/include/asm/cpu-info.h
> index f6299be..ebcc2ed 100644
> --- a/arch/mips/include/asm/cpu-info.h
> +++ b/arch/mips/include/asm/cpu-info.h
> @@ -40,7 +40,7 @@ struct cache_desc {
>  
>  struct cpuinfo_mips {
>   unsigned intudelay_val;
> - unsigned intasid_cache;
> + unsigned long   asid_cache;
>  
>   /*
>* Capability and feature descriptor structure for MIPS CPU
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.16] cgroup: iterate cgroup_subsys_states directly

2014-05-15 Thread Li Zefan
On 2014/5/10 5:31, Tejun Heo wrote:
> Hello,
> 
> Currently, while csses (cgroup_subsys_states) have ->parent linkage
> too, only cgroups form full tree through their ->children and
> ->sibling fields and css iterations naturally is implemented by
> iterating cgroups and then dereferencing the css for the specified
> subsystem.
> 
> There are now use cases where controllers need to iterate through
> csses regardless of their online state as long as they have positive
> reference.  This can't easily be achieved by iterating cgroups because
> its css pointer array needs to be cleared on offline and there may be
> multiple dying csses for a cgroup for the same subsystem and there's
> only one pointer per cgroup-subsystem pair.
> 
> This patchset moves ->children and ->sibling from cgroup to css and
> link all csses in proper trees and then make css iterators walk csses
> directly instead of going through cgroups.  This achieves iteration of
> all non-released csses while also simplifying the iteration
> implementation.  This is also in line with the general direction of
> using csses as the primary structural component.
> 
> This patchset contains the following fourteen patches.
> 
>  0001-cgroup-remove-css_parent.patch
>  0002-cgroup-remove-pointless-has-tasks-children-test-from.patch
>  0003-memcg-update-memcg_has_children-to-use-css_next_chil.patch
>  0004-device_cgroup-remove-direct-access-to-cgroup-childre.patch
>  0005-cgroup-remove-cgroup-parent.patch
>  0006-cgroup-move-cgroup-sibling-and-children-into-cgroup_.patch
>  0007-cgroup-link-all-cgroup_subsys_states-in-their-siblin.patch
>  0008-cgroup-move-cgroup-serial_nr-into-cgroup_subsys_stat.patch
>  0009-cgroup-introduce-CSS_RELEASED-and-reduce-css-iterati.patch
>  0010-cgroup-iterate-cgroup_subsys_states-directly.patch
>  0011-cgroup-use-CSS_ONLINE-instead-of-CGRP_DEAD.patch
>  0012-cgroup-convert-cgroup_has_live_children-into-css_has.patch
>  0013-device_cgroup-use-css_has_online_children-instead-of.patch
>  0014-cgroup-implement-css_tryget.patch
> 
> 0001-0004 are prep patches.
> 
> 0005-0008 move fields from cgroup to css and link csses in tree
> structure instead of cgroups.
> 
> 0009-0010 implement direct css iteration.
> 
> 0011-0013 convert a cgroup based interface to a css one, which is now
> possible as both are the same in terms of the tree structure, and fix
> devcg breakage using it.
> 
> 0014 implements css_tryget() which is to be used to gain access to
> offline but not-yet-released csses.
> 
> This patchset is on top of
> 
>  b9a63d0116e8 ("Merge branch 'for-3.16' of 
> git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu into for-3.16")
>  + [1] [PATCHSET v2 cgroup/for-3.16] cgroup: post unified hierarchy fixes and 
> updates
>  + [2] (REFRESHED) [PATCHSET cgroup/for-3.16] cgroup: implement 
> cftype->write()
>  + [3] (REFRESHED) [PATCHSET cgroup/for-3.16] cgroup: remove cgroup_tree_mutex
>  + [4] [PATCHSET cgroup/for-3.16] cgroup: use css->refcnt for cgroup 
> reference counting
> 
> and available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-direct-css-iteration
> 
> diffstat follows.  Thanks.
> 
>  block/blk-cgroup.h   |2 
>  include/linux/cgroup.h   |  122 +++-
>  kernel/cgroup.c  |  257 
> ---
>  kernel/cgroup_freezer.c  |2 
>  kernel/cpuset.c  |2 
>  kernel/sched/core.c  |2 
>  kernel/sched/cpuacct.c   |2 
>  mm/hugetlb_cgroup.c  |2 
>  mm/memcontrol.c  |   45 +++
>  net/core/netclassid_cgroup.c |2 
>  net/core/netprio_cgroup.c|2 
>  security/device_cgroup.c |   17 --
>  12 files changed, 251 insertions(+), 206 deletions(-)
> 

Acked-by: Li Zefan 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.16] cgroup: iterate cgroup_subsys_states directly

2014-05-15 Thread Li Zefan
On 2014/5/14 21:07, Tejun Heo wrote:
> Hello, Li.
> 
> On Wed, May 14, 2014 at 12:21:25PM +0800, Li Zefan wrote:
>>> There are now use cases where controllers need to iterate through
>>> csses regardless of their online state as long as they have positive
>>
>> What use cases are we talking about here?
> 
> memcg wants to be able to iterate all csses whose refcnts haven't
> reached zero yet so that it can treat offline csses the same way as
> online ones in terms of memory reclaim.  They don't contain new tasks
> so new charges won't be created but offlining won't try to transfer
> all charges to the parent but just leave the offline child attached
> until all charges are eventually reclaimed from the pressure from the
> parent.
> 
> I'm not too familiar with the details but this makes sense in generic
> sense too.  Offline marks an object starting its draining phase and
> release marks the actual destruction point.  For controllers with
> persistent states like memcg, it's a lot more natural to deal offlined
> csses as "active but draining following the usual hierarchical
> operation" rather than trying to explicitly update the states from
> offline to move them to the parent especially as the effort there
> essentially is a waste as most of those moved charges aren't gonna be
> used in the parent and will be released eventually.
> 
> Guaranteeing iteration of offline but not-released csses allow
> controllers to treat the draining stage between offline and release
> more or less identically to online state which in turn can make
> ->css_offline() significantly simpler and lighter.
> 

yeah, fair enough.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.16] cgroup: iterate cgroup_subsys_states directly

2014-05-13 Thread Li Zefan
Hi Tejun,

On 2014/5/10 5:31, Tejun Heo wrote:
> Hello,
> 
> Currently, while csses (cgroup_subsys_states) have ->parent linkage
> too, only cgroups form full tree through their ->children and
> ->sibling fields and css iterations naturally is implemented by
> iterating cgroups and then dereferencing the css for the specified
> subsystem.
> 
> There are now use cases where controllers need to iterate through
> csses regardless of their online state as long as they have positive

What use cases are we talking about here?

> reference.  This can't easily be achieved by iterating cgroups because
> its css pointer array needs to be cleared on offline and there may be
> multiple dying csses for a cgroup for the same subsystem and there's
> only one pointer per cgroup-subsystem pair.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.16] cgroup: use css->refcnt for cgroup reference counting

2014-05-13 Thread Li Zefan
On 2014/5/10 5:13, Tejun Heo wrote:
> Hello,
> 
> Currently, cgroup and css (cgroup_subsys_state) are separately
> reference counted.  cgroup->refcnt is an atomic_t and css->refcnt is a
> percpu_ref.  css is becoming the primary structural block and used
> widely in various operations.  cgroup already has a css embedded in it
> (cgroup->dummy_css) to use as its proxy in such operations; however,
> there still are quite a few differences between cgroup and css
> handling limiting how cgroup->dummy_css can be used.
> 
> This patchset makes cgroup use the embedded css's refcnt for reference
> counting.  This closes one of the gaps between an cgroup embedded css,
> which is renamed to cgroup->self early in the patchset, and a normal
> css and will allow more unified handling of cgroups and csses.  In
> addition, this makes cgroup refcnting use percpu_ref too, which is a
> lot more scalable than an atomic_t.
> 
> Ultimately, it'd make things far simpler to assign a proper
> cgroup_subsys to the cgroup embedded csses and handle them the same as
> other csses; however, we can't yet do it thanks to multiple
> hierarchies as we end up with multiple csses of the same subsystem for
> the same task, but in the very long term, if multiple hierarchies can
> be removed, that's where it's headed.
> 
> This patchset contains the following nine patches.
> 
>  0001-cgroup-use-restart_syscall-for-mount-retries.patch
>  0002-cgroup-rename-cgroup-dummy_css-to-self-and-move-it-t.patch
>  0003-cgroup-separate-out-cgroup_has_live_children-from-cg.patch
>  0004-cgroup-move-check_for_release-parent-call-to-the-end.patch
>  0005-cgroup-move-cgroup-sibling-unlinking-to-cgroup_put.patch
>  0006-cgroup-remove-cgroup_destory_css_killed.patch
>  0007-cgroup-bounce-css-release-through-css-destroy_work.patch
>  0008-cgroup-enable-refcnting-for-root-csses.patch
>  0009-cgroup-use-cgroup-self.refcnt-for-cgroup-refcnting.patch
> 
> 0001-0003 are prep patches.
> 
> 0004-0006 remove cgroup_destroy_css_killed().  This brings cgroup's
> destruction path closer to css's so that they can be merged.
> 
> 0007-0009 make cgroup use the embedded css's refcnt.
> 
> This patchset is on top of
> 
>  b9a63d0116e8 ("Merge branch 'for-3.16' of 
> git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu into for-3.16")
>  + [1] [PATCHSET v2 cgroup/for-3.16] cgroup: post unified hierarchy fixes and 
> updates
>  + [2] (REFRESHED) [PATCHSET cgroup/for-3.16] cgroup: implement 
> cftype->write()
>  + [3] (REFRESHED) [PATCHSET cgroup/for-3.16] cgroup: remove cgroup_tree_mutex
> 
> and available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-use-css-ref
> 
> diffstat follows.  Thanks.
> 
>  include/linux/cgroup.h |   25 
>  kernel/cgroup.c|  284 
> ++---
>  2 files changed, 136 insertions(+), 173 deletions(-)
> 

With the memory leak fixed:

Acked-by: Li Zefan 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/9] cgroup: use restart_syscall() for mount retries

2014-05-13 Thread Li Zefan
On 2014/5/10 5:13, Tejun Heo wrote:
> cgroup_mount() uses dumb delay-and-retry logic to wait for cgroup_root
> which is being destroyed.  The retry currently loops inside
> cgroup_mount() proper.  This patch makes it return with
> restart_syscall() instead so that retry travels out to userland
> boundary.
> 
> This slightly simplifies the logic and more importantly makes the
> retry logic behave better when the wait for some reason becomes
> lengthy or infinite by allowing the operation to be suspended or
> terminated from userland.
> 
> Signed-off-by: Tejun Heo 
> ---
>  kernel/cgroup.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 3083c5a..2755f33 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1681,7 +1681,7 @@ static struct dentry *cgroup_mount(struct 
> file_system_type *fs_type,
>   ret = parse_cgroupfs_options(data, &opts);
>   if (ret)
>   goto out_unlock;
> -retry:
> +
>   /* look for a matching existing root */
>   if (!opts.subsys_mask && !opts.none && !opts.name) {
>   cgrp_dfl_root_visible = true;
> @@ -1740,8 +1740,7 @@ retry:
>   if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
>   mutex_unlock(&cgroup_mutex);
>   msleep(10);
> - mutex_lock(&cgroup_mutex);
> - goto retry;
> + return ERR_PTR(restart_syscall());

We leak memory allocated in parse_cgroupfs_options().

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.16] cgroup: remove cgroup_tree_mutex

2014-05-13 Thread Li Zefan
On 2014/5/7 4:19, Tejun Heo wrote:
> Hello,
> 
> cgroup_tree_mutex was introduced during kernfs conversion to work
> around the cyclic locking dependency between kernfs active protection
> and cgroup_mutex.  Some file and directory operations need to acquire
> cgroup_mutex which puts the mutex under the kernfs active protection;
> however, cgroup also needs to access the hierarchy and the registered
> cftypes to determine which files to remove, which obviously can't be
> done while holding cgroup_mutex anymore.
> 
> cgroup_tree_mutex nests above both cgroup_mutex and kernfs active
> protection and protects hierarchy and cftypes so that those file
> operations can be performed while holding it without cgroup_mutex.
> This works but is kinda cumbersome as most places end up taking both
> cgroup_tree_mutex and cgroup_mutex and there's on-going friction on
> what needs to be protected by which combination.
> 
> Furthermore, due to new requirements from subtree_control
> implementations, kernfs ended up growing full-blown mechanism to
> bypass active protection instead of just supporting self-removal and
> cgroup ended up using both mechanisms - two layered mutexes and active
> protection bypass - on different areas, which is totally unnecessary.
> 
> This patchset converts everything over to kernfs active protection
> bypass and drops cgroup_tree_mutex making cgroup locking noticeably
> simpler.  It contains the following eight patches.
> 
>  0001-cgroup-reorganize-cgroup_create.patch
>  0002-cgroup-collapse-cgroup_create-into-croup_mkdir.patch
>  0003-cgroup-grab-cgroup_mutex-earlier-in-cgroup_subtree_c.patch
>  0004-cgroup-move-cgroup-kn-priv-clearing-to-cgroup_rmdir.patch
>  0005-cgroup-factor-out-cgroup_kn_lock_live-and-cgroup_kn_.patch
>  0006-cgroup-use-cgroup_kn_lock_live-in-other-cgroup-kernf.patch
>  0007-cgroup-nest-kernfs-active-protection-under-cgroup_mu.patch
>  0008-cgroup-remove-cgroup_tree_mutex.patch
> 
> 0001-0004 reorganize various kernfs handling paths so that they are
> more uniform in terms of active protection handling.
> 
> 0005 factors out two locking helpers - cgroup_kn_lock_live() and
> cgroup_kn_unlock() - which handle both kernfs active protection bypass
> and locking.
> 
> 0006 applies it to other kernfs method implementations which were
> grabbing cgroup_mutex under active protection.
> 
> 0007 reverses the locking dependency between cgroup_mutex and kernfs
> active protection so that the latter nests under the former, making
> cgroup_mutex equivalent to cgroup_tree_mutex.
> 
> 0008 removes cgroup_tree_mutex.
> 
> This patchset is on top of
> 
>   cgroup/for-3.16 12d3089c192c ("kernel/cpuset.c: convert printk to pr_foo()")
> + [1] [PATCHSET cgroup/for-3.16] cgroup: post unified hierarchy fixes and 
> updates
> + [2] [PATCHSET cgroup/for-3.16] cgroup: implement cftype->write()
> 
> and is available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-kill-tree_mutex
> 
> diffstat follows.  Thanks.
> 
>  kernel/cgroup.c |  385 
> +++-
>  1 file changed, 163 insertions(+), 222 deletions(-)
> 

Acked-by: Li Zefan 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET v2 cgroup/for-3.16] cgroup: post unified hierarchy fixes and updates

2014-05-12 Thread Li Zefan
On 2014/5/10 3:32, Tejun Heo wrote:
> Hello,
> 
> Changes from the last take[L] are,
> 
> * 0002, 0003 and 0007 added.
> 
> * Other patches are refreshed without content change.
> 
> This patchset contains the following seven patches.
> 
>  0001-cgroup-fix-offlining-child-waiting-in-cgroup_subtree.patch
>  0002-cgroup-cgroup_idr_lock-should-be-bh.patch
>  0003-cgroup-css_release-shouldn-t-clear-cgroup-subsys.patch
>  0004-cgroup-update-and-fix-parsing-of-cgroup.subtree_cont.patch
>  0005-cgroup-use-restart_syscall-for-retries-after-offline.patch
>  0006-cgroup-use-release_agent_path_lock-in-cgroup_release.patch
>  0007-cgroup-rename-css_tryget-to-css_tryget_online.patch
> 
> 0001 fixes two bugs in cgroup_subtree_control_write().
> 
> 0004 fixes and makes subtree_control parsing stricter.
> 
> 0005 simplifies cgroup_subtree_control_write() retry path by using
> restart_syscall().
> 
> 0006 makes cgroup_release_agent_show() use release_path_lock.  The
> original conversion missed this one.
> 
> 0007 renames css_tryget() to css_tryget_online().  This patch was
> posted separately before - [1] - and acked by Michal and Johannes.
> 
> This patchset is on top of cgroup/for-3.16 6e1a046e9458 ("Merge branch
> 'for-3.16' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
> into for-3.16") and available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-post-unified-updates-v2
> 
> diffstat follows.
> 
>  block/blk-cgroup.c |2 -
>  fs/bio.c   |2 -
>  include/linux/cgroup.h |   14 +++
>  kernel/cgroup.c|   88 
> +
>  kernel/cpuset.c|6 +--
>  kernel/events/core.c   |3 +
>  mm/hugetlb_cgroup.c|2 -
>  mm/memcontrol.c|   46 +
>  8 files changed, 84 insertions(+), 79 deletions(-)
> 

Acked-by: Li Zefan 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.16] cgroup: implement cftype->write()

2014-05-12 Thread Li Zefan
On 2014/5/6 20:44, Tejun Heo wrote:
> Hello,
> 
> This patchset implements a new cftype operation ->write() and replaces
> ->write_string() and ->trigger() with it.  ->write() is similar to
> ->write_string() but maps directly to the kernfs write operation and
> has access to all available context information including the
> associated kernfs_open_file, which will make things like implementing
> kernfs active protection manipulation easier.
> 
> ->write_string() becomes redundant and ->trigger() has always been
> spurious.  This patch converts all users of the two methods to
> ->write() and removes them.
> 
> This patchset contains the following five patches.
> 
>  0001-cgroup-implement-cftype-write.patch
>  0002-cgroup-replace-cftype-write_string-with-cftype-write.patch
>  0003-cgroup-replace-cftype-trigger-with-cftype-write.patch
>  0004-cgroup-convert-tasks-and-cgroup.procs-handle-to-use-.patch
>  0005-cgroup-remove-cgroup-control_kn.patch
> 
> 0001 implements cftype->write().
> 
> 0002-0003 replace cftype->write_string() and ->trigger() with it.
> 
> 0004 converts "tasks" and "cgroup.procs" handlers too.
> 
> 0005 removes cgroup->control_kn as the kernfs_node is now directly
> accessible from kernfs_open_file.
> 
> This patchset is on top of
> 
>   cgroup/for-3.16 12d3089c192c ("kernel/cpuset.c: convert printk to pr_foo()")
> + [1] [PATCHSET cgroup/for-3.16] cgroup: post unified hierarchy fixes and 
> updates
> 
> and available on the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
> review-post-unified-updates
> 
> diffstat follows.
> 
>  block/blk-throttle.c  |   32 -
>  block/cfq-iosched.c   |   28 +++
>  include/linux/cgroup.h|   40 +++--
>  kernel/cgroup.c   |   85 
> +++---
>  kernel/cgroup_freezer.c   |   20 --
>  kernel/cpuset.c   |   16 
>  mm/hugetlb_cgroup.c   |   33 +
>  mm/memcontrol.c   |   80 +++
>  net/ipv4/tcp_memcontrol.c |   31 +---
>  security/device_cgroup.c  |   14 +++
>  10 files changed, 197 insertions(+), 182 deletions(-)
> 

Acked-by: Li Zefan 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/7] cgroup: use restart_syscall() for retries after offline waits in cgroup_subtree_control_write()

2014-05-12 Thread Li Zefan
Hi Tejun,

On 2014/5/10 3:32, Tejun Heo wrote:
> After waiting for a child to finish offline,
> cgroup_subtree_control_write() jumps up to retry from after the input
> parsing and active protection breaking.  This retry makes the
> scheduled locking update more difficult.

Could you explain this sentence more specifically? I don't understand what
"scheduled locking update" means.

>  Let's simplify it by
> returning with restart_syscall() for retries.
> 
> Signed-off-by: Tejun Heo 
> ---
>  kernel/cgroup.c | 18 +-
>  1 file changed, 9 insertions(+), 9 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 4/7] cgroup: update and fix parsing of "cgroup.subtree_control"

2014-05-12 Thread Li Zefan
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 35daf89..b81e7c0 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -2542,11 +2542,13 @@ static int cgroup_subtree_control_write(struct 
> cgroup_subsys_state *dummy_css,
>   int ssid, ret;
>  
>   /*
> -  * Parse input - white space separated list of subsystem names
> -  * prefixed with either + or -.
> +  * Parse input - space separated list of subsystem names prefixed
> +  * with either + or -.
>*/
>   p = buffer;
> - while ((tok = strsep(&p, " \t\n"))) {
> + while ((tok = strsep(&p, " "))) {
> + if (tok[0] =='\0')

if (tok[0] == '\0')

> + continue;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:sched/core] sched/deadline: Fix memory leak

2014-05-08 Thread tip-bot for Li Zefan
Commit-ID:  6a7cd273dc4bc3246f37ebe874754a54ccb29141
Gitweb: http://git.kernel.org/tip/6a7cd273dc4bc3246f37ebe874754a54ccb29141
Author: Li Zefan 
AuthorDate: Thu, 17 Apr 2014 10:05:02 +0800
Committer:  Ingo Molnar 
CommitDate: Wed, 7 May 2014 11:51:32 +0200

sched/deadline: Fix memory leak

Free cpudl->free_cpus allocated in cpudl_init().

Signed-off-by: Li Zefan 
Acked-by: Juri Lelli 
Signed-off-by: Peter Zijlstra 
Cc:  # 3.14+
Link: http://lkml.kernel.org/r/534f36ce.2000...@huawei.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/cpudeadline.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42..ab001b5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -210,7 +210,5 @@ int cpudl_init(struct cpudl *cp)
  */
 void cpudl_cleanup(struct cpudl *cp)
 {
-   /*
-* nothing to do for the moment
-*/
+   free_cpumask_var(cp->free_cpus);
 }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] kernel/cpuset.c: convert printk to pr_foo()

2014-05-05 Thread Li Zefan
On 2014/5/6 1:49, Fabian Frederick wrote:
> Cc: Li Zefan 
> Cc: Andrew Morton 
> Signed-off-by: Fabian Frederick 

Acked-by: Li Zefan 

> ---
>  kernel/cpuset.c | 11 ---
>  1 file changed, 4 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 1d8c047..7c0e8da 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -696,11 +696,8 @@ restart:
>   if (nslot == ndoms) {
>   static int warnings = 10;
>   if (warnings) {
> - printk(KERN_WARNING
> -  "rebuild_sched_domains confused:"
> -   " nslot %d, ndoms %d, csn %d, i %d,"
> -   " apn %d\n",
> -   nslot, ndoms, csn, i, apn);
> + pr_warn("rebuild_sched_domains confused: nslot 
> %d, ndoms %d, csn %d, i %d, apn %d\n",
> + nslot, ndoms, csn, i, apn);
>   warnings--;
>   }
>   continue;
> @@ -2018,7 +2015,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset 
> *cs)
>   parent = parent_cs(parent);
>  
>   if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
> - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty 
> cpuset ");
> + pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
>   pr_cont_cgroup_name(cs->css.cgroup);
>   pr_cont("\n");
>   }
> @@ -2555,7 +2552,7 @@ void cpuset_print_task_mems_allowed(struct task_struct 
> *tsk)
>   cgrp = task_cs(tsk)->css.cgroup;
>   nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
>  tsk->mems_allowed);
> - printk(KERN_INFO "%s cpuset=", tsk->comm);
> + pr_info("%s cpuset=", tsk->comm);
>   pr_cont_cgroup_name(cgrp);
>   pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
>  
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] kernel/cpuset.c: kernel-doc fixes

2014-05-05 Thread Li Zefan
On 2014/5/6 1:46, Fabian Frederick wrote:
> This patch also converts seq_printf to seq_puts
> 
> Cc: Li Zefan 
> Cc: Andrew Morton 
> Signed-off-by: Fabian Frederick 

Acked-by: Li Zefan 

> ---
>  kernel/cpuset.c | 11 ++-
>  1 file changed, 6 insertions(+), 5 deletions(-)
...
>  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
>  {
> - seq_printf(m, "Mems_allowed:\t");
> + seq_puts(m, "Mems_allowed:\t");
>   seq_nodemask(m, &task->mems_allowed);
> - seq_printf(m, "\n");
> - seq_printf(m, "Mems_allowed_list:\t");
> + seq_puts(m, "\n");

will seq_putc() produce slightly smaller code?

> + seq_puts(m, "Mems_allowed_list:\t");
>   seq_nodemask_list(m, &task->mems_allowed);
> - seq_printf(m, "\n");
> + seq_puts(m, "\n");
>  }
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET cgroup/for-3.16] cgroup: implement css->id

2014-05-03 Thread Li Zefan
On 2014/4/25 5:02, Tejun Heo wrote:
> Hello,
> 
> Until now, cgroup->id has been used to identify all the associated
> csses and css_from_id() takes cgroup ID and returns the matching css
> by looking up the cgroup and then dereferencing the css associated
> with it; however, now that the lifetimes of cgroup and css are
> separate, this is incorrect and breaks on the unified hierarchy when a
> controller is disabled and enabled back again before the previous
> instance is released.
> 
> This series adds css->id which is a subsystem-unique ID and converts
> css_from_id() to look up by the new css->id instead.  memcg is the
> only user of css_from_id() and also converted to use css->id instead.
> 
>  0001-cgroup-make-flags-and-subsys_masks-unsigned-int.patch
>  0002-cgroup-memcg-allocate-cgroup-ID-from-1.patch
>  0003-cgroup-protect-cgroup_root-cgroup_idr-with-a-spinloc.patch
>  0004-cgroup-use-RCU-free-in-create_css-failure-path.patch
>  0005-cgroup-update-init_css-into-init_and_link_css.patch
>  0006-cgroup-memcg-implement-css-id-and-convert-css_from_i.patch
> 
> 0001-0003 are related prep/cleanups.
> 
> 0004-0006 add css->id and convert css_from_id() and its user to it.
> 
> This patchset is on top of
> 
>   cgroup/for-3.16 f8f22e53a262 ("cgroup: implement dynamic subtree controller 
> enable/disable on the default hierarchy")
> + [1] [PATCHSET cgroup/for-3.16] cgroup: implement cgroup.populated
> + [2] Misc comment / warning cleanups
> 
> and available in the following git branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git review-css_id
> 
> diffstat follows.
> 
>  include/linux/cgroup.h |   21 --
>  kernel/cgroup.c|  164 
> ++++++++-
>  mm/memcontrol.c|   10 --
>  3 files changed, 126 insertions(+), 69 deletions(-)
> 

Acked-by: Li Zefan 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 6/6] cgroup, memcg: implement css->id and convert css_from_id() to use it

2014-05-03 Thread Li Zefan
(Just came back from a short vacation)

On 2014/5/1 23:46, Tejun Heo wrote:
> On Mon, Apr 28, 2014 at 11:33:16AM +0800, Li Zefan wrote:
>> On 2014/4/25 5:02, Tejun Heo wrote:
>>> Until now, cgroup->id has been used to identify all the associated
>>> csses and css_from_id() takes cgroup ID and returns the matching css
>>> by looking up the cgroup and then dereferencing the css associated
>>> with it; however, now that the lifetimes of cgroup and css are
>>> separate, this is incorrect and breaks on the unified hierarchy when a
>>> controller is disabled and enabled back again before the previous
>>> instance is released.
>>>
>>> This patch adds css->id which is a subsystem-unique ID and converts
>>> css_from_id() to look up by the new css->id instead.  memcg is the
>>> only user of css_from_id() and also converted to use css->id instead.
>>>
>>
>> netprio_cgroup also needs to be updated.
> 
> Why?  Wouldn't it be more natural to tie that to the associated
> cgroup's ID rather than the specific css instance.  It's different for
> memcg as it involves css refcnts but netprio_cgroup, AFAICS, only uses
> it for cgroup identification anyway.
> 

You're right. I thought it won't work correctly in unified hierarchy
where controllers can be disabled/enabled.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 6/6] cgroup, memcg: implement css->id and convert css_from_id() to use it

2014-04-27 Thread Li Zefan
On 2014/4/25 5:02, Tejun Heo wrote:
> Until now, cgroup->id has been used to identify all the associated
> csses and css_from_id() takes cgroup ID and returns the matching css
> by looking up the cgroup and then dereferencing the css associated
> with it; however, now that the lifetimes of cgroup and css are
> separate, this is incorrect and breaks on the unified hierarchy when a
> controller is disabled and enabled back again before the previous
> instance is released.
> 
> This patch adds css->id which is a subsystem-unique ID and converts
> css_from_id() to look up by the new css->id instead.  memcg is the
> only user of css_from_id() and also converted to use css->id instead.
> 

netprio_cgroup also needs to be updated.

> For traditional hierarchies, this shouldn't make any functional
> difference.
> 
> Signed-off-by: Tejun Heo 
> Cc: Johannes Weiner 
> Cc: Michal Hocko 
> Cc: Jianyu Zhan 
> ---
>  include/linux/cgroup.h |  9 
>  kernel/cgroup.c| 59 
> --
>  mm/memcontrol.c|  4 ++--
>  3 files changed, 49 insertions(+), 23 deletions(-)
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] perf-event/cgroup: explicitly init the early_init field

2014-04-22 Thread Li Zefan
On 2014/4/22 15:12, Jianyu Zhan wrote:
> On Tue, Apr 22, 2014 at 2:06 PM, Ingo Molnar  wrote:
>> How can that field ever be nonzero?
>>
>> I.e. under what exact circumstances does this patch make sense?
> 
> Hi, Ingo,
> 
> More explanation.
> 
> Sure, for this global variable struct, if not initialized, all its
> fields will be initialized
> to 0 or null (depending on its type).  The point here is not to deprive
> the rights of compiler/linker of doing this initialization, it is mainly for
> documentation reason. Actually this field's value would affect how ->css_alloc
> should be implemented.
> 
> Concretely, if early_init is nonzero, then ->css_alloc *must not* call 
> kzalloc,
> because in cgroup implementation, ->css_alloc will be called earlier before
> mm_init().
> 
> I don't think it is a good thing that the value of one field (early_init)
> places such a subtle restriction on another field (css_alloc),
> but since
> it is there,
> documenting it should be needed.
> I could resend the patch with  more comment.
> 

nack

As I said in another mail thread, this change makes no sense.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] hugetlb_cgroup: explicitly init the early_init field

2014-04-22 Thread Li Zefan
On 2014/4/22 15:01, Jianyu Zhan wrote:
> Hi, hillf,
> 
> On Tue, Apr 22, 2014 at 2:47 PM, Hillf Danton  wrote:
>> But other fields still missed, if any. Fair?
> 
> yep, it is not fair.
> 
> Sure, for this global variable struct, if not initialized, all its
> fields will be initialized
> to 0 or null (depending on its type).  The point here is not to deprive
> the rights of
> compiler/linker of doing this initialization, it is mainly for
> documentation reason.
> Actually this field's value would affect how ->css_alloc should be implemented.
> 
> Concretely, if early_init is nonzero, then ->css_alloc *must not* call 
> kzalloc,
> because in cgroup implementation, ->css_alloc will be called earlier before
> mm_init().
> 
> I don't think it is a good thing that the value of one field (early_init)
> places such a subtle restriction on
> another field (css_alloc), but since it is there,
> documenting it should
> be needed.
> 

I don't see how things can be improved by initializing it to 0 explicitly,
if anything needs to be improved.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] netclassid_cgroup: explicitly init the early_init field

2014-04-21 Thread Li Zefan
On 2014/4/22 13:31, Jianyu Zhan wrote:
> For a cgroup subsystem who should init early, then it should carefully
> take care of the implementation of css_alloc, because it will be called
> before mm_init() setup the world.
> 
> Luckily we don't, and we better explicitly assign the early_init field
> to 0, for document reason.
> 

The initialization is redundant, and your reason is not reasonable.

> Signed-off-by: Jianyu Zhan 

nack

> ---
>  net/core/netclassid_cgroup.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
> index 22931e1..1b07dca 100644
> --- a/net/core/netclassid_cgroup.c
> +++ b/net/core/netclassid_cgroup.c
> @@ -108,4 +108,5 @@ struct cgroup_subsys net_cls_cgrp_subsys = {
>   .css_free   = cgrp_css_free,
>   .attach = cgrp_attach,
>   .base_cftypes   = ss_files,
> + .early_init = 0,
>  };
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cgroup: use uninitialized_var() for may-be uninitialized variable

2014-04-21 Thread Li Zefan
On 2014/4/22 13:44, Jianyu Zhan wrote:
> To suppress this warning:
> 
>  warning: ‘err’ may be used uninitialized in this function 
> [-Wmaybe-uninitialized]
>   int err;
>   ^

I don't see this warning, and I don't see how this is possible.

static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
struct cgroup *parent = cgrp->parent;
struct cgroup_subsys_state *css;
int err;

lockdep_assert_held(&cgroup_mutex);

css = ss->css_alloc(cgroup_css(parent, ss));
if (IS_ERR(css))
return PTR_ERR(css);

err = percpu_ref_init(&css->refcnt, css_release);
if (err)
goto err_free_css;
...

return err;
}

> 
> Use uninitialized_var() to declare err. It also serves as good 
> documentation.
> 

anyway, uninitialized_var() should be avoided if possible.

nack

> Signed-off-by: Jianyu Zhan 
> ---
>  kernel/cgroup.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 24675f5..930569c 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -4144,7 +4144,7 @@ static int create_css(struct cgroup *cgrp, struct 
> cgroup_subsys *ss)
>  {
>   struct cgroup *parent = cgrp->parent;
>   struct cgroup_subsys_state *css;
> - int err;
> + int uninitialized_var(err);
>  
>   lockdep_assert_held(&cgroup_mutex);
>  
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cgroup: explicitly init the early_init field

2014-04-21 Thread Li Zefan
On 2014/4/22 13:27, Jianyu Zhan wrote:
> For a cgroup subsystem who should init early, then it should carefully
> take care of the implementation of css_alloc, because it will be called
> before mm_init() setup the world.
> 
> Luckily we don't, and we better explicitly assign the early_init field
> to 0, for document reason.
> 

If you think this is the right thing to do, you can apply the same reason
to the initialization of other structures in the whole kernel tree.

> Signed-off-by: Jianyu Zhan 

nack

> ---
>  kernel/cgroup.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 559f822..f23cb67 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -5325,5 +5325,6 @@ struct cgroup_subsys debug_cgrp_subsys = {
>   .css_alloc = debug_css_alloc,
>   .css_free = debug_css_free,
>   .base_cftypes = debug_files,
> + .early_init = 0,
>  };
>  #endif /* CONFIG_CGROUP_DEBUG */
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   3   4   5   6   7   8   9   10   >