[PATCH 1/2] trace-cmd: Introduce tracecmd_peek_next_data()

2016-07-07 Thread Namhyung Kim
The tracecmd_peek_next_data() is similar to tracecmd_read_next_data()
but it doesn't consume the record.

Signed-off-by: Namhyung Kim 
---
 trace-cmd.h   |  3 +++
 trace-input.c | 43 +++
 2 files changed, 46 insertions(+)

diff --git a/trace-cmd.h b/trace-cmd.h
index cef2458..5798345 100644
--- a/trace-cmd.h
+++ b/trace-cmd.h
@@ -164,6 +164,9 @@ struct pevent_record *
 tracecmd_read_next_data(struct tracecmd_input *handle, int *rec_cpu);
 
 struct pevent_record *
+tracecmd_peek_next_data(struct tracecmd_input *handle, int *rec_cpu);
+
+struct pevent_record *
 tracecmd_read_at(struct tracecmd_input *handle, unsigned long long offset,
 int *cpu);
 struct pevent_record *
diff --git a/trace-input.c b/trace-input.c
index 2fea066..e825328 100644
--- a/trace-input.c
+++ b/trace-input.c
@@ -1787,6 +1787,49 @@ tracecmd_read_next_data(struct tracecmd_input *handle, 
int *rec_cpu)
 }
 
 /**
+ * tracecmd_peek_next_data - return the next record
+ * @handle: input handle to the trace.dat file
+ * @rec_cpu: return pointer to the CPU that the record belongs to
+ *
+ * This returns the next record by time. This is different than
+ * tracecmd_peek_data in that it looks at all CPUs. It does a peek
+ * at each CPU and the record with the earliest time stamp is
+ * returned. If @rec_cpu is not NULL it gets the CPU id the record was
+ * on. It does not increment the CPU iterator.
+ */
+struct pevent_record *
+tracecmd_peek_next_data(struct tracecmd_input *handle, int *rec_cpu)
+{
+   unsigned long long ts;
+   struct pevent_record *record, *next_record = NULL;
+   int next_cpu;
+   int cpu;
+
+   if (rec_cpu)
+   *rec_cpu = -1;
+
+   next_cpu = -1;
+   ts = 0;
+
+   for (cpu = 0; cpu < handle->cpus; cpu++) {
+   record = tracecmd_peek_data(handle, cpu);
+   if (record && (!next_record || record->ts < ts)) {
+   ts = record->ts;
+   next_cpu = cpu;
+   next_record = record;
+   }
+   }
+
+   if (next_record) {
+   if (rec_cpu)
+   *rec_cpu = next_cpu;
+   return next_record;
+   }
+
+   return NULL;
+}
+
+/**
  * tracecmd_read_prev - read the record before the given record
  * @handle: input handle to the trace.dat file
  * @record: the record to use to find the previous record.
-- 
2.9.0



[PATCH 1/2] trace-cmd: Introduce tracecmd_peek_next_data()

2016-07-07 Thread Namhyung Kim
The tracecmd_peek_next_data() is similar to tracecmd_read_next_data()
but it doesn't consume the record.

Signed-off-by: Namhyung Kim 
---
 trace-cmd.h   |  3 +++
 trace-input.c | 43 +++
 2 files changed, 46 insertions(+)

diff --git a/trace-cmd.h b/trace-cmd.h
index cef2458..5798345 100644
--- a/trace-cmd.h
+++ b/trace-cmd.h
@@ -164,6 +164,9 @@ struct pevent_record *
 tracecmd_read_next_data(struct tracecmd_input *handle, int *rec_cpu);
 
 struct pevent_record *
+tracecmd_peek_next_data(struct tracecmd_input *handle, int *rec_cpu);
+
+struct pevent_record *
 tracecmd_read_at(struct tracecmd_input *handle, unsigned long long offset,
 int *cpu);
 struct pevent_record *
diff --git a/trace-input.c b/trace-input.c
index 2fea066..e825328 100644
--- a/trace-input.c
+++ b/trace-input.c
@@ -1787,6 +1787,49 @@ tracecmd_read_next_data(struct tracecmd_input *handle, 
int *rec_cpu)
 }
 
 /**
+ * tracecmd_peek_next_data - return the next record
+ * @handle: input handle to the trace.dat file
+ * @rec_cpu: return pointer to the CPU that the record belongs to
+ *
+ * This returns the next record by time. This is different than
+ * tracecmd_peek_data in that it looks at all CPUs. It does a peek
+ * at each CPU and the record with the earliest time stamp is
+ * returned. If @rec_cpu is not NULL it gets the CPU id the record was
+ * on. It does not increment the CPU iterator.
+ */
+struct pevent_record *
+tracecmd_peek_next_data(struct tracecmd_input *handle, int *rec_cpu)
+{
+   unsigned long long ts;
+   struct pevent_record *record, *next_record = NULL;
+   int next_cpu;
+   int cpu;
+
+   if (rec_cpu)
+   *rec_cpu = -1;
+
+   next_cpu = -1;
+   ts = 0;
+
+   for (cpu = 0; cpu < handle->cpus; cpu++) {
+   record = tracecmd_peek_data(handle, cpu);
+   if (record && (!next_record || record->ts < ts)) {
+   ts = record->ts;
+   next_cpu = cpu;
+   next_record = record;
+   }
+   }
+
+   if (next_record) {
+   if (rec_cpu)
+   *rec_cpu = next_cpu;
+   return next_record;
+   }
+
+   return NULL;
+}
+
+/**
  * tracecmd_read_prev - read the record before the given record
  * @handle: input handle to the trace.dat file
  * @record: the record to use to find the previous record.
-- 
2.9.0



[PATCH 2/2] trace-cmd: Use tracecmd_peek_next_data() in fgraph_ent_handler

2016-07-07 Thread Namhyung Kim
When a task was migrated to another cpu in the middle of a function, the
fgraph_exit record will be in a different cpu than the enter record.
But currently fgraph_ent_handler() only peeks at the same cpu so it
could read an incorrect record.

For example, please see following raw records:

  bash-10478 [007]73.454273: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [000]73.454650: funcgraph_exit:func=0x8123bf90 
calltime=0x111a37483c rettime=0x111a3d0285 overrun=0x0 depth=0
  bash-10478 [000]74.456383: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [000]74.456655: funcgraph_exit:func=0x8123bf90 
calltime=0x1155f24337 rettime=0x1155f66559 overrun=0x0 depth=0
  bash-10478 [000]75.458517: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [001]75.458849: funcgraph_exit:func=0x8123bf90 
calltime=0x1191ad9de0 rettime=0x1191b2a6aa overrun=0x0 depth=0
  bash-10478 [001]76.460482: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [000]76.460679: funcgraph_exit:func=0x8123bf90 
calltime=0x11cd6662b4 rettime=0x11cd695e03 overrun=0x0 depth=0
  bash-10478 [000]77.462334: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [004]77.462564: funcgraph_exit:func=0x8123bf90 
calltime=0x12091d71c4 rettime=0x120920e977 overrun=0x0 depth=0
  bash-10478 [004]78.464315: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [001]78.464644: funcgraph_exit:func=0x8123bf90 
calltime=0x1244d674de rettime=0x1244db7329 overrun=0x0 depth=0
  bash-10478 [001]79.466018: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [004]79.466326: funcgraph_exit:func=0x8123bf90 
calltime=0x12808b3940 rettime=0x12808fe819 overrun=0x0 depth=0
  bash-10478 [004]80.468005: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [002]80.468291: funcgraph_exit:func=0x8123bf90 
calltime=0x12bc44551f rettime=0x12bc48ac9a overrun=0x0 depth=0
  bash-10478 [002]81.469718: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [007]81.470088: funcgraph_exit:func=0x8123bf90 
calltime=0x12f7f945b8 rettime=0x12f7fee028 overrun=0x0 depth=0

The first entry was a call to the cma_alloc function; it was on cpu 7 but the
task was migrated to cpu 0 before returning from the function.
Currently trace-cmd shows like below:

  bash-10478 [007]73.454273: funcgraph_entry:  ! 367.216 us |  
cma_alloc();
  bash-10478 [000]73.454650: funcgraph_exit:   ! 375.369 us |  }
  bash-10478 [000]74.456383: funcgraph_entry:  ! 270.882 us |  
cma_alloc();
  bash-10478 [000]75.458517: funcgraph_entry:  ! 195.407 us |  
cma_alloc();
  bash-10478 [001]75.458849: funcgraph_exit:   ! 329.930 us |  }
  bash-10478 [001]76.460482: funcgraph_entry:  ! 327.243 us |  
cma_alloc();
  bash-10478 [000]77.462334: funcgraph_entry:  ! 293.465 us |  
cma_alloc();
  bash-10478 [004]77.462564: funcgraph_exit:   ! 227.251 us |  }
  bash-10478 [004]78.464315: funcgraph_entry:  ! 306.905 us |  
cma_alloc();
  bash-10478 [001]79.466018: funcgraph_entry:  ! 303.196 us |  
cma_alloc();
  bash-10478 [004]80.468005: funcgraph_entry:   |  
cma_alloc() {
  bash-10478 [002]80.468291: funcgraph_exit:   ! 284.539 us |  }
  bash-10478 [002]81.469718: funcgraph_entry:  ! 323.215 us |  
cma_alloc();

This is because the first funcgraph_entry on cpu 7 matched to the last
funcgraph_exit on cpu 7.  And second funcgraph_exit on cpu 0 was shown
alone.  We need to match record from all cpu rather than the same cpu.
In this case, entry on cpu 7 should be paired with exit on cpu 0.

With this patch, the output looks like below:

  bash-10478 [007]73.454273: funcgraph_entry:  ! 375.369 us |  
cma_alloc();
  bash-10478 [000]74.456383: funcgraph_entry:  ! 270.882 us |  
cma_alloc();
  bash-10478 [000]75.458517: funcgraph_entry:  ! 329.930 us |  
cma_alloc();
  bash-10478 [001]76.460482: funcgraph_entry:  ! 195.407 us |  
cma_alloc();
  bash-10478 [000]77.462334: funcgraph_entry:  ! 227.251 us |  
cma_alloc();
  bash-10478 [004]78.464315: funcgraph_entry:  ! 327.243 us |  
cma_alloc();
  bash-10478 [001]79.466018: funcgraph_entry:  ! 306.905 us |  
cma_alloc();
  bash-10478 [004]80.468005: funcgraph_entry:  ! 284.539 us |  
cma_alloc();
  bash-10478 [002]81.469718: funcgraph_entry:  ! 367.216 us |  
cma_alloc();

Maybe we can separate enter and exit if they happened on different
cpu.  Anyway the time duration has correct value now.

Reported-by: Joonsoo Kim 
Signed-off-by: Namhyung Kim 
---
 trace-ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/trace-ftrace.c b/trace-ftrace.c
index 

[PATCH 2/2] trace-cmd: Use tracecmd_peek_next_data() in fgraph_ent_handler

2016-07-07 Thread Namhyung Kim
When a task was migrated to another cpu in the middle of a function, the
fgraph_exit record will be in a different cpu than the enter record.
But currently fgraph_ent_handler() only peeks at the same cpu so it
could read an incorrect record.

For example, please see following raw records:

  bash-10478 [007]73.454273: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [000]73.454650: funcgraph_exit:func=0x8123bf90 
calltime=0x111a37483c rettime=0x111a3d0285 overrun=0x0 depth=0
  bash-10478 [000]74.456383: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [000]74.456655: funcgraph_exit:func=0x8123bf90 
calltime=0x1155f24337 rettime=0x1155f66559 overrun=0x0 depth=0
  bash-10478 [000]75.458517: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [001]75.458849: funcgraph_exit:func=0x8123bf90 
calltime=0x1191ad9de0 rettime=0x1191b2a6aa overrun=0x0 depth=0
  bash-10478 [001]76.460482: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [000]76.460679: funcgraph_exit:func=0x8123bf90 
calltime=0x11cd6662b4 rettime=0x11cd695e03 overrun=0x0 depth=0
  bash-10478 [000]77.462334: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [004]77.462564: funcgraph_exit:func=0x8123bf90 
calltime=0x12091d71c4 rettime=0x120920e977 overrun=0x0 depth=0
  bash-10478 [004]78.464315: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [001]78.464644: funcgraph_exit:func=0x8123bf90 
calltime=0x1244d674de rettime=0x1244db7329 overrun=0x0 depth=0
  bash-10478 [001]79.466018: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [004]79.466326: funcgraph_exit:func=0x8123bf90 
calltime=0x12808b3940 rettime=0x12808fe819 overrun=0x0 depth=0
  bash-10478 [004]80.468005: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [002]80.468291: funcgraph_exit:func=0x8123bf90 
calltime=0x12bc44551f rettime=0x12bc48ac9a overrun=0x0 depth=0
  bash-10478 [002]81.469718: funcgraph_entry:   func=0x8123bf90 
depth=0
  bash-10478 [007]81.470088: funcgraph_exit:func=0x8123bf90 
calltime=0x12f7f945b8 rettime=0x12f7fee028 overrun=0x0 depth=0

The first entry was a call to the cma_alloc function; it was on cpu 7 but the
task was migrated to cpu 0 before returning from the function.
Currently trace-cmd shows like below:

  bash-10478 [007]73.454273: funcgraph_entry:  ! 367.216 us |  
cma_alloc();
  bash-10478 [000]73.454650: funcgraph_exit:   ! 375.369 us |  }
  bash-10478 [000]74.456383: funcgraph_entry:  ! 270.882 us |  
cma_alloc();
  bash-10478 [000]75.458517: funcgraph_entry:  ! 195.407 us |  
cma_alloc();
  bash-10478 [001]75.458849: funcgraph_exit:   ! 329.930 us |  }
  bash-10478 [001]76.460482: funcgraph_entry:  ! 327.243 us |  
cma_alloc();
  bash-10478 [000]77.462334: funcgraph_entry:  ! 293.465 us |  
cma_alloc();
  bash-10478 [004]77.462564: funcgraph_exit:   ! 227.251 us |  }
  bash-10478 [004]78.464315: funcgraph_entry:  ! 306.905 us |  
cma_alloc();
  bash-10478 [001]79.466018: funcgraph_entry:  ! 303.196 us |  
cma_alloc();
  bash-10478 [004]80.468005: funcgraph_entry:   |  
cma_alloc() {
  bash-10478 [002]80.468291: funcgraph_exit:   ! 284.539 us |  }
  bash-10478 [002]81.469718: funcgraph_entry:  ! 323.215 us |  
cma_alloc();

This is because the first funcgraph_entry on cpu 7 matched to the last
funcgraph_exit on cpu 7.  And second funcgraph_exit on cpu 0 was shown
alone.  We need to match record from all cpu rather than the same cpu.
In this case, entry on cpu 7 should be paired with exit on cpu 0.

With this patch, the output looks like below:

  bash-10478 [007]73.454273: funcgraph_entry:  ! 375.369 us |  
cma_alloc();
  bash-10478 [000]74.456383: funcgraph_entry:  ! 270.882 us |  
cma_alloc();
  bash-10478 [000]75.458517: funcgraph_entry:  ! 329.930 us |  
cma_alloc();
  bash-10478 [001]76.460482: funcgraph_entry:  ! 195.407 us |  
cma_alloc();
  bash-10478 [000]77.462334: funcgraph_entry:  ! 227.251 us |  
cma_alloc();
  bash-10478 [004]78.464315: funcgraph_entry:  ! 327.243 us |  
cma_alloc();
  bash-10478 [001]79.466018: funcgraph_entry:  ! 306.905 us |  
cma_alloc();
  bash-10478 [004]80.468005: funcgraph_entry:  ! 284.539 us |  
cma_alloc();
  bash-10478 [002]81.469718: funcgraph_entry:  ! 367.216 us |  
cma_alloc();

Maybe we can separate enter and exit if they happened on different
cpu.  Anyway the time duration has correct value now.

Reported-by: Joonsoo Kim 
Signed-off-by: Namhyung Kim 
---
 trace-ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/trace-ftrace.c b/trace-ftrace.c
index 636b08b..edc9349 100644
--- a/trace-ftrace.c
+++ 

Re: [PATCH] drm/radeon: Remove deprecated create_singlethread_workqueue

2016-07-07 Thread Michel Dänzer
On 07.07.2016 16:43, Christian König wrote:
> Am 07.07.2016 um 05:32 schrieb Michel Dänzer:
>> On 06.07.2016 22:45, Tejun Heo wrote:
>>> On Wed, Jul 06, 2016 at 12:12:52PM +0900, Michel Dänzer wrote:
>>>
>>>> Not being very familiar with the workqueue APIs, I'll describe how it's
>>>> supposed to work from a driver POV, which will hopefully help you guys
>>>> decide on the most appropriate alloc_workqueue parameters.
>>>>
>>>> There is one flip work queue for each hardware CRTC. At most one
>>>> radeon_flip_work_func item can be queued for any of them at any time.
>>>> When a radeon_flip_work_func item is queued, it should be executed ASAP
>>>> (so WQ_HIGHPRI might be appropriate?).
>>> Hmmm... the only time WQ_HIGHPRI should be used is when it'd otherwise
>>> require a kthread w/ nice value at -20.  Would that be the case here?
>>> What are the consequences of the work item getting delayed?
>> A page flip may be delayed to a later display refresh cycle.
>>
>>
>>> Also, what kind of delays matter here?  Is it millisec range or micro?
>> It can be the latter in theory, but normally rather the former.
> 
> Well to be precise with a typical 1920x1080@60 resolution you have about
> 2.16ms time under ideal conditions for the flip.
> 
> So using the high priority queue still sounds like a good idea to me.

How did you arrive at 2.16ms?

Userspace can call the ioctl up to one full refresh cycle ahead of time,
which is ~16ms at 60 Hz. On the other hand userspace can also call the
ioctl arbitrarily close to the vertical blank period, in which case even
a delay of just 1ms (or even significantly less) may cause the flip to
be delayed by one refresh cycle.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast | Mesa and X developer



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] drm/radeon: Remove deprecated create_singlethread_workqueue

2016-07-07 Thread Michel Dänzer
On 07.07.2016 16:43, Christian König wrote:
> Am 07.07.2016 um 05:32 schrieb Michel Dänzer:
>> On 06.07.2016 22:45, Tejun Heo wrote:
>>> On Wed, Jul 06, 2016 at 12:12:52PM +0900, Michel Dänzer wrote:
>>>
>>>> Not being very familiar with the workqueue APIs, I'll describe how it's
>>>> supposed to work from a driver POV, which will hopefully help you guys
>>>> decide on the most appropriate alloc_workqueue parameters.
>>>>
>>>> There is one flip work queue for each hardware CRTC. At most one
>>>> radeon_flip_work_func item can be queued for any of them at any time.
>>>> When a radeon_flip_work_func item is queued, it should be executed ASAP
>>>> (so WQ_HIGHPRI might be appropriate?).
>>> Hmmm... the only time WQ_HIGHPRI should be used is when it'd otherwise
>>> require a kthread w/ nice value at -20.  Would that be the case here?
>>> What are the consequences of the work item getting delayed?
>> A page flip may be delayed to a later display refresh cycle.
>>
>>
>>> Also, what kind of delays matter here?  Is it millisec range or micro?
>> It can be the latter in theory, but normally rather the former.
> 
> Well to be precise with a typical 1920x1080@60 resolution you have about
> 2.16ms time under ideal conditions for the flip.
> 
> So using the high priority queue still sounds like a good idea to me.

How did you arrive at 2.16ms?

Userspace can call the ioctl up to one full refresh cycle ahead of time,
which is ~16ms at 60 Hz. On the other hand userspace can also call the
ioctl arbitrarily close to the vertical blank period, in which case even
a delay of just 1ms (or even significantly less) may cause the flip to
be delayed by one refresh cycle.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast | Mesa and X developer



signature.asc
Description: OpenPGP digital signature


Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread Andrew Vagin
On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
> On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages) wrote:
> > On 7 July 2016 at 17:01, James Bottomley
> >  wrote:
> [Serge already answered the parenting issue]
> > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
> > > > Hm.  Probably best-effort based on the process hierarchy.  So 
> > > > yeah you could probably get a tree into a state that would be 
> > > > wrongly recreated. Create a new netns, bind mount it, exit;  Have 
> > > > another task create a new user_ns, bind mount it, exit;  Third 
> > > > task setns()s first to the new netns then to the new user_ns.  I 
> > > > suspect criu will recreate that wrongly.
> > > 
> > > This is a bit pathological, and you have to be root to do it: so 
> > > root can set up a nesting hierarchy, bind it and destroy the pids 
> > > but I know of no current orchestration system which does this.
> > > 
> > > Actually, I have to back pedal a bit: the way I currently set up
> > > architecture emulation containers does precisely this: I set up the
> > > namespaces unprivileged with child mount namespaces, but then I ask
> > > root to bind the userns and kill the process that created it so I 
> > > have a permanent handle to enter the namespace by, so I suspect 
> > > that when our current orchestration systems get more sophisticated, 
> > > they might eventually want to do something like this as well.
> > > 
> > > In theory, we could get nsfs to show this information as an option
> > > (just add a show_options entry to the superblock ops), but the 
> > > problem is that although each namespace has a parent user_ns, 
> > > there's no way to get it without digging in the namespace specific 
> > > structure.  Probably we should restructure to move it into 
> > > ns_common, then we could display it (and enforce all namespaces 
> > > having owning user_ns) but it would be a
> > 
> > I'm missing something here. Is it not already the case that all
> > namespaces have an owning user_ns?
> 
> Um, yes, I don't believe I said they don't.  The problem I thought you
> were having is that there's no way of seeing what it is.
> 
> nsfs is the Namespace filesystem where bound namespaces appear to a cat
> of /proc/self/mounts.  It can display any information that's in
> ns_common (the common core of namespaces) but the owning user_ns
> pointer currently isn't in this structure.  Every user namespace has a
> pointer to it, but they're all privately embedded in the individual
> namespace specific structures.  What I was proposing was that since
> every current namespace has a pointer somewhere to the owning user
> namespace, we could abstract this out into ns_common so it's now
> accessible to be displayed by nsfs, probably as a mount option.

James, I am not sure that I understood you correctly. We have one
file system for all namespace files, so how can we show per-file properties
in mount options? I think we can show all required information in
fdinfo. We open a namespaces file (/proc/pid/ns/N) and then read
/proc/pid/fdinfo/X for it.

> 
> James
> 
> 
> ___
> CRIU mailing list
> c...@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu


Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread Andrew Vagin
On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
> On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages) wrote:
> > On 7 July 2016 at 17:01, James Bottomley
> >  wrote:
> [Serge already answered the parenting issue]
> > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
> > > > Hm.  Probably best-effort based on the process hierarchy.  So 
> > > > yeah you could probably get a tree into a state that would be 
> > > > wrongly recreated. Create a new netns, bind mount it, exit;  Have 
> > > > another task create a new user_ns, bind mount it, exit;  Third 
> > > > task setns()s first to the new netns then to the new user_ns.  I 
> > > > suspect criu will recreate that wrongly.
> > > 
> > > This is a bit pathological, and you have to be root to do it: so 
> > > root can set up a nesting hierarchy, bind it and destroy the pids 
> > > but I know of no current orchestration system which does this.
> > > 
> > > Actually, I have to back pedal a bit: the way I currently set up
> > > architecture emulation containers does precisely this: I set up the
> > > namespaces unprivileged with child mount namespaces, but then I ask
> > > root to bind the userns and kill the process that created it so I 
> > > have a permanent handle to enter the namespace by, so I suspect 
> > > that when our current orchestration systems get more sophisticated, 
> > > they might eventually want to do something like this as well.
> > > 
> > > In theory, we could get nsfs to show this information as an option
> > > (just add a show_options entry to the superblock ops), but the 
> > > problem is that although each namespace has a parent user_ns, 
> > > there's no way to get it without digging in the namespace specific 
> > > structure.  Probably we should restructure to move it into 
> > > ns_common, then we could display it (and enforce all namespaces 
> > > having owning user_ns) but it would be a
> > 
> > I'm missing something here. Is it not already the case that all
> > namespaces have an owning user_ns?
> 
> Um, yes, I don't believe I said they don't.  The problem I thought you
> were having is that there's no way of seeing what it is.
> 
> nsfs is the Namespace filesystem where bound namespaces appear to a cat
> of /proc/self/mounts.  It can display any information that's in
> ns_common (the common core of namespaces) but the owning user_ns
> pointer currently isn't in this structure.  Every user namespace has a
> pointer to it, but they're all privately embedded in the individual
> namespace specific structures.  What I was proposing was that since
> every current namespace has a pointer somewhere to the owning user
> namespace, we could abstract this out into ns_common so it's now
> accessible to be displayed by nsfs, probably as a mount option.

James, I am not sure that I understood you correctly. We have one
file system for all namespace files, so how can we show per-file properties
in mount options? I think we can show all required information in
fdinfo. We open a namespaces file (/proc/pid/ns/N) and then read
/proc/pid/fdinfo/X for it.

> 
> James
> 
> 
> ___
> CRIU mailing list
> c...@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu


Re: [PATCH v12 1/2] Documentation: DT: dma: Add Xilinx zynqmp dma device tree binding documentation

2016-07-07 Thread Vinod Koul
On Fri, Jul 01, 2016 at 05:07:05PM +0530, Kedareswara rao Appana wrote:
> Device-tree binding documentation for Xilinx zynqmp dma engine
> used in Zynq UltraScale+ MPSoC.

And I missed the cleanup part, so both applied  now

-- 
~Vinod


Re: [PATCH v12 1/2] Documentation: DT: dma: Add Xilinx zynqmp dma device tree binding documentation

2016-07-07 Thread Vinod Koul
On Fri, Jul 01, 2016 at 05:07:05PM +0530, Kedareswara rao Appana wrote:
> Device-tree binding documentation for Xilinx zynqmp dma engine
> used in Zynq UltraScale+ MPSoC.

And I missed the cleanup part, so both applied  now

-- 
~Vinod


Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread Andrei Vagin
On Thu, Jul 7, 2016 at 10:41 PM, Andrei Vagin  wrote:
> On Thu, Jul 7, 2016 at 8:26 PM, James Bottomley
>  wrote:
>> On Thu, 2016-07-07 at 20:00 -0700, Andrew Vagin wrote:
>>> On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
>>> > On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
>>> > > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages)
>>> > > wrote:
>>> > > > On 7 July 2016 at 17:01, James Bottomley
>>> > > >  wrote:
>>> > > [Serge already answered the parenting issue]
>>> > > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
>>> > > > > > Hm.  Probably best-effort based on the process hierarchy.
>>> > > > > >  So
>>> > > > > > yeah you could probably get a tree into a state that would
>>> > > > > > be
>>> > > > > > wrongly recreated. Create a new netns, bind mount it, exit;
>>> > > > > >   Have
>>> > > > > > another task create a new user_ns, bind mount it, exit;
>>> > > > > >  Third
>>> > > > > > task setns()s first to the new netns then to the new
>>> > > > > > user_ns.  I
>>> > > > > > suspect criu will recreate that wrongly.
>>> > > > >
>>> > > > > This is a bit pathological, and you have to be root to do it:
>>> > > > > so
>>> > > > > root can set up a nesting hierarchy, bind it and destroy the
>>> > > > > pids
>>> > > > > but I know of no current orchestration system which does
>>> > > > > this.
>>> > > > >
>>> > > > > Actually, I have to back pedal a bit: the way I currently set
>>> > > > > up
>>> > > > > architecture emulation containers does precisely this: I set
>>> > > > > up the
>>> > > > > namespaces unprivileged with child mount namespaces, but then
>>> > > > > I ask
>>> > > > > root to bind the userns and kill the process that created it
>>> > > > > so I
>>> > > > > have a permanent handle to enter the namespace by, so I
>>> > > > > suspect
>>> > > > > that when our current orchestration systems get more
>>> > > > > sophisticated,
>>> > > > > they might eventually want to do something like this as well.
>>> > > > >
>>> > > > > In theory, we could get nsfs to show this information as an
>>> > > > > option
>>> > > > > (just add a show_options entry to the superblock ops), but
>>> > > > > the
>>> > > > > problem is that although each namespace has a parent user_ns,
>>> > > > > there's no way to get it without digging in the namespace
>>> > > > > specific
>>> > > > > structure.  Probably we should restructure to move it into
>>> > > > > ns_common, then we could display it (and enforce all
>>> > > > > namespaces
>>> > > > > having owning user_ns) but it would be a
>>> > > >
>>> > > > I'm missing something here. Is it not already the case that all
>>> > > > namespaces have an owning user_ns?
>>> > >
>>> > > Um, yes, I don't believe I said they don't.  The problem I
>>> > > thought you
>>> > > were having is that there's no way of seeing what it is.
>>> > >
>>> > > nsfs is the Namespace filesystem where bound namespaces appear to
>>> > > a cat
>>> > > of /proc/self/mounts.  It can display any information that's in
>>> > > ns_common (the common core of namespaces) but the owning user_ns
>>> > > pointer currently isn't in this structure.  Every user namespace
>>> > > has a
>>> > > pointer to it, but they're all privately embedded in the
>>> > > individual
>>> > > namespace specific structures.  What I was proposing was that
>>> > > since
>>> > > every current namespace has a pointer somewhere to the owning
>>> > > user
>>> > > namespace, we could abstract this out into ns_common so it's now
>>> > > accessible to be displayed by nsfs, probably as a mount option.
>>> >
>>> > James, I am not sure that I understood you correctly. We have one
>>> > file system for all namespace files, how we can show per-file
>>> > properties
>>> > in mount options. I think we can show all required information in
>>> > fdinfo. We open a namespaces file (/proc/pid/ns/N) and then read
>>> > /proc/pid/fdinfo/X for it.
>>>
>>> Here is a proof-of-concept patch.
>>>
>>> How it works:
>>>
>>> In [1]: import os
>>>
>>> In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)
>>>
>>> In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
>>> pos:  0
>>> flags:010
>>> mnt_id:   2
>>> userns: 4026531837
>>>
>>> In [4]: print "/proc/self/ns/user -> %s" %
>>> os.readlink("/proc/self/ns/user")
>>> /proc/self/ns/user -> user:[4026531837]
>>
>> can't you just do
>>
>> readlink /proc/self/ns/user | sed 's/.*\[\(.*\)\]/\1/'
>
> We can get fdinfo for any ns file. I used /proc/self/ns/pid as an example.
>
> Look at another example:
>
> [root@fc22-vm ~]# cat /proc/self/mountinfo | grep pid_ns_file
> 115 38 0:3 pid:[4026532306] /tmp/pid_ns_file rw shared:67 - nsfs nsfs rw
>

Sorry, I forgot to say that fd is a file descriptor for /tmp/pid_ns_file

In [2]  : fd = os.open("/tmp/pid_ns_file", os.O_RDONLY)
In [3]  : fd
Out[4]: 5

> In [4]: print open("/proc/self/fdinfo/5").read()
> 

Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread Andrei Vagin
On Thu, Jul 7, 2016 at 10:41 PM, Andrei Vagin  wrote:
> On Thu, Jul 7, 2016 at 8:26 PM, James Bottomley
>  wrote:
>> On Thu, 2016-07-07 at 20:00 -0700, Andrew Vagin wrote:
>>> On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
>>> > On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
>>> > > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages)
>>> > > wrote:
>>> > > > On 7 July 2016 at 17:01, James Bottomley
>>> > > >  wrote:
>>> > > [Serge already answered the parenting issue]
>>> > > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
>>> > > > > > Hm.  Probably best-effort based on the process hierarchy.
>>> > > > > >  So
>>> > > > > > yeah you could probably get a tree into a state that would
>>> > > > > > be
>>> > > > > > wrongly recreated. Create a new netns, bind mount it, exit;
>>> > > > > >   Have
>>> > > > > > another task create a new user_ns, bind mount it, exit;
>>> > > > > >  Third
>>> > > > > > task setns()s first to the new netns then to the new
>>> > > > > > user_ns.  I
>>> > > > > > suspect criu will recreate that wrongly.
>>> > > > >
>>> > > > > This is a bit pathological, and you have to be root to do it:
>>> > > > > so
>>> > > > > root can set up a nesting hierarchy, bind it and destroy the
>>> > > > > pids
>>> > > > > but I know of no current orchestration system which does
>>> > > > > this.
>>> > > > >
>>> > > > > Actually, I have to back pedal a bit: the way I currently set
>>> > > > > up
>>> > > > > architecture emulation containers does precisely this: I set
>>> > > > > up the
>>> > > > > namespaces unprivileged with child mount namespaces, but then
>>> > > > > I ask
>>> > > > > root to bind the userns and kill the process that created it
>>> > > > > so I
>>> > > > > have a permanent handle to enter the namespace by, so I
>>> > > > > suspect
>>> > > > > that when our current orchestration systems get more
>>> > > > > sophisticated,
>>> > > > > they might eventually want to do something like this as well.
>>> > > > >
>>> > > > > In theory, we could get nsfs to show this information as an
>>> > > > > option
>>> > > > > (just add a show_options entry to the superblock ops), but
>>> > > > > the
>>> > > > > problem is that although each namespace has a parent user_ns,
>>> > > > > there's no way to get it without digging in the namespace
>>> > > > > specific
>>> > > > > structure.  Probably we should restructure to move it into
>>> > > > > ns_common, then we could display it (and enforce all
>>> > > > > namespaces
>>> > > > > having owning user_ns) but it would be a
>>> > > >
>>> > > > I'm missing something here. Is it not already the case that all
>>> > > > namespaces have an owning user_ns?
>>> > >
>>> > > Um, yes, I don't believe I said they don't.  The problem I
>>> > > thought you
>>> > > were having is that there's no way of seeing what it is.
>>> > >
>>> > > nsfs is the Namespace filesystem where bound namespaces appear to
>>> > > a cat
>>> > > of /proc/self/mounts.  It can display any information that's in
>>> > > ns_common (the common core of namespaces) but the owning user_ns
>>> > > pointer currently isn't in this structure.  Every user namespace
>>> > > has a
>>> > > pointer to it, but they're all privately embedded in the
>>> > > individual
>>> > > namespace specific structures.  What I was proposing was that
>>> > > since
>>> > > every current namespace has a pointer somewhere to the owning
>>> > > user
>>> > > namespace, we could abstract this out into ns_common so it's now
>>> > > accessible to be displayed by nsfs, probably as a mount option.
>>> >
>>> > James, I am not sure that I understood you correctly. We have one
>>> > file system for all namespace files, how we can show per-file
>>> > properties
>>> > in mount options. I think we can show all required information in
>>> > fdinfo. We open a namespaces file (/proc/pid/ns/N) and then read
>>> > /proc/pid/fdinfo/X for it.
>>>
>>> Here is a proof-of-concept patch.
>>>
>>> How it works:
>>>
>>> In [1]: import os
>>>
>>> In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)
>>>
>>> In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
>>> pos:  0
>>> flags:010
>>> mnt_id:   2
>>> userns: 4026531837
>>>
>>> In [4]: print "/proc/self/ns/user -> %s" %
>>> os.readlink("/proc/self/ns/user")
>>> /proc/self/ns/user -> user:[4026531837]
>>
>> can't you just do
>>
>> readlink /proc/self/ns/user | sed 's/.*\[\(.*\)\]/\1/'
>
> We can get fdinfo for any ns file. I used /proc/self/ns/pid as an example.
>
> Look at another example:
>
> [root@fc22-vm ~]# cat /proc/self/mountinfo | grep pid_ns_file
> 115 38 0:3 pid:[4026532306] /tmp/pid_ns_file rw shared:67 - nsfs nsfs rw
>

Sorry, I forgot to say that fd is a file descriptor for /tmp/pid_ns_file

In [2]  : fd = os.open("/tmp/pid_ns_file", os.O_RDONLY)
In [3]  : fd
Out[4]: 5

> In [4]: print open("/proc/self/fdinfo/5").read()
> pos: 0
> flags: 010
> mnt_id: 115
> userns: 4026532305
>
>
> In [5]: 

Re: [PATCH v12 2/2] dmaengine: Add Xilinx zynqmp dma engine driver support

2016-07-07 Thread Vinod Koul
On Fri, Jul 01, 2016 at 05:07:06PM +0530, Kedareswara rao Appana wrote:
> +static int zynqmp_dma_chan_probe(struct zynqmp_dma_device *zdev,
> +struct platform_device *pdev)
> +{
> + struct zynqmp_dma_chan *chan;
> + struct resource *res;
> + struct device_node *node = pdev->dev.of_node;
> + int err;
> +
> + chan = devm_kzalloc(zdev->dev, sizeof(*chan), GFP_KERNEL);
> + if (!chan)
> + return -ENOMEM;
> + chan->dev = zdev->dev;
> + chan->zdev = zdev;
> +
> + res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> + chan->regs = devm_ioremap_resource(>dev, res);
> + if (IS_ERR(chan->regs))
> + return PTR_ERR(chan->regs);
> +
> + chan->bus_width = ZYNQMP_DMA_BUS_WIDTH_64;
> + chan->dst_burst_len = ZYNQMP_DMA_AWLEN_RST_VAL;
> + chan->src_burst_len = ZYNQMP_DMA_ARLEN_RST_VAL;
> + err = of_property_read_u32(node, "xlnx,bus-width", >bus_width);
> + if ((err < 0) && ((chan->bus_width != ZYNQMP_DMA_BUS_WIDTH_64) ||
> +   (chan->bus_width != ZYNQMP_DMA_BUS_WIDTH_128))) {
> + dev_err(zdev->dev, "invalid bus-width value");
> + return err;
> + }
> +
> + chan->is_dmacoherent =  of_property_read_bool(node, "dma-coherent");
> + zdev->chan = chan;
> + tasklet_init(>tasklet, zynqmp_dma_do_tasklet, (ulong)chan);

where is this killed?

> + spin_lock_init(>lock);
> + INIT_LIST_HEAD(>active_list);
> + INIT_LIST_HEAD(>pending_list);
> + INIT_LIST_HEAD(>done_list);
> + INIT_LIST_HEAD(>free_list);
> +
> + dma_cookie_init(>common);
> + chan->common.device = >common;
> + list_add_tail(>common.device_node, >common.channels);
> +
> + zynqmp_dma_init(chan);
> + chan->irq = platform_get_irq(pdev, 0);
> + if (chan->irq < 0)
> + return -ENXIO;
> + err = devm_request_irq(>dev, chan->irq, zynqmp_dma_irq_handler, 0,
> +"zynqmp-dma", chan);

this needs to be freed/disabled in remove; I don't see that

-- 
~Vinod


Re: [PATCH v12 2/2] dmaengine: Add Xilinx zynqmp dma engine driver support

2016-07-07 Thread Vinod Koul
On Fri, Jul 01, 2016 at 05:07:06PM +0530, Kedareswara rao Appana wrote:
> +static int zynqmp_dma_chan_probe(struct zynqmp_dma_device *zdev,
> +struct platform_device *pdev)
> +{
> + struct zynqmp_dma_chan *chan;
> + struct resource *res;
> + struct device_node *node = pdev->dev.of_node;
> + int err;
> +
> + chan = devm_kzalloc(zdev->dev, sizeof(*chan), GFP_KERNEL);
> + if (!chan)
> + return -ENOMEM;
> + chan->dev = zdev->dev;
> + chan->zdev = zdev;
> +
> + res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> + chan->regs = devm_ioremap_resource(>dev, res);
> + if (IS_ERR(chan->regs))
> + return PTR_ERR(chan->regs);
> +
> + chan->bus_width = ZYNQMP_DMA_BUS_WIDTH_64;
> + chan->dst_burst_len = ZYNQMP_DMA_AWLEN_RST_VAL;
> + chan->src_burst_len = ZYNQMP_DMA_ARLEN_RST_VAL;
> + err = of_property_read_u32(node, "xlnx,bus-width", >bus_width);
> + if ((err < 0) && ((chan->bus_width != ZYNQMP_DMA_BUS_WIDTH_64) ||
> +   (chan->bus_width != ZYNQMP_DMA_BUS_WIDTH_128))) {
> + dev_err(zdev->dev, "invalid bus-width value");
> + return err;
> + }
> +
> + chan->is_dmacoherent =  of_property_read_bool(node, "dma-coherent");
> + zdev->chan = chan;
> + tasklet_init(>tasklet, zynqmp_dma_do_tasklet, (ulong)chan);

where is this killed?

> + spin_lock_init(>lock);
> + INIT_LIST_HEAD(>active_list);
> + INIT_LIST_HEAD(>pending_list);
> + INIT_LIST_HEAD(>done_list);
> + INIT_LIST_HEAD(>free_list);
> +
> + dma_cookie_init(>common);
> + chan->common.device = >common;
> + list_add_tail(>common.device_node, >common.channels);
> +
> + zynqmp_dma_init(chan);
> + chan->irq = platform_get_irq(pdev, 0);
> + if (chan->irq < 0)
> + return -ENXIO;
> + err = devm_request_irq(>dev, chan->irq, zynqmp_dma_irq_handler, 0,
> +"zynqmp-dma", chan);

this needs to be freed/disabled in remove; I don't see that

-- 
~Vinod


Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread Andrei Vagin
On Thu, Jul 7, 2016 at 8:26 PM, James Bottomley
 wrote:
> On Thu, 2016-07-07 at 20:00 -0700, Andrew Vagin wrote:
>> On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
>> > On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
>> > > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages)
>> > > wrote:
>> > > > On 7 July 2016 at 17:01, James Bottomley
>> > > >  wrote:
>> > > [Serge already answered the parenting issue]
>> > > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
>> > > > > > Hm.  Probably best-effort based on the process hierarchy.
>> > > > > >  So
>> > > > > > yeah you could probably get a tree into a state that would
>> > > > > > be
>> > > > > > wrongly recreated. Create a new netns, bind mount it, exit;
>> > > > > >   Have
>> > > > > > another task create a new user_ns, bind mount it, exit;
>> > > > > >  Third
>> > > > > > task setns()s first to the new netns then to the new
>> > > > > > user_ns.  I
>> > > > > > suspect criu will recreate that wrongly.
>> > > > >
>> > > > > This is a bit pathological, and you have to be root to do it:
>> > > > > so
>> > > > > root can set up a nesting hierarchy, bind it and destroy the
>> > > > > pids
>> > > > > but I know of no current orchestration system which does
>> > > > > this.
>> > > > >
>> > > > > Actually, I have to back pedal a bit: the way I currently set
>> > > > > up
>> > > > > architecture emulation containers does precisely this: I set
>> > > > > up the
>> > > > > namespaces unprivileged with child mount namespaces, but then
>> > > > > I ask
>> > > > > root to bind the userns and kill the process that created it
>> > > > > so I
>> > > > > have a permanent handle to enter the namespace by, so I
>> > > > > suspect
>> > > > > that when our current orchestration systems get more
>> > > > > sophisticated,
>> > > > > they might eventually want to do something like this as well.
>> > > > >
>> > > > > In theory, we could get nsfs to show this information as an
>> > > > > option
>> > > > > (just add a show_options entry to the superblock ops), but
>> > > > > the
>> > > > > problem is that although each namespace has a parent user_ns,
>> > > > > there's no way to get it without digging in the namespace
>> > > > > specific
>> > > > > structure.  Probably we should restructure to move it into
>> > > > > ns_common, then we could display it (and enforce all
>> > > > > namespaces
>> > > > > having owning user_ns) but it would be a
>> > > >
>> > > > I'm missing something here. Is it not already the case that all
>> > > > namespaces have an owning user_ns?
>> > >
>> > > Um, yes, I don't believe I said they don't.  The problem I
>> > > thought you
>> > > were having is that there's no way of seeing what it is.
>> > >
>> > > nsfs is the Namespace filesystem where bound namespaces appear to
>> > > a cat
>> > > of /proc/self/mounts.  It can display any information that's in
>> > > ns_common (the common core of namespaces) but the owning user_ns
>> > > pointer currently isn't in this structure.  Every user namespace
>> > > has a
>> > > pointer to it, but they're all privately embedded in the
>> > > individual
>> > > namespace specific structures.  What I was proposing was that
>> > > since
>> > > every current namespace has a pointer somewhere to the owning
>> > > user
>> > > namespace, we could abstract this out into ns_common so it's now
>> > > accessible to be displayed by nsfs, probably as a mount option.
>> >
>> > James, I am not sure that I understood you correctly. We have one
>> > file system for all namespace files, how we can show per-file
>> > properties
>> > in mount options. I think we can show all required information in
>> > fdinfo. We open a namespaces file (/proc/pid/ns/N) and then read
>> > /proc/pid/fdinfo/X for it.
>>
>> Here is a proof-of-concept patch.
>>
>> How it works:
>>
>> In [1]: import os
>>
>> In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)
>>
>> In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
>> pos:  0
>> flags:010
>> mnt_id:   2
>> userns: 4026531837
>>
>> In [4]: print "/proc/self/ns/user -> %s" %
>> os.readlink("/proc/self/ns/user")
>> /proc/self/ns/user -> user:[4026531837]
>
> can't you just do
>
> readlink /proc/self/ns/user | sed 's/.*\[\(.*\)\]/\1/'

We can get fdinfo for any ns file. I used /proc/self/ns/pid as an example.

Look at another example:

[root@fc22-vm ~]# cat /proc/self/mountinfo | grep pid_ns_file
115 38 0:3 pid:[4026532306] /tmp/pid_ns_file rw shared:67 - nsfs nsfs rw

In [4]: print open("/proc/self/fdinfo/5").read()
pos: 0
flags: 010
mnt_id: 115
userns: 4026532305


In [5]: os.readlink("/proc/self/ns/user")
Out[5]: 'user:[4026531837]'


>
> ?
>
> But what Michael was asking about was the parent user_ns of all the
> other namespaces ... I don't think there's any way we can get that out
> of any information in /proc/self/
>
> James
>
>
> 

Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread Andrei Vagin
On Thu, Jul 7, 2016 at 8:26 PM, James Bottomley
 wrote:
> On Thu, 2016-07-07 at 20:00 -0700, Andrew Vagin wrote:
>> On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
>> > On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
>> > > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages)
>> > > wrote:
>> > > > On 7 July 2016 at 17:01, James Bottomley
>> > > >  wrote:
>> > > [Serge already answered the parenting issue]
>> > > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
>> > > > > > Hm.  Probably best-effort based on the process hierarchy.
>> > > > > >  So
>> > > > > > yeah you could probably get a tree into a state that would
>> > > > > > be
>> > > > > > wrongly recreated. Create a new netns, bind mount it, exit;
>> > > > > >   Have
>> > > > > > another task create a new user_ns, bind mount it, exit;
>> > > > > >  Third
>> > > > > > task setns()s first to the new netns then to the new
>> > > > > > user_ns.  I
>> > > > > > suspect criu will recreate that wrongly.
>> > > > >
>> > > > > This is a bit pathological, and you have to be root to do it:
>> > > > > so
>> > > > > root can set up a nesting hierarchy, bind it and destroy the
>> > > > > pids
>> > > > > but I know of no current orchestration system which does
>> > > > > this.
>> > > > >
>> > > > > Actually, I have to back pedal a bit: the way I currently set
>> > > > > up
>> > > > > architecture emulation containers does precisely this: I set
>> > > > > up the
>> > > > > namespaces unprivileged with child mount namespaces, but then
>> > > > > I ask
>> > > > > root to bind the userns and kill the process that created it
>> > > > > so I
>> > > > > have a permanent handle to enter the namespace by, so I
>> > > > > suspect
>> > > > > that when our current orchestration systems get more
>> > > > > sophisticated,
>> > > > > they might eventually want to do something like this as well.
>> > > > >
>> > > > > In theory, we could get nsfs to show this information as an
>> > > > > option
>> > > > > (just add a show_options entry to the superblock ops), but
>> > > > > the
>> > > > > problem is that although each namespace has a parent user_ns,
>> > > > > there's no way to get it without digging in the namespace
>> > > > > specific
>> > > > > structure.  Probably we should restructure to move it into
>> > > > > ns_common, then we could display it (and enforce all
>> > > > > namespaces
>> > > > > having owning user_ns) but it would be a
>> > > >
>> > > > I'm missing something here. Is it not already the case that all
>> > > > namespaces have an owning user_ns?
>> > >
>> > > Um, yes, I don't believe I said they don't.  The problem I
>> > > thought you
>> > > were having is that there's no way of seeing what it is.
>> > >
>> > > nsfs is the Namespace filesystem where bound namespaces appear to
>> > > a cat
>> > > of /proc/self/mounts.  It can display any information that's in
>> > > ns_common (the common core of namespaces) but the owning user_ns
>> > > pointer currently isn't in this structure.  Every user namespace
>> > > has a
>> > > pointer to it, but they're all privately embedded in the
>> > > individual
>> > > namespace specific structures.  What I was proposing was that
>> > > since
>> > > every current namespace has a pointer somewhere to the owning
>> > > user
>> > > namespace, we could abstract this out into ns_common so it's now
>> > > accessible to be displayed by nsfs, probably as a mount option.
>> >
>> > James, I am not sure that I understood you correctly. We have one
>> > file system for all namespace files, how we can show per-file
>> > properties
>> > in mount options. I think we can show all required information in
>> > fdinfo. We open a namespaces file (/proc/pid/ns/N) and then read
>> > /proc/pid/fdinfo/X for it.
>>
>> Here is a proof-of-concept patch.
>>
>> How it works:
>>
>> In [1]: import os
>>
>> In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)
>>
>> In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
>> pos:  0
>> flags:010
>> mnt_id:   2
>> userns: 4026531837
>>
>> In [4]: print "/proc/self/ns/user -> %s" %
>> os.readlink("/proc/self/ns/user")
>> /proc/self/ns/user -> user:[4026531837]
>
> can't you just do
>
> readlink /proc/self/ns/user | sed 's/.*\[\(.*\)\]/\1/'

We can get fdinfo for any ns file. I used /proc/self/ns/pid as an example.

Look at another example:

[root@fc22-vm ~]# cat /proc/self/mountinfo | grep pid_ns_file
115 38 0:3 pid:[4026532306] /tmp/pid_ns_file rw shared:67 - nsfs nsfs rw

In [4]: print open("/proc/self/fdinfo/5").read()
pos: 0
flags: 010
mnt_id: 115
userns: 4026532305


In [5]: os.readlink("/proc/self/ns/user")
Out[5]: 'user:[4026531837]'


>
> ?
>
> But what Michael was asking about was the parent user_ns of all the
> other namespaces ... I don't think there's any way we can get that out
> of any information in /proc/self/
>
> James
>
>
> ___
> Containers mailing list
> 

Re: [v4] powerpc: Export thread_struct.used_vr/used_vsr to user space

2016-07-07 Thread Simon Guo
On Thu, Jul 07, 2016 at 11:21:18PM +1000, Benjamin Herrenschmidt wrote:
> I think the right fix is that if a restore_sigcontext() has the MSR bits set,
> it should set the corresponding used_* flag.
> 
> Or is there a reason why that won't work ?

That sounds reasonable to me.
I will prepare a patch based on that.

Michael, Ben, Laurent,
Thanks for the discussion and proposal.

- Simon


Re: [v4] powerpc: Export thread_struct.used_vr/used_vsr to user space

2016-07-07 Thread Simon Guo
On Thu, Jul 07, 2016 at 11:21:18PM +1000, Benjamin Herrenschmidt wrote:
> I think the right fix is that if a restore_sigcontext() has the MSR bits set,
> it should set the corresponding used_* flag.
> 
> Or is there a reason why that won't work ?

That sounds reasonable to me.
I will prepare a patch based on that.

Michael, Ben, Laurent,
Thanks for the discussion and proposal.

- Simon


Re: Introspecting userns relationships to other namespaces?

2016-07-07 Thread W. Trevor King
On Thu, Jul 07, 2016 at 08:26:47PM -0700, James Bottomley wrote:
> On Thu, 2016-07-07 at 20:00 -0700, Andrew Vagin wrote:
> > On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
> > > I think we can show all required information in fdinfo. We open
> > > a namespaces file (/proc/pid/ns/N) and then read
> > > /proc/pid/fdinfo/X for it.
> > 
> > Here is a proof-of-concept patch.
> > …
> > In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)
> > 
> > In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
> > pos:0
> > flags:  010
> > mnt_id: 2
> > userns: 4026531837
> > 
> > In [4]: print "/proc/self/ns/user -> %s" %
> > os.readlink("/proc/self/ns/user")
> > /proc/self/ns/user -> user:[4026531837]
> 
> can't you just do
> 
> readlink /proc/self/ns/user | sed 's/.*\[\(.*\)\]/\1/'

With Andrew's fdinfo approach you know the user namespace owning
/proc/self/ns/pid is 4026531837.  That happens to be
/proc/self/ns/user in this case, but doesn't have to be in general.

> But what Michael was asking about was the parent user_ns of all the
> other namespaces ... I don't think there's any way we can get that
> out of any information in /proc/self/

If fdinfo only shows immediate parents, you'd need to walk the tree to
get back to the root.  And at each layer of the PID namespace tree
there will be another user-namespace parent branching off.  With a
tree like:

  Namespace         | Parent       | Owning userns
  ------------------+--------------+------------------
  Root userns       | -            | -
  Root PID ns       | -            | Root userns
  Child userns      | Root userns  | Root userns
  Child PID ns      | Root PID ns  | Root userns
  Grandchild userns | Child userns | Child userns
  Grandchild PID ns | Child PID ns | Grandchild userns

Walking from the grandchild PID namespace would give you:

  Grandchild PID ns
  |-- Child PID ns
  |   |-- Root PID ns
  |   `-- Root userns
  `-- Grandchild userns
      `-- Child userns
          `-- Root userns

If you only put one level in fdinfo, you're stuck if one of the
namespaces involved has neither bind mounts nor a PID to give you a
handle on it [1].  And if you want to put that whole ancestor tree in
fdinfo, you have to come up with some way to handle the two-parent
branching.

I'm also not sure how exposing nsfs information [2] would handle
namespaces that had neither a surviving bind mount nor a direct
process.

If all the information is available (possibly after a mechanical patch
[3] makes it more accessible), then it seems easier to put it in a
separate /proc or /sys file.  There was a stab at this for PID
namespaces in [4] (the same series that landed NStgid, etc.) with
additional background and alternative approaches in [5].  There were
problems with that patch (and it was trying to do more by also listing
a process's ID in each PID namespace), but the “let's put the whole
tree in a new file” approach seems sound to me.

Cheers,
Trevor

[1]: http://thread.gmane.org/gmane.linux.kernel.containers/30456/focus=20536
 Subject: Re: Introspecting userns relationships to other namespaces?
 Date: Thu, 7 Jul 2016 13:24:42 -0500
 Message-ID: <20160707182442.ga6...@mail.hallyn.com>
[2]: http://thread.gmane.org/gmane.linux.kernel.containers/30456/focus=30499
 Subject: Re: [CRIU] Introspecting userns relationships to other namespaces?
 Date: Thu, 07 Jul 2016 20:20:05 -0700
 Message-ID: <1467948005.2322.84.ca...@hansenpartnership.com>
[3]: http://thread.gmane.org/gmane.linux.kernel.containers/30456/focus=20537
 Subject: Re: Introspecting userns relationships to other namespaces?
 Message-ID: <1467903712.2347.16.ca...@hansenpartnership.com>
 Date: Thu, 07 Jul 2016 08:01:52 -0700
[4]: http://thread.gmane.org/gmane.linux.kernel.containers/28925/focus=28928
 Subject: [resend][PATCH v9 1/3] procfs: show hierarchy of pid namespace
 Date: Tue, 23 Dec 2014 18:20:37 +0800
 Message-ID: <1419330039-29207-2-git-send-email-chenhanx...@cn.fujitsu.com>
[5]: http://thread.gmane.org/gmane.linux.kernel.containers/28105
 Subject: [RFC]Pid conversion between pid namespace
 Date: Thu, 3 Jul 2014 12:18:33 +
 Message-ID: 
<5871495633F38949900D2BF2DC04883E55C374@G08CNEXMBPEKD02.g08.fujitsu.local>

-- 
This email may be signed or encrypted with GnuPG (http://www.gnupg.org).
For more information, see http://en.wikipedia.org/wiki/Pretty_Good_Privacy


signature.asc
Description: OpenPGP digital signature


Re: Introspecting userns relationships to other namespaces?

2016-07-07 Thread W. Trevor King
On Thu, Jul 07, 2016 at 08:26:47PM -0700, James Bottomley wrote:
> On Thu, 2016-07-07 at 20:00 -0700, Andrew Vagin wrote:
> > On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
> > > I think we can show all required information in fdinfo. We open
> > > a namespaces file (/proc/pid/ns/N) and then read
> > > /proc/pid/fdinfo/X for it.
> > 
> > Here is a proof-of-concept patch.
> > …
> > In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)
> > 
> > In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
> > pos:0
> > flags:  010
> > mnt_id: 2
> > userns: 4026531837
> > 
> > In [4]: print "/proc/self/ns/user -> %s" %
> > os.readlink("/proc/self/ns/user")
> > /proc/self/ns/user -> user:[4026531837]
> 
> can't you just do
> 
> readlink /proc/self/ns/user | sed 's/.*\[\(.*\)\]/\1/'

With Andrew's fdinfo approach you know the user namespace owning
/proc/self/ns/pid is 4026531837.  That happens to be
/proc/self/ns/user in this case, but doesn't have to be in general.

> But what Michael was asking about was the parent user_ns of all the
> other namespaces ... I don't think there's any way we can get that
> out of any information in /proc/self/

If fdinfo only shows immediate parents, you'd need to walk the tree to
get back to the root.  And at each layer of the PID namespace tree
there will be another user-namespace parent branching off.  With a
tree like:

  Namespace         | Parent       | Owning userns
  ------------------+--------------+------------------
  Root userns       | -            | -
  Root PID ns       | -            | Root userns
  Child userns      | Root userns  | Root userns
  Child PID ns      | Root PID ns  | Root userns
  Grandchild userns | Child userns | Child userns
  Grandchild PID ns | Child PID ns | Grandchild userns

Walking from the grandchild PID namespace would give you:

  Grandchild PID ns
  |-- Child PID ns
  |   |-- Root PID ns
  |   `-- Root userns
  `-- Grandchild userns
      `-- Child userns
          `-- Root userns

If you only put one level in fdinfo, you're stuck if one of the
namespaces involved has neither bind mounts nor a PID to give you a
handle on it [1].  And if you want to put that whole ancestor tree in
fdinfo, you have to come up with some way to handle the two-parent
branching.

I'm also not sure how exposing nsfs information [2] would handle
namespaces that had neither a surviving bind mount nor a direct
process.

If all the information is available (possibly after a mechanical patch
[3] makes it more accessible), then it seems easier to put it in a
separate /proc or /sys file.  There was a stab at this for PID
namespaces in [4] (the same series that landed NStgid, etc.) with
additional background and alternative approaches in [5].  There were
problems with that patch (and it was trying to do more by also listing
a process's ID in each PID namespace), but the “let's put the whole
tree in a new file” approach seems sound to me.

Cheers,
Trevor

[1]: http://thread.gmane.org/gmane.linux.kernel.containers/30456/focus=20536
 Subject: Re: Introspecting userns relationships to other namespaces?
 Date: Thu, 7 Jul 2016 13:24:42 -0500
 Message-ID: <20160707182442.ga6...@mail.hallyn.com>
[2]: http://thread.gmane.org/gmane.linux.kernel.containers/30456/focus=30499
 Subject: Re: [CRIU] Introspecting userns relationships to other namespaces?
 Date: Thu, 07 Jul 2016 20:20:05 -0700
 Message-ID: <1467948005.2322.84.ca...@hansenpartnership.com>
[3]: http://thread.gmane.org/gmane.linux.kernel.containers/30456/focus=20537
 Subject: Re: Introspecting userns relationships to other namespaces?
 Message-ID: <1467903712.2347.16.ca...@hansenpartnership.com>
 Date: Thu, 07 Jul 2016 08:01:52 -0700
[4]: http://thread.gmane.org/gmane.linux.kernel.containers/28925/focus=28928
 Subject: [resend][PATCH v9 1/3] procfs: show hierarchy of pid namespace
 Date: Tue, 23 Dec 2014 18:20:37 +0800
 Message-ID: <1419330039-29207-2-git-send-email-chenhanx...@cn.fujitsu.com>
[5]: http://thread.gmane.org/gmane.linux.kernel.containers/28105
 Subject: [RFC]Pid conversion between pid namespace
 Date: Thu, 3 Jul 2014 12:18:33 +
 Message-ID: 
<5871495633F38949900D2BF2DC04883E55C374@G08CNEXMBPEKD02.g08.fujitsu.local>

-- 
This email may be signed or encrypted with GnuPG (http://www.gnupg.org).
For more information, see http://en.wikipedia.org/wiki/Pretty_Good_Privacy


signature.asc
Description: OpenPGP digital signature


Re: [PATCH 0/4] regulator: tps65917/palmas: Cleanups and bugfixes

2016-07-07 Thread Keerthy



On Friday 20 May 2016 11:46 AM, Laxman Dewangan wrote:


On Friday 20 May 2016 10:01 AM, Keerthy wrote:

+ Lee Jones

On Saturday 07 May 2016 12:31 AM, Nishanth Menon wrote:

On 05/06/2016 12:14 PM, Mark Brown wrote:

On Fri, May 06, 2016 at 12:44:23PM +0530, Laxman Dewangan wrote:


When you are here, can you implement the dt parsing with the new
method from
regulator framework.
Regulator FW calls callback to parse customized DT property, just
need to
pass the node and pointer when registering.



This will help a lot with cleanups and readability.


Yes, please.


yeah, the driver has started showing its age, it will be good to do a
refactor.


Laxman,

I got the dt parsing with new method from regulator framework part, But
by new method do you also want to remove the dt compatible of
regulators and let only the mfd compatible stay?

replace of_platform_populate with mfd_add_devices so that linux
handles the drivers split up and not the device tree?



The DT binding of child devices of the palmas are like that each sub
node has compatible.
So I don't think we can change this to avoid regression.

However, if we make the child devices independent of the parent devices
then it will be very useful to use across different PMIC if they have
same IP.
Currently, child devices are very much tightly coupled with parent
devices for the register access and global structure member access.

This is exactly what we did for the max77686 RTC driver which is used by
max77686, max77802 and max77620.

There are two mfd core drivers, max77686 and max77620, and both use the
same RTC driver, rtc-max77686.c


Laxman,

Sorry for responding late on this thread. The new method of dt parsing
expects the driver to populate vsel_reg, vsel_mask, enable_reg and
enable_mask.

The inherent difference in palmas regulator driver w.r.t handling 
regulators is that this driver treats smps and ldo differently. It has 
separate read/write functions for both and goes by separate base 
addresses for smps and ldo. Now to get all this unified under one 
regulator_desc array a lot of code churn would be needed in both header 
and C files. Not sure if that is okay.


Regards,
Keerthy





Re: [PATCH 0/4] regulator: tps65917/palmas: Cleanups and bugfixes

2016-07-07 Thread Keerthy



On Friday 20 May 2016 11:46 AM, Laxman Dewangan wrote:


On Friday 20 May 2016 10:01 AM, Keerthy wrote:

+ Lee Jones

On Saturday 07 May 2016 12:31 AM, Nishanth Menon wrote:

On 05/06/2016 12:14 PM, Mark Brown wrote:

On Fri, May 06, 2016 at 12:44:23PM +0530, Laxman Dewangan wrote:


When you are here, can you implement the dt parsing with the new
method from
regulator framework.
Regulator FW calls callback to parse customized DT property, just
need to
pass the node and pointer when registering.



This will help a lot with cleanups and readability.


Yes, please.


yeah, the driver has started showing its age, it will be good to do a
refactor.


Laxman,

I got the dt parsing with new method from regulator framework part, But
by new method do you also want to remove the dt compatible of
regulators and let only the mfd compatible stay?

replace of_platform_populate with mfd_add_devices so that linux
handles the drivers split up and not the device tree?



The DT binding of child devices of the palmas are like that each sub
node has compatible.
So I don't think we can change this to avoid regression.

However, if we make the child devices independent of the parent devices
then it will be very useful to use across different PMIC if they have
same IP.
Currently, child devices are very much tightly coupled with parent
devices for the register access and global structure member access.

This is exactly what we did for the max77686 RTC driver which is used by
max77686, max77802 and max77620.

There are two mfd core drivers, max77686 and max77620, and both use the
same RTC driver, rtc-max77686.c


Laxman,

Sorry for responding late on this thread. The new method of dt parsing
expects the driver to populate vsel_reg, vsel_mask, enable_reg and
enable_mask.

The inherent difference in palmas regulator driver w.r.t handling 
regulators is that this driver treats smps and ldo differently. It has 
separate read/write functions for both and goes by separate base 
addresses for smps and ldo. Now to get all this unified under one 
regulator_desc array a lot of code churn would be needed in both header 
and C files. Not sure if that is okay.


Regards,
Keerthy





Re: [PATCH 0/2] Fix issue with alternatives/paravirt patches

2016-07-07 Thread Christopher Arges
On Tue, Jul 05, 2016 at 10:34:58PM -0400, Jessica Yu wrote:
> Hi,
> 
> A few months ago, Chris Arges reported a bug involving alternatives/paravirt
> patching that was discussed here [1] and here [2]. To briefly summarize the
> bug, patch modules that contained .altinstructions or .parainstructions
> sections would break because these alternative/paravirt patches would be
> applied first by the module loader (see x86 module_finalize()), then
> livepatch would later clobber these patches when applying per-object
> relocations. This led to crashes and unpredictable behavior.
> 
> One conclusion we reached from our last discussion was that we will
> need to introduce some arch-specific code to address this problem.
> This patchset presents a possible fix for the bug by adding a new
> arch-specific arch_klp_init_object_loaded() function that by default
> does nothing but can be overridden by different arches.
> 
> To fix this issue for x86, since we can access a patch module's Elf
> sections through mod->klp_info, we can simply delay the calls to
> apply_paravirt() and apply_alternatives() to arch_klp_init_object_loaded(),
> which is called after relocations have been written for an object.
> In addition, for patch modules, .parainstructions and .altinstructions are
> prefixed by ".klp.arch.${objname}" so that the module loader ignores them
> and livepatch can apply them manually.
> 
> Currently for kpatch, we don't support including jump table sections in
> the patch module, and supporting .smp_locks is currently broken, so we
> don't consider those sections (for now).
> 
> I did some light testing with some patches to kvm and verified that the
> original issue reported in [2] was fixed.
> 
> Based on linux-next.
> 

Jessica,

I was able to test these patches on top of linux-next. I took your kpatch
branch and hacked it a bit to get it working and was able to
apply a patch to 'kvm_arch_vm_ioctl' while running a VM workload.

Great job!

Tested-by: Chris J Arges 

--chris

> [1] http://thread.gmane.org/gmane.linux.kernel/2185604/
> [2] https://github.com/dynup/kpatch/issues/580
> 
> Jessica Yu (2):
>   livepatch: use arch_klp_init_object_loaded() to finish arch-specific tasks
>   livepatch/x86: apply alternatives and paravirt patches after relocations
> 
>  arch/x86/kernel/Makefile|  1 +
>  arch/x86/kernel/livepatch.c | 66 
> +
>  include/linux/livepatch.h   |  3 +++
>  kernel/livepatch/core.c | 12 +++--
>  4 files changed, 80 insertions(+), 2 deletions(-)
>  create mode 100644 arch/x86/kernel/livepatch.c
> 
> -- 
> 2.4.3
> 


Re: [PATCH 0/2] Fix issue with alternatives/paravirt patches

2016-07-07 Thread Christopher Arges
On Tue, Jul 05, 2016 at 10:34:58PM -0400, Jessica Yu wrote:
> Hi,
> 
> A few months ago, Chris Arges reported a bug involving alternatives/paravirt
> patching that was discussed here [1] and here [2]. To briefly summarize the
> bug, patch modules that contained .altinstructions or .parainstructions
> sections would break because these alternative/paravirt patches would be
> applied first by the module loader (see x86 module_finalize()), then
> livepatch would later clobber these patches when applying per-object
> relocations. This led to crashes and unpredictable behavior.
> 
> One conclusion we reached from our last discussion was that we will
> need to introduce some arch-specific code to address this problem.
> This patchset presents a possible fix for the bug by adding a new
> arch-specific arch_klp_init_object_loaded() function that by default
> does nothing but can be overridden by different arches.
> 
> To fix this issue for x86, since we can access a patch module's Elf
> sections through mod->klp_info, we can simply delay the calls to
> apply_paravirt() and apply_alternatives() to arch_klp_init_object_loaded(),
> which is called after relocations have been written for an object.
> In addition, for patch modules, .parainstructions and .altinstructions are
> prefixed by ".klp.arch.${objname}" so that the module loader ignores them
> and livepatch can apply them manually.
> 
> Currently for kpatch, we don't support including jump table sections in
> the patch module, and supporting .smp_locks is currently broken, so we
> don't consider those sections (for now).
> 
> I did some light testing with some patches to kvm and verified that the
> original issue reported in [2] was fixed.
> 
> Based on linux-next.
> 

Jessica,

I was able to test these patches on top of linux-next. I took your kpatch
branch and hacked it a bit to get it working and was able to
apply a patch to 'kvm_arch_vm_ioctl' while running a VM workload.

Great job!

Tested-by: Chris J Arges 

--chris

> [1] http://thread.gmane.org/gmane.linux.kernel/2185604/
> [2] https://github.com/dynup/kpatch/issues/580
> 
> Jessica Yu (2):
>   livepatch: use arch_klp_init_object_loaded() to finish arch-specific tasks
>   livepatch/x86: apply alternatives and paravirt patches after relocations
> 
>  arch/x86/kernel/Makefile|  1 +
>  arch/x86/kernel/livepatch.c | 66 
> +
>  include/linux/livepatch.h   |  3 +++
>  kernel/livepatch/core.c | 12 +++--
>  4 files changed, 80 insertions(+), 2 deletions(-)
>  create mode 100644 arch/x86/kernel/livepatch.c
> 
> -- 
> 2.4.3
> 


Re: [PATCH v2 0/5] dmaengine: vdma: AXI DMAS Enhancments

2016-07-07 Thread Vinod Koul
On Fri, Jun 24, 2016 at 10:51:21AM +0530, Kedareswara rao Appana wrote:
> This patch series does the following thing.
> ---> Add support for AXI DMA Multi-channel DMA mode.
> ---> Delete AXI DMA binding doc.
> ---> Rename the driver and update config options.

Applied after changing the patch tags for driver name for last two.

-- 
~Vinod


Re: [PATCH v2 0/5] dmaengine: vdma: AXI DMAS Enhancments

2016-07-07 Thread Vinod Koul
On Fri, Jun 24, 2016 at 10:51:21AM +0530, Kedareswara rao Appana wrote:
> This patch series does the following thing.
> ---> Add support for AXI DMA Multi-channel DMA mode.
> ---> Delete AXI DMA binding doc.
> ---> Rename the driver and update config options.

Applied after changing the patch tags for driver name for last two.

-- 
~Vinod


[git pull] drm fixes

2016-07-07 Thread Dave Airlie

Hi Linus,

One nouveau fix, and a few AMD Polaris fixes and some Allwinner fixes.

I've got some vmware fixes that I might send separate over the weekend, 
they fix some black screens, but I'm still debating them.

Dave.

The following changes since commit a99cde438de0c4c0cecc1d1af1a55a75b10bfdef:

  Linux 4.7-rc6 (2016-07-03 23:01:00 -0700)

are available in the git repository at:

  git://people.freedesktop.org/~airlied/linux tags/drm-fixes-for-v4.7-rc7

for you to fetch changes up to 39c8859418d5d2d29482fcd7d58daba6e299fac5:

  Merge tag 'sunxi-drm-fixes-for-4.7-2' of 
https://git.kernel.org/pub/scm/linux/kernel/git/mripard/linux into drm-fixes 
(2016-07-08 13:29:11 +1000)


Ben Skeggs (1):
  drm/nouveau/disp/sor/gf119: select correct sor when poking training 
pattern

Dave Airlie (3):
  Merge branch 'drm-fixes-4.7' of git://people.freedesktop.org/~agd5f/linux 
into drm-fixes
  Merge branch 'linux-4.7' of git://github.com/skeggsb/linux into drm-fixes
  Merge tag 'sunxi-drm-fixes-for-4.7-2' of 
https://git.kernel.org/.../mripard/linux into drm-fixes

Huang Rui (2):
  drm/amd/powerplay: fix incorrect voltage table value for polaris10
  drm/amd/powerplay: fix incorrect voltage table value for tonga

Maxime Ripard (2):
  drm/sun4i: Report proper vblank
  drm/sun4i: Send vblank event when the CRTC is disabled

Peter Chen (1):
  gpu: drm: sun4i_drv: add missing of_node_put after calling 
of_parse_phandle

Rex Zhu (3):
  drm/amd/powerplay: incorrectly use of the function return value
  drm/amd/powerplay: fix bug that get wrong polaris evv voltage.
  drm/amd/powerplay: Update CKS on/ CKS off voltage offset calculation.

 .../gpu/drm/amd/powerplay/hwmgr/polaris10_hwmgr.c  | 32 ++
 drivers/gpu/drm/amd/powerplay/hwmgr/ppatomctrl.c   |  4 +--
 drivers/gpu/drm/amd/powerplay/hwmgr/ppatomctrl.h   |  2 +-
 drivers/gpu/drm/amd/powerplay/hwmgr/tonga_hwmgr.c  |  2 +-
 .../amd/powerplay/hwmgr/tonga_processpptables.c|  2 +-
 .../gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c|  3 +-
 drivers/gpu/drm/sun4i/sun4i_crtc.c |  8 ++
 drivers/gpu/drm/sun4i/sun4i_drv.c  |  3 +-
 8 files changed, 32 insertions(+), 24 deletions(-)


[git pull] drm fixes

2016-07-07 Thread Dave Airlie

Hi Linus,

One nouveau fix, and a few AMD Polaris fixes and some Allwinner fixes.

I've got some vmware fixes that I might send separate over the weekend, 
they fix some black screens, but I'm still debating them.

Dave.

The following changes since commit a99cde438de0c4c0cecc1d1af1a55a75b10bfdef:

  Linux 4.7-rc6 (2016-07-03 23:01:00 -0700)

are available in the git repository at:

  git://people.freedesktop.org/~airlied/linux tags/drm-fixes-for-v4.7-rc7

for you to fetch changes up to 39c8859418d5d2d29482fcd7d58daba6e299fac5:

  Merge tag 'sunxi-drm-fixes-for-4.7-2' of 
https://git.kernel.org/pub/scm/linux/kernel/git/mripard/linux into drm-fixes 
(2016-07-08 13:29:11 +1000)


Ben Skeggs (1):
  drm/nouveau/disp/sor/gf119: select correct sor when poking training 
pattern

Dave Airlie (3):
  Merge branch 'drm-fixes-4.7' of git://people.freedesktop.org/~agd5f/linux 
into drm-fixes
  Merge branch 'linux-4.7' of git://github.com/skeggsb/linux into drm-fixes
  Merge tag 'sunxi-drm-fixes-for-4.7-2' of 
https://git.kernel.org/.../mripard/linux into drm-fixes

Huang Rui (2):
  drm/amd/powerplay: fix incorrect voltage table value for polaris10
  drm/amd/powerplay: fix incorrect voltage table value for tonga

Maxime Ripard (2):
  drm/sun4i: Report proper vblank
  drm/sun4i: Send vblank event when the CRTC is disabled

Peter Chen (1):
  gpu: drm: sun4i_drv: add missing of_node_put after calling 
of_parse_phandle

Rex Zhu (3):
  drm/amd/powerplay: incorrectly use of the function return value
  drm/amd/powerplay: fix bug that get wrong polaris evv voltage.
  drm/amd/powerplay: Update CKS on/ CKS off voltage offset calculation.

 .../gpu/drm/amd/powerplay/hwmgr/polaris10_hwmgr.c  | 32 ++
 drivers/gpu/drm/amd/powerplay/hwmgr/ppatomctrl.c   |  4 +--
 drivers/gpu/drm/amd/powerplay/hwmgr/ppatomctrl.h   |  2 +-
 drivers/gpu/drm/amd/powerplay/hwmgr/tonga_hwmgr.c  |  2 +-
 .../amd/powerplay/hwmgr/tonga_processpptables.c|  2 +-
 .../gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c|  3 +-
 drivers/gpu/drm/sun4i/sun4i_crtc.c |  8 ++
 drivers/gpu/drm/sun4i/sun4i_drv.c  |  3 +-
 8 files changed, 32 insertions(+), 24 deletions(-)


Re: [PATCH v3 3/4] perf annotate: add powerpc support

2016-07-07 Thread Ravi Bangoria

Hi Michael,

On Wednesday 06 July 2016 03:38 PM, Michael Ellerman wrote:

Ravi Bangoria  writes:


On Thursday 30 June 2016 11:51 AM, Michael Ellerman wrote:

On Thu, 2016-06-30 at 11:44 +0530, Ravi Bangoria wrote:

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 36a5825..b87eac7 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -476,6 +481,125 @@ static int ins__cmp(const void *a, const void *b)

...

+
+static struct ins *ins__find_powerpc(const char *name)
+{
+   int i;
+   struct ins *ins;
+   struct ins_ops *ops;
+   static struct instructions_powerpc head;
+   static bool list_initialized;
+
+   /*
+* - Interested only if instruction starts with 'b'.
+* - Few start with 'b', but aren't branch instructions.
+* - Let's also ignore instructions involving 'ctr' and
+*   'tar' since target branch addresses for those can't
+*   be determined statically.
+*/
+   if (name[0] != 'b' ||
+   !strncmp(name, "bcd", 3)   ||
+   !strncmp(name, "brinc", 5) ||
+   !strncmp(name, "bper", 4)  ||
+   strstr(name, "ctr")||
+   strstr(name, "tar"))
+   return NULL;

It would be good if 'bctr' was at least recognised as a branch, even if we
can't determine the target. They are very common.

We cannot show an arrow for this since we don't know the target location.
Can you please suggest how you intend perf to display bctr?

Yeah I understand you can't show an arrow.

I guess it could just be an unterminated arrow? But I'm not sure if
that's easy to do with the way the UI is constructed. eg. something
like:

 ld  r12,0(r12)
 mtctr   r12
 bctrl  -->
 ld  r3,-32704(r2)

But that's just an idea.


I've sent v4, which enables annotate for 'bctr' instructions.

for 'bctr', it will show down arrow(indicate jump) and 'bctrl' will show
right arrow(indicate call). But no navigation options will be provided.
By pressing Enter key on that, message will be shown that like
"Invalid target"

Please review it.


bctr can be classified into two variants -- 'bctr' and 'bctrl'.

'bctr' will be considered as jump instruction but jump__parse() won't
be able to find any target location and hence it will set target to
UINT64_MAX which transform 'bctr' to 'bctr UINT64_MAX'. This
looks misleading.

Agreed.


bctrl will be considered as call instruction but call_parse() won't
be able to find any target function and hence it won't show any
navigation arrow for this instruction. Which is same as filter it
beforehand.

OK.

Maybe what I'm asking for is an enhancement and can be done later.


It doesn't look like we have the opcode handy here? Could we get it somehow?
That would make this a *lot* more robust.

objdump prints machine code, but I don't know how difficult that would
be to parse to get opcode.

Normal objdump -d output includes the opcode, eg:

c000886c:   2c 2c 00 00 cmpdi   r12,0
 ^^^

The only thing you need to know is the endian and you can reconstruct
the raw instruction.

Then you can just decode the opcode, see how we do it in the kernel with
eg. instr_is_relative_branch().


I'm sorry. I was thinking that you wanted to show opcodes with perf
annotate. But you were asking to use the opcode instead of parsing
instructions.

This looks like rewrite parsing code. I don't know whether there is any
library already available for this which we can directly use. I'm thinking
about this.

- Ravi


cheers





Re: [PATCH v3 3/4] perf annotate: add powerpc support

2016-07-07 Thread Ravi Bangoria

Hi Michael,

On Wednesday 06 July 2016 03:38 PM, Michael Ellerman wrote:

Ravi Bangoria  writes:


On Thursday 30 June 2016 11:51 AM, Michael Ellerman wrote:

On Thu, 2016-06-30 at 11:44 +0530, Ravi Bangoria wrote:

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 36a5825..b87eac7 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -476,6 +481,125 @@ static int ins__cmp(const void *a, const void *b)

...

+
+static struct ins *ins__find_powerpc(const char *name)
+{
+   int i;
+   struct ins *ins;
+   struct ins_ops *ops;
+   static struct instructions_powerpc head;
+   static bool list_initialized;
+
+   /*
+* - Interested only if instruction starts with 'b'.
+* - Few start with 'b', but aren't branch instructions.
+* - Let's also ignore instructions involving 'ctr' and
+*   'tar' since target branch addresses for those can't
+*   be determined statically.
+*/
+   if (name[0] != 'b' ||
+   !strncmp(name, "bcd", 3)   ||
+   !strncmp(name, "brinc", 5) ||
+   !strncmp(name, "bper", 4)  ||
+   strstr(name, "ctr")||
+   strstr(name, "tar"))
+   return NULL;

It would be good if 'bctr' was at least recognised as a branch, even if we
can't determine the target. They are very common.

We cannot show an arrow for this since we don't know the target location.
Can you please suggest how you intend perf to display bctr?

Yeah I understand you can't show an arrow.

I guess it could just be an unterminated arrow? But I'm not sure if
that's easy to do with the way the UI is constructed. eg. something
like:

 ld  r12,0(r12)
 mtctr   r12
 bctrl  -->
 ld  r3,-32704(r2)

But that's just an idea.


I've sent v4, which enables annotate for 'bctr' instructions.

for 'bctr', it will show down arrow(indicate jump) and 'bctrl' will show
right arrow(indicate call). But no navigation options will be provided.
By pressing Enter key on that, message will be shown that like
"Invalid target"

Please review it.


bctr can be classified into two variants -- 'bctr' and 'bctrl'.

'bctr' will be considered as jump instruction but jump__parse() won't
be able to find any target location and hence it will set target to
UINT64_MAX which transform 'bctr' to 'bctr UINT64_MAX'. This
looks misleading.

Agreed.


bctrl will be considered as call instruction but call_parse() won't
be able to find any target function and hence it won't show any
navigation arrow for this instruction. Which is same as filter it
beforehand.

OK.

Maybe what I'm asking for is an enhancement and can be done later.


It doesn't look like we have the opcode handy here? Could we get it somehow?
That would make this a *lot* more robust.

objdump prints machine code, but I don't know how difficult that would
be to parse to get opcode.

Normal objdump -d output includes the opcode, eg:

c000886c:   2c 2c 00 00 cmpdi   r12,0
 ^^^

The only thing you need to know is the endian and you can reconstruct
the raw instruction.

Then you can just decode the opcode, see how we do it in the kernel with
eg. instr_is_relative_branch().


I'm sorry. I was thinking that you wanted to show opcodes with perf
annotate. But you were asking to use the opcode instead of parsing
instructions.

This looks like rewrite parsing code. I don't know whether there is any
library already available for this which we can directly use. I'm thinking
about this.

- Ravi


cheers





Re: [PATCH v2] kexec: Fix kdump failure with notsc

2016-07-07 Thread Wei, Jiangang
Hi , Eric

Thanks for your comments firstly.

On Thu, 2016-07-07 at 12:55 -0500, Eric W. Biederman wrote:
> Wei Jiangang  writes:
> 
> > If we specify the 'notsc' boot parameter for the dump-capture kernel,
> > and then trigger a crash(panic) by using "ALT-SysRq-c" or "echo c >
> > /proc/sysrq-trigger",
> > the dump-capture kernel will hang in calibrate_delay_converge():
> >
> > /* wait for "start of" clock tick */
> > ticks = jiffies;
> > while (ticks == jiffies)
> > ; /* nothing */
> >
> > serial log of the hang is as follows:
> >
> > tsc: Fast TSC calibration using PIT
> > tsc: Detected 2099.947 MHz processor
> > Calibrating delay loop...
> >
> > The reason is that the dump-capture kernel hangs in while loops and
> > waits for jiffies to be updated, but no timer interrupts is passed
> > to BSP by APIC.
> >
> > In fact, the local APIC was disabled in reboot and crash path by
> > lapic_shutdown(). We need to put APIC in legacy mode in kexec jump path
> > (put the system into PIT during the crash kernel),
> > so that the dump-capture kernel can get timer interrupts.
> >
> > BTW,
> > I found the buggy commit 522e66464467 ("x86/apic: Disable I/O APIC
> > before shutdown of the local APIC") via bisection.
> >
> > Originally, I want to revert it.
> > But Ingo Molnar comments that "By reverting the change can paper over
> > the bug, but re-introduce the bug that can result in certain CPUs hanging
> > if IO-APIC sends an APIC message if the lapic is disabled prematurely"
> > And I think it's pertinent.
> 
> Sigh.  Can we please just do the work to rip out the apic shutdown code
> from the kexec on panic code path?

Do you mean remove the calls for disable_IO_APIC() and lapic_shutdown()
in native_machine_crash_shutdown()?

If so, I have tried it, but it doesn't work for this problem.
> 
> I forget the details, but the only reason we do any apic shutdown
> is bugs in older kernels that could not initialize a system properly
> if we did not shut down the apics.
> 
> I certainly don't see an issue with goofy cases like notsc not working
> on a crash capture kernel if we are not initializing the hardware
> properly.
> 
> The strategy really needs to be to only do the absolutely essential
> hardware shutdown in the crashing kernel, every additional line of code
> we execute in the crashing kernel increases our chances of hitting a
> bug.
> 
> Under that policy things like requiring we don't pass boot options that
> inhibit the dump capture kernel from initializing the hardware from a
> random state are reasonable requirements.  AKA I don't see any
> justification in this as to why we would even want to support notsc
> on the dump capture kernel.  Especially when things clearly work when
> that option is not specified.

firstly do some clarification,

My commit message mentioned that "specify the 'notsc' boot parameter for
the dump-capture kernel ". 
That's just the reproducing method used by myself for this problem.

In fact, If we specify notsc only for the first kernel,  which also can
trigger the bug.


And secondly,

In multiple CPU configurations the TSC values on different processors
may be different, 
which may cause random (bad) results.

for example,
FUJITSU's server (PRIMEQUEST 2000 series) supports Dynamic
Reconfiguration.
http://www.fujitsu.com/global/products/computing/servers/mission-critical/primequest/technology/availability/dynamic-reconfiguration.html

This feature enables to hot-add system board which contains cpus and
memories, this means some cpus can be hot-added to system. 
tsc of hot-added cpus is not consistent with tsc of
existing-from-boot-time cpus. (though hardware and firmware make an
effort to specify the same tsc value as existing one)

PRIMEQUEST can happen this tsc-inconsistency, we recommend to specify
"notsc" boot option for Dynamic Reconfiguration users.

so we really need to specify 'notsc'.

Regards,
wei

> Eric
> 
> 
> > Signed-off-by: Wei Jiangang 
> > ---
> >  arch/x86/include/asm/apic.h| 5 +
> >  arch/x86/kernel/apic/apic.c| 9 +
> >  arch/x86/kernel/machine_kexec_32.c | 5 ++---
> >  arch/x86/kernel/machine_kexec_64.c | 6 +++---
> >  4 files changed, 19 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> > index bc27611fa58f..5d7e635e580a 100644
> > --- a/arch/x86/include/asm/apic.h
> > +++ b/arch/x86/include/asm/apic.h
> > @@ -128,6 +128,7 @@ extern void clear_local_APIC(void);
> >  extern void disconnect_bsp_APIC(int virt_wire_setup);
> >  extern void disable_local_APIC(void);
> >  extern void lapic_shutdown(void);
> > +extern int lapic_disabled(void);
> >  extern void sync_Arb_IDs(void);
> >  extern void init_bsp_APIC(void);
> >  extern void setup_local_APIC(void);
> > @@ -165,6 +166,10 @@ extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 
> > msg_type, u8 mask);
> >  
> >  #else /* 

Re: [PATCH v2] kexec: Fix kdump failure with notsc

2016-07-07 Thread Wei, Jiangang
Hi , Eric

Thanks for your comments firstly.

On Thu, 2016-07-07 at 12:55 -0500, Eric W. Biederman wrote:
> Wei Jiangang  writes:
> 
> > If we specify the 'notsc' boot parameter for the dump-capture kernel,
> > and then trigger a crash(panic) by using "ALT-SysRq-c" or "echo c >
> > /proc/sysrq-trigger",
> > the dump-capture kernel will hang in calibrate_delay_converge():
> >
> > /* wait for "start of" clock tick */
> > ticks = jiffies;
> > while (ticks == jiffies)
> > ; /* nothing */
> >
> > serial log of the hang is as follows:
> >
> > tsc: Fast TSC calibration using PIT
> > tsc: Detected 2099.947 MHz processor
> > Calibrating delay loop...
> >
> > The reason is that the dump-capture kernel hangs in while loops and
> > waits for jiffies to be updated, but no timer interrupts is passed
> > to BSP by APIC.
> >
> > In fact, the local APIC was disabled in reboot and crash path by
> > lapic_shutdown(). We need to put APIC in legacy mode in kexec jump path
> > (put the system into PIT during the crash kernel),
> > so that the dump-capture kernel can get timer interrupts.
> >
> > BTW,
> > I found the buggy commit 522e66464467 ("x86/apic: Disable I/O APIC
> > before shutdown of the local APIC") via bisection.
> >
> > Originally, I want to revert it.
> > But Ingo Molnar comments that "By reverting the change can paper over
> > the bug, but re-introduce the bug that can result in certain CPUs hanging
> > if IO-APIC sends an APIC message if the lapic is disabled prematurely"
> > And I think it's pertinent.
> 
> Sigh.  Can we please just do the work to rip out the apic shutdown code
> from the kexec on panic code path?

Do you mean remove the calls for disable_IO_APIC() and lapic_shutdown()
in native_machine_crash_shutdown()?

If so, I have tried it, but it doesn't work for this problem.
> 
> I forget the details, but the only reason we do any apic shutdown
> is bugs in older kernels that could not initialize a system properly
> if we did not shut down the apics.
> 
> I certainly don't see an issue with goofy cases like notsc not working
> on a crash capture kernel if we are not initializing the hardware
> properly.
> 
> The strategy really needs to be to only do the absolutely essential
> hardware shutdown in the crashing kernel, every additional line of code
> we execute in the crashing kernel increases our chances of hitting a
> bug.
> 
> Under that policy things like requiring we don't pass boot options that
> inhibit the dump capture kernel from initializing the hardware from a
> random state are reasonable requirements.  AKA I don't see any
> justification in this as to why we would even want to support notsc
> on the dump capture kernel.  Especially when things clearly work when
> that option is not specified.

firstly do some clarification,

My commit message mentioned that "specify the 'notsc' boot parameter for
the dump-capture kernel ". 
That's just the reproducing method used by myself for this problem.

In fact, If we specify notsc only for the first kernel,  which also can
trigger the bug.


And secondly,

In multiple CPU configurations the TSC values on different processors
may be different, 
which may cause random (bad) results.

for example,
FUJITSU's server (PRIMEQUEST 2000 series) supports Dynamic
Reconfiguration.
http://www.fujitsu.com/global/products/computing/servers/mission-critical/primequest/technology/availability/dynamic-reconfiguration.html

This feature enables to hot-add system board which contains cpus and
memories, this means some cpus can be hot-added to system. 
tsc of hot-added cpus is not consistent with tsc of
existing-from-boot-time cpus. (though hardware and firmware make an
effort to specify the same tsc value as existing one)

PRIMEQUEST can happen this tsc-inconsistency, we recommend to specify
"notsc" boot option for Dynamic Reconfiguration users.

so we really need to specify 'notsc'.

Regards,
wei

> Eric
> 
> 
> > Signed-off-by: Wei Jiangang 
> > ---
> >  arch/x86/include/asm/apic.h| 5 +
> >  arch/x86/kernel/apic/apic.c| 9 +
> >  arch/x86/kernel/machine_kexec_32.c | 5 ++---
> >  arch/x86/kernel/machine_kexec_64.c | 6 +++---
> >  4 files changed, 19 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> > index bc27611fa58f..5d7e635e580a 100644
> > --- a/arch/x86/include/asm/apic.h
> > +++ b/arch/x86/include/asm/apic.h
> > @@ -128,6 +128,7 @@ extern void clear_local_APIC(void);
> >  extern void disconnect_bsp_APIC(int virt_wire_setup);
> >  extern void disable_local_APIC(void);
> >  extern void lapic_shutdown(void);
> > +extern int lapic_disabled(void);
> >  extern void sync_Arb_IDs(void);
> >  extern void init_bsp_APIC(void);
> >  extern void setup_local_APIC(void);
> > @@ -165,6 +166,10 @@ extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 
> > msg_type, u8 mask);
> >  
> >  #else /* !CONFIG_X86_LOCAL_APIC */
> >  static inline void 

[PATCH v4 2/3] perf annotate: Enable cross arch annotate

2016-07-07 Thread Ravi Bangoria
Change current data structures and function to enable cross arch
annotate.

Current implementation does not contain logic of record on one arch
and annotating on other. This remote annotate is partially possible
with current implementation for x86 (or may be arm as well) only.
But, to make remote annotation work properly, all architecture
instruction tables need to be included in the perf binary. And while
annotating, look for instruction table where perf.data was recorded.

Signed-off-by: Ravi Bangoria 
---
Changes in v4:
  - __maybe_unused was misplaced at a few locations. Corrected it.

 tools/perf/builtin-top.c  |   2 +-
 tools/perf/ui/browsers/annotate.c |   3 +-
 tools/perf/ui/gtk/annotate.c  |   2 +-
 tools/perf/util/annotate.c| 134 --
 tools/perf/util/annotate.h|   5 +-
 5 files changed, 93 insertions(+), 53 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 07fc792..d4fd947 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -128,7 +128,7 @@ static int perf_top__parse_source(struct perf_top *top, 
struct hist_entry *he)
return err;
}
 
-   err = symbol__annotate(sym, map, 0);
+   err = symbol__annotate(sym, map, 0, NULL);
if (err == 0) {
 out_assign:
top->sym_filter_entry = he;
diff --git a/tools/perf/ui/browsers/annotate.c 
b/tools/perf/ui/browsers/annotate.c
index 29dc6d2..3a652a6f 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -1050,7 +1050,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map 
*map,
  (nr_pcnt - 1);
}
 
-   if (symbol__annotate(sym, map, sizeof_bdl) < 0) {
+   if (symbol__annotate(sym, map, sizeof_bdl,
+perf_evsel__env_arch(evsel)) < 0) {
ui__error("%s", ui_helpline__last_msg);
goto out_free_offsets;
}
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index 9c7ff8d..d7150b3 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -166,7 +166,7 @@ static int symbol__gtk_annotate(struct symbol *sym, struct 
map *map,
if (map->dso->annotate_warned)
return -1;
 
-   if (symbol__annotate(sym, map, 0) < 0) {
+   if (symbol__annotate(sym, map, 0, perf_evsel__env_arch(evsel)) < 0) {
ui__error("%s", ui_helpline__current);
return -1;
}
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index e9825fe..32889ce 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -20,12 +20,14 @@
 #include 
 #include 
 #include 
+#include 
+#include "../arch/common.h"
 
 const char *disassembler_style;
 const char *objdump_path;
 static regex_t  file_lineno;
 
-static struct ins *ins__find(const char *name);
+static struct ins *ins__find(const char *name, const char *norm_arch);
 static int disasm_line__parse(char *line, char **namep, char **rawp);
 
 static void ins__delete(struct ins_operands *ops)
@@ -53,7 +55,7 @@ int ins__scnprintf(struct ins *ins, char *bf, size_t size,
return ins__raw_scnprintf(ins, bf, size, ops);
 }
 
-static int call__parse(struct ins_operands *ops)
+static int call__parse(struct ins_operands *ops, const char *norm_arch)
 {
char *endptr, *tok, *name;
 
@@ -65,10 +67,8 @@ static int call__parse(struct ins_operands *ops)
 
name++;
 
-#ifdef __arm__
-   if (strchr(name, '+'))
+   if (!strcmp(norm_arch, NORM_ARM) && strchr(name, '+'))
return -1;
-#endif
 
tok = strchr(name, '>');
if (tok == NULL)
@@ -117,7 +117,8 @@ bool ins__is_call(const struct ins *ins)
return ins->ops == &call_ops;
 }
 
-static int jump__parse(struct ins_operands *ops)
+static int jump__parse(struct ins_operands *ops,
+  const char *norm_arch __maybe_unused)
 {
const char *s = strchr(ops->raw, '+');
 
@@ -172,7 +173,7 @@ static int comment__symbol(char *raw, char *comment, u64 
*addrp, char **namep)
return 0;
 }
 
-static int lock__parse(struct ins_operands *ops)
+static int lock__parse(struct ins_operands *ops, const char *norm_arch)
 {
char *name;
 
@@ -183,7 +184,7 @@ static int lock__parse(struct ins_operands *ops)
if (disasm_line__parse(ops->raw, &name, &ops->locked.ops->raw) < 0)
goto out_free_ops;
 
-   ops->locked.ins = ins__find(name);
+   ops->locked.ins = ins__find(name, norm_arch);
free(name);
 
if (ops->locked.ins == NULL)
@@ -193,7 +194,7 @@ static int lock__parse(struct ins_operands *ops)
return 0;
 
if (ops->locked.ins->ops->parse &&
-   ops->locked.ins->ops->parse(ops->locked.ops) < 0)
+   ops->locked.ins->ops->parse(ops->locked.ops, norm_arch) < 0)
goto out_free_ops;
 

[PATCH v4 3/3] perf annotate: add powerpc support

2016-07-07 Thread Ravi Bangoria
From: Naveen N. Rao 

Powerpc has long list of branch instructions and hardcoding them in
table appears to be error-prone. So, add new function to find
instruction instead of creating table. This function dynamically
create table(list of 'struct ins'), and instead of creating object
every time, first check if list already contain object for that
instruction.

Signed-off-by: Naveen N. Rao 
Signed-off-by: Ravi Bangoria 
---
Changes in v4:
  - Added support for branch instructions that includes 'ctr'

 tools/perf/util/annotate.c | 155 +++--
 tools/perf/util/annotate.h |   3 +-
 2 files changed, 150 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 32889ce..9de1271 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -55,10 +55,15 @@ int ins__scnprintf(struct ins *ins, char *bf, size_t size,
return ins__raw_scnprintf(ins, bf, size, ops);
 }
 
-static int call__parse(struct ins_operands *ops, const char *norm_arch)
+static int call__parse(char *ins_name, struct ins_operands *ops,
+  const char *norm_arch)
 {
char *endptr, *tok, *name;
 
+   /* Special case for powerpc */
+   if (!strcmp(norm_arch, NORM_POWERPC) && strstr(ins_name, "ctr"))
+   return 0;
+
ops->target.addr = strtoull(ops->raw, , 16);
 
name = strchr(endptr, '<');
@@ -117,7 +122,7 @@ bool ins__is_call(const struct ins *ins)
return ins->ops == _ops;
 }
 
-static int jump__parse(struct ins_operands *ops,
+static int jump__parse(char *ins_name __maybe_unused, struct ins_operands *ops,
   const char *norm_arch __maybe_unused)
 {
const char *s = strchr(ops->raw, '+');
@@ -135,6 +140,13 @@ static int jump__parse(struct ins_operands *ops,
 static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
   struct ins_operands *ops)
 {
+   /*
+* Instructions that does not include target address in operand
+* like 'bctr' for powerpc.
+*/
+   if (!ops->target.addr)
+   return scnprintf(bf, size, "%-6.6s", ins->name);
+
return scnprintf(bf, size, "%-6.6s %" PRIx64, ins->name, 
ops->target.offset);
 }
 
@@ -173,7 +185,8 @@ static int comment__symbol(char *raw, char *comment, u64 
*addrp, char **namep)
return 0;
 }
 
-static int lock__parse(struct ins_operands *ops, const char *norm_arch)
+static int lock__parse(char *ins_name, struct ins_operands *ops,
+  const char *norm_arch)
 {
char *name;
 
@@ -194,7 +207,8 @@ static int lock__parse(struct ins_operands *ops, const char 
*norm_arch)
return 0;
 
if (ops->locked.ins->ops->parse &&
-   ops->locked.ins->ops->parse(ops->locked.ops, norm_arch) < 0)
+   ops->locked.ins->ops->parse(ins_name,
+   ops->locked.ops, norm_arch) < 0)
goto out_free_ops;
 
return 0;
@@ -237,7 +251,8 @@ static struct ins_ops lock_ops = {
.scnprintf = lock__scnprintf,
 };
 
-static int mov__parse(struct ins_operands *ops, const char *norm_arch)
+static int mov__parse(char *ins_name __maybe_unused, struct ins_operands *ops,
+ const char *norm_arch)
 {
char *s = strchr(ops->raw, ','), *target, *comment, prev;
 
@@ -304,7 +319,7 @@ static struct ins_ops mov_ops = {
.scnprintf = mov__scnprintf,
 };
 
-static int dec__parse(struct ins_operands *ops,
+static int dec__parse(char *ins_name __maybe_unused, struct ins_operands *ops,
  const char *norm_arch __maybe_unused)
 {
char *target, *comment, *s, prev;
@@ -459,6 +474,11 @@ static struct ins instructions_arm[] = {
{ .name = "bne",   .ops  = _ops, },
 };
 
+struct instructions_powerpc {
+   struct ins *ins;
+   struct list_head list;
+};
+
 static int ins__key_cmp(const void *name, const void *insp)
 {
const struct ins *ins = insp;
@@ -474,6 +494,125 @@ static int ins__cmp(const void *a, const void *b)
return strcmp(ia->name, ib->name);
 }
 
+static struct ins *list_add__ins_powerpc(struct instructions_powerpc *head,
+const char *name, struct ins_ops *ops)
+{
+   struct instructions_powerpc *ins_powerpc;
+   struct ins *ins;
+
+   ins = zalloc(sizeof(struct ins));
+   if (!ins)
+   return NULL;
+
+   ins_powerpc = zalloc(sizeof(struct instructions_powerpc));
+   if (!ins_powerpc)
+   goto out_free_ins;
+
+   ins->name = strdup(name);
+   if (!ins->name)
+   goto out_free_ins_power;
+
+   ins->ops = ops;
+   ins_powerpc->ins = ins;
+   list_add_tail(&(ins_powerpc->list), &(head->list));
+
+   return ins;
+
+out_free_ins_power:
+   zfree(_powerpc);

[PATCH v4 3/3] perf annotate: add powerpc support

2016-07-07 Thread Ravi Bangoria
From: Naveen N. Rao 

Powerpc has a long list of branch instructions, and hardcoding them in
a table appears to be error-prone. So, add a new function to find an
instruction instead of creating a table. This function dynamically
creates the table (a list of 'struct ins') and, instead of creating an
object every time, first checks whether the list already contains an
object for that instruction.

Signed-off-by: Naveen N. Rao 
Signed-off-by: Ravi Bangoria 
---
Changes in v4:
  - Added support for branch instructions that includes 'ctr'

 tools/perf/util/annotate.c | 155 +++--
 tools/perf/util/annotate.h |   3 +-
 2 files changed, 150 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 32889ce..9de1271 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -55,10 +55,15 @@ int ins__scnprintf(struct ins *ins, char *bf, size_t size,
return ins__raw_scnprintf(ins, bf, size, ops);
 }
 
-static int call__parse(struct ins_operands *ops, const char *norm_arch)
+static int call__parse(char *ins_name, struct ins_operands *ops,
+  const char *norm_arch)
 {
char *endptr, *tok, *name;
 
+   /* Special case for powerpc */
+   if (!strcmp(norm_arch, NORM_POWERPC) && strstr(ins_name, "ctr"))
+   return 0;
+
ops->target.addr = strtoull(ops->raw, , 16);
 
name = strchr(endptr, '<');
@@ -117,7 +122,7 @@ bool ins__is_call(const struct ins *ins)
return ins->ops == _ops;
 }
 
-static int jump__parse(struct ins_operands *ops,
+static int jump__parse(char *ins_name __maybe_unused, struct ins_operands *ops,
   const char *norm_arch __maybe_unused)
 {
const char *s = strchr(ops->raw, '+');
@@ -135,6 +140,13 @@ static int jump__parse(struct ins_operands *ops,
 static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
   struct ins_operands *ops)
 {
+   /*
+* Instructions that does not include target address in operand
+* like 'bctr' for powerpc.
+*/
+   if (!ops->target.addr)
+   return scnprintf(bf, size, "%-6.6s", ins->name);
+
return scnprintf(bf, size, "%-6.6s %" PRIx64, ins->name, 
ops->target.offset);
 }
 
@@ -173,7 +185,8 @@ static int comment__symbol(char *raw, char *comment, u64 
*addrp, char **namep)
return 0;
 }
 
-static int lock__parse(struct ins_operands *ops, const char *norm_arch)
+static int lock__parse(char *ins_name, struct ins_operands *ops,
+  const char *norm_arch)
 {
char *name;
 
@@ -194,7 +207,8 @@ static int lock__parse(struct ins_operands *ops, const char 
*norm_arch)
return 0;
 
if (ops->locked.ins->ops->parse &&
-   ops->locked.ins->ops->parse(ops->locked.ops, norm_arch) < 0)
+   ops->locked.ins->ops->parse(ins_name,
+   ops->locked.ops, norm_arch) < 0)
goto out_free_ops;
 
return 0;
@@ -237,7 +251,8 @@ static struct ins_ops lock_ops = {
.scnprintf = lock__scnprintf,
 };
 
-static int mov__parse(struct ins_operands *ops, const char *norm_arch)
+static int mov__parse(char *ins_name __maybe_unused, struct ins_operands *ops,
+ const char *norm_arch)
 {
char *s = strchr(ops->raw, ','), *target, *comment, prev;
 
@@ -304,7 +319,7 @@ static struct ins_ops mov_ops = {
.scnprintf = mov__scnprintf,
 };
 
-static int dec__parse(struct ins_operands *ops,
+static int dec__parse(char *ins_name __maybe_unused, struct ins_operands *ops,
  const char *norm_arch __maybe_unused)
 {
char *target, *comment, *s, prev;
@@ -459,6 +474,11 @@ static struct ins instructions_arm[] = {
{ .name = "bne",   .ops  = _ops, },
 };
 
+struct instructions_powerpc {
+   struct ins *ins;
+   struct list_head list;
+};
+
 static int ins__key_cmp(const void *name, const void *insp)
 {
const struct ins *ins = insp;
@@ -474,6 +494,125 @@ static int ins__cmp(const void *a, const void *b)
return strcmp(ia->name, ib->name);
 }
 
+static struct ins *list_add__ins_powerpc(struct instructions_powerpc *head,
+const char *name, struct ins_ops *ops)
+{
+   struct instructions_powerpc *ins_powerpc;
+   struct ins *ins;
+
+   ins = zalloc(sizeof(struct ins));
+   if (!ins)
+   return NULL;
+
+   ins_powerpc = zalloc(sizeof(struct instructions_powerpc));
+   if (!ins_powerpc)
+   goto out_free_ins;
+
+   ins->name = strdup(name);
+   if (!ins->name)
+   goto out_free_ins_power;
+
+   ins->ops = ops;
+   ins_powerpc->ins = ins;
+   list_add_tail(&(ins_powerpc->list), &(head->list));
+
+   return ins;
+
+out_free_ins_power:
+   zfree(_powerpc);
+out_free_ins:
+   zfree();
+   return NULL;
+}
+
+static struct ins 

[PATCH v4 2/3] perf annotate: Enable cross arch annotate

2016-07-07 Thread Ravi Bangoria
Change current data structures and function to enable cross arch
annotate.

The current implementation has no logic for recording on one arch
and annotating on another. Such remote annotation is partially possible
with the current implementation for x86 (or maybe arm as well) only.
To make remote annotation work properly, all architecture
instruction tables need to be included in the perf binary, and while
annotating, perf must look up the instruction table for the arch on
which perf.data was recorded.

Signed-off-by: Ravi Bangoria 
---
Changes in v4:
  - __maybe_unused was misplaced at a few locations. Corrected it.

 tools/perf/builtin-top.c  |   2 +-
 tools/perf/ui/browsers/annotate.c |   3 +-
 tools/perf/ui/gtk/annotate.c  |   2 +-
 tools/perf/util/annotate.c| 134 --
 tools/perf/util/annotate.h|   5 +-
 5 files changed, 93 insertions(+), 53 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 07fc792..d4fd947 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -128,7 +128,7 @@ static int perf_top__parse_source(struct perf_top *top, 
struct hist_entry *he)
return err;
}
 
-   err = symbol__annotate(sym, map, 0);
+   err = symbol__annotate(sym, map, 0, NULL);
if (err == 0) {
 out_assign:
top->sym_filter_entry = he;
diff --git a/tools/perf/ui/browsers/annotate.c 
b/tools/perf/ui/browsers/annotate.c
index 29dc6d2..3a652a6f 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -1050,7 +1050,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map 
*map,
  (nr_pcnt - 1);
}
 
-   if (symbol__annotate(sym, map, sizeof_bdl) < 0) {
+   if (symbol__annotate(sym, map, sizeof_bdl,
+perf_evsel__env_arch(evsel)) < 0) {
ui__error("%s", ui_helpline__last_msg);
goto out_free_offsets;
}
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index 9c7ff8d..d7150b3 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -166,7 +166,7 @@ static int symbol__gtk_annotate(struct symbol *sym, struct 
map *map,
if (map->dso->annotate_warned)
return -1;
 
-   if (symbol__annotate(sym, map, 0) < 0) {
+   if (symbol__annotate(sym, map, 0, perf_evsel__env_arch(evsel)) < 0) {
ui__error("%s", ui_helpline__current);
return -1;
}
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index e9825fe..32889ce 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -20,12 +20,14 @@
 #include 
 #include 
 #include 
+#include 
+#include "../arch/common.h"
 
 const char *disassembler_style;
 const char *objdump_path;
 static regex_t  file_lineno;
 
-static struct ins *ins__find(const char *name);
+static struct ins *ins__find(const char *name, const char *norm_arch);
 static int disasm_line__parse(char *line, char **namep, char **rawp);
 
 static void ins__delete(struct ins_operands *ops)
@@ -53,7 +55,7 @@ int ins__scnprintf(struct ins *ins, char *bf, size_t size,
return ins__raw_scnprintf(ins, bf, size, ops);
 }
 
-static int call__parse(struct ins_operands *ops)
+static int call__parse(struct ins_operands *ops, const char *norm_arch)
 {
char *endptr, *tok, *name;
 
@@ -65,10 +67,8 @@ static int call__parse(struct ins_operands *ops)
 
name++;
 
-#ifdef __arm__
-   if (strchr(name, '+'))
+   if (!strcmp(norm_arch, NORM_ARM) && strchr(name, '+'))
return -1;
-#endif
 
tok = strchr(name, '>');
if (tok == NULL)
@@ -117,7 +117,8 @@ bool ins__is_call(const struct ins *ins)
return ins->ops == _ops;
 }
 
-static int jump__parse(struct ins_operands *ops)
+static int jump__parse(struct ins_operands *ops,
+  const char *norm_arch __maybe_unused)
 {
const char *s = strchr(ops->raw, '+');
 
@@ -172,7 +173,7 @@ static int comment__symbol(char *raw, char *comment, u64 
*addrp, char **namep)
return 0;
 }
 
-static int lock__parse(struct ins_operands *ops)
+static int lock__parse(struct ins_operands *ops, const char *norm_arch)
 {
char *name;
 
@@ -183,7 +184,7 @@ static int lock__parse(struct ins_operands *ops)
if (disasm_line__parse(ops->raw, , >locked.ops->raw) < 0)
goto out_free_ops;
 
-   ops->locked.ins = ins__find(name);
+   ops->locked.ins = ins__find(name, norm_arch);
free(name);
 
if (ops->locked.ins == NULL)
@@ -193,7 +194,7 @@ static int lock__parse(struct ins_operands *ops)
return 0;
 
if (ops->locked.ins->ops->parse &&
-   ops->locked.ins->ops->parse(ops->locked.ops) < 0)
+   ops->locked.ins->ops->parse(ops->locked.ops, norm_arch) < 0)
goto out_free_ops;
 
return 0;
@@ -236,7 +237,7 @@ static 

[PATCH v4 1/3] perf: Define macro for normalized arch names

2016-07-07 Thread Ravi Bangoria
Define a macro for each normalized arch name and use them instead
of hard-coding the arch name as a string.

Signed-off-by: Ravi Bangoria 
---
Changes in v4:
  - Moved position of patch

 tools/perf/arch/common.c   | 36 ++--
 tools/perf/arch/common.h   | 11 +++
 tools/perf/util/unwind-libunwind.c |  4 ++--
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index ee69668..feb2113 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -122,25 +122,25 @@ static int lookup_triplets(const char *const *triplets, 
const char *name)
 const char *normalize_arch(char *arch)
 {
if (!strcmp(arch, "x86_64"))
-   return "x86";
+   return NORM_X86;
if (arch[0] == 'i' && arch[2] == '8' && arch[3] == '6')
-   return "x86";
+   return NORM_X86;
if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5))
-   return "sparc";
+   return NORM_SPARC;
if (!strcmp(arch, "aarch64") || !strcmp(arch, "arm64"))
-   return "arm64";
+   return NORM_ARM64;
if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110"))
-   return "arm";
+   return NORM_ARM;
if (!strncmp(arch, "s390", 4))
-   return "s390";
+   return NORM_S390;
if (!strncmp(arch, "parisc", 6))
-   return "parisc";
+   return NORM_PARISC;
if (!strncmp(arch, "powerpc", 7) || !strncmp(arch, "ppc", 3))
-   return "powerpc";
+   return NORM_POWERPC;
if (!strncmp(arch, "mips", 4))
-   return "mips";
+   return NORM_MIPS;
if (!strncmp(arch, "sh", 2) && isdigit(arch[2]))
-   return "sh";
+   return NORM_SH;
 
return arch;
 }
@@ -180,21 +180,21 @@ static int perf_env__lookup_binutils_path(struct perf_env 
*env,
zfree();
}
 
-   if (!strcmp(arch, "arm"))
+   if (!strcmp(arch, NORM_ARM))
path_list = arm_triplets;
-   else if (!strcmp(arch, "arm64"))
+   else if (!strcmp(arch, NORM_ARM64))
path_list = arm64_triplets;
-   else if (!strcmp(arch, "powerpc"))
+   else if (!strcmp(arch, NORM_POWERPC))
path_list = powerpc_triplets;
-   else if (!strcmp(arch, "sh"))
+   else if (!strcmp(arch, NORM_SH))
path_list = sh_triplets;
-   else if (!strcmp(arch, "s390"))
+   else if (!strcmp(arch, NORM_S390))
path_list = s390_triplets;
-   else if (!strcmp(arch, "sparc"))
+   else if (!strcmp(arch, NORM_SPARC))
path_list = sparc_triplets;
-   else if (!strcmp(arch, "x86"))
+   else if (!strcmp(arch, NORM_X86))
path_list = x86_triplets;
-   else if (!strcmp(arch, "mips"))
+   else if (!strcmp(arch, NORM_MIPS))
path_list = mips_triplets;
else {
ui__error("binutils for %s not supported.\n", arch);
diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h
index 6b01c73..14ca8ca 100644
--- a/tools/perf/arch/common.h
+++ b/tools/perf/arch/common.h
@@ -5,6 +5,17 @@
 
 extern const char *objdump_path;
 
+/* Macro for normalized arch names */
+#define NORM_X86   "x86"
+#define NORM_SPARC "sparc"
+#define NORM_ARM64 "arm64"
+#define NORM_ARM   "arm"
+#define NORM_S390  "s390"
+#define NORM_PARISC"parisc"
+#define NORM_POWERPC   "powerpc"
+#define NORM_MIPS  "mips"
+#define NORM_SH"sh"
+
 int perf_env__lookup_objdump(struct perf_env *env);
 const char *normalize_arch(char *arch);
 
diff --git a/tools/perf/util/unwind-libunwind.c 
b/tools/perf/util/unwind-libunwind.c
index 6d542a4..6199102 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -40,10 +40,10 @@ int unwind__prepare_access(struct thread *thread, struct 
map *map,
 
arch = normalize_arch(thread->mg->machine->env->arch);
 
-   if (!strcmp(arch, "x86")) {
+   if (!strcmp(arch, NORM_X86)) {
if (dso_type != DSO__TYPE_64BIT)
ops = x86_32_unwind_libunwind_ops;
-   } else if (!strcmp(arch, "arm64") || !strcmp(arch, "arm")) {
+   } else if (!strcmp(arch, NORM_ARM64) || !strcmp(arch, NORM_ARM)) {
if (dso_type == DSO__TYPE_64BIT)
ops = arm64_unwind_libunwind_ops;
}
-- 
2.5.5



[PATCH v4 0/3] perf annotate: Enable cross arch annotate

2016-07-07 Thread Ravi Bangoria
Perf can currently only support code navigation (branches and calls) in
annotate when run on the same architecture where perf.data was recorded.
But cross arch annotate is not supported.

This patchset enables cross arch annotate. Currently I've used x86
and arm instructions which are already available and adding support
for powerpc as well. Adding support for other arch will be easy.

I've created this patch on top of acme/perf/core. And tested it with
x86 and powerpc only.

Note for arm:
A few instructions were defined under #if __arm__, which I've used as a
table for arm. But I'm not sure whether the instructions defined outside
of that also include arm instructions. Apart from that, 'call__parse()'
and 'mov__parse()' contain an #ifdef __arm__ directive. I've changed it
to if (!strcmp(norm_arch, NORM_ARM)). I don't have an arm machine to test
these changes.

Example:

  Record on powerpc:
  $ ./perf record -a

  Report -> Annotate on x86:
  $ ./perf report -i perf.data.powerpc --vmlinux vmlinux.powerpc

Changes in v4:
  - powerpc: Added support for branch instructions that includes 'ctr'
  - __maybe_unused was misplaced at a few locations. Corrected it.
  - Moved position of v3 last patch that define macro for each arch name

v3 link: https://lkml.org/lkml/2016/6/30/99

Naveen N. Rao (1):
  perf annotate: add powerpc support

Ravi Bangoria (2):
  perf: Define macro for normalized arch names
  perf annotate: Enable cross arch annotate

 tools/perf/arch/common.c   |  36 ++---
 tools/perf/arch/common.h   |  11 ++
 tools/perf/builtin-top.c   |   2 +-
 tools/perf/ui/browsers/annotate.c  |   3 +-
 tools/perf/ui/gtk/annotate.c   |   2 +-
 tools/perf/util/annotate.c | 273 ++---
 tools/perf/util/annotate.h |   6 +-
 tools/perf/util/unwind-libunwind.c |   4 +-
 8 files changed, 265 insertions(+), 72 deletions(-)

--
2.5.5



[PATCH v4 1/3] perf: Define macro for normalized arch names

2016-07-07 Thread Ravi Bangoria
Define a macro for each normalized arch name and use them instead
of hard-coding the arch name as a string.

Signed-off-by: Ravi Bangoria 
---
Changes in v4:
  - Moved position of patch

 tools/perf/arch/common.c   | 36 ++--
 tools/perf/arch/common.h   | 11 +++
 tools/perf/util/unwind-libunwind.c |  4 ++--
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index ee69668..feb2113 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -122,25 +122,25 @@ static int lookup_triplets(const char *const *triplets, 
const char *name)
 const char *normalize_arch(char *arch)
 {
if (!strcmp(arch, "x86_64"))
-   return "x86";
+   return NORM_X86;
if (arch[0] == 'i' && arch[2] == '8' && arch[3] == '6')
-   return "x86";
+   return NORM_X86;
if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5))
-   return "sparc";
+   return NORM_SPARC;
if (!strcmp(arch, "aarch64") || !strcmp(arch, "arm64"))
-   return "arm64";
+   return NORM_ARM64;
if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110"))
-   return "arm";
+   return NORM_ARM;
if (!strncmp(arch, "s390", 4))
-   return "s390";
+   return NORM_S390;
if (!strncmp(arch, "parisc", 6))
-   return "parisc";
+   return NORM_PARISC;
if (!strncmp(arch, "powerpc", 7) || !strncmp(arch, "ppc", 3))
-   return "powerpc";
+   return NORM_POWERPC;
if (!strncmp(arch, "mips", 4))
-   return "mips";
+   return NORM_MIPS;
if (!strncmp(arch, "sh", 2) && isdigit(arch[2]))
-   return "sh";
+   return NORM_SH;
 
return arch;
 }
@@ -180,21 +180,21 @@ static int perf_env__lookup_binutils_path(struct perf_env 
*env,
zfree();
}
 
-   if (!strcmp(arch, "arm"))
+   if (!strcmp(arch, NORM_ARM))
path_list = arm_triplets;
-   else if (!strcmp(arch, "arm64"))
+   else if (!strcmp(arch, NORM_ARM64))
path_list = arm64_triplets;
-   else if (!strcmp(arch, "powerpc"))
+   else if (!strcmp(arch, NORM_POWERPC))
path_list = powerpc_triplets;
-   else if (!strcmp(arch, "sh"))
+   else if (!strcmp(arch, NORM_SH))
path_list = sh_triplets;
-   else if (!strcmp(arch, "s390"))
+   else if (!strcmp(arch, NORM_S390))
path_list = s390_triplets;
-   else if (!strcmp(arch, "sparc"))
+   else if (!strcmp(arch, NORM_SPARC))
path_list = sparc_triplets;
-   else if (!strcmp(arch, "x86"))
+   else if (!strcmp(arch, NORM_X86))
path_list = x86_triplets;
-   else if (!strcmp(arch, "mips"))
+   else if (!strcmp(arch, NORM_MIPS))
path_list = mips_triplets;
else {
ui__error("binutils for %s not supported.\n", arch);
diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h
index 6b01c73..14ca8ca 100644
--- a/tools/perf/arch/common.h
+++ b/tools/perf/arch/common.h
@@ -5,6 +5,17 @@
 
 extern const char *objdump_path;
 
+/* Macro for normalized arch names */
+#define NORM_X86   "x86"
+#define NORM_SPARC "sparc"
+#define NORM_ARM64 "arm64"
+#define NORM_ARM   "arm"
+#define NORM_S390  "s390"
+#define NORM_PARISC"parisc"
+#define NORM_POWERPC   "powerpc"
+#define NORM_MIPS  "mips"
+#define NORM_SH"sh"
+
 int perf_env__lookup_objdump(struct perf_env *env);
 const char *normalize_arch(char *arch);
 
diff --git a/tools/perf/util/unwind-libunwind.c 
b/tools/perf/util/unwind-libunwind.c
index 6d542a4..6199102 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -40,10 +40,10 @@ int unwind__prepare_access(struct thread *thread, struct 
map *map,
 
arch = normalize_arch(thread->mg->machine->env->arch);
 
-   if (!strcmp(arch, "x86")) {
+   if (!strcmp(arch, NORM_X86)) {
if (dso_type != DSO__TYPE_64BIT)
ops = x86_32_unwind_libunwind_ops;
-   } else if (!strcmp(arch, "arm64") || !strcmp(arch, "arm")) {
+   } else if (!strcmp(arch, NORM_ARM64) || !strcmp(arch, NORM_ARM)) {
if (dso_type == DSO__TYPE_64BIT)
ops = arm64_unwind_libunwind_ops;
}
-- 
2.5.5



[PATCH v4 0/3] perf annotate: Enable cross arch annotate

2016-07-07 Thread Ravi Bangoria
Perf can currently only support code navigation (branches and calls) in
annotate when run on the same architecture where perf.data was recorded.
But cross arch annotate is not supported.

This patchset enables cross arch annotate. Currently I've used x86
and arm instructions which are already available and adding support
for powerpc as well. Adding support for other arch will be easy.

I've created this patch on top of acme/perf/core. And tested it with
x86 and powerpc only.

Note for arm:
A few instructions were defined under #if __arm__, which I've used as a
table for arm. But I'm not sure whether the instructions defined outside
of that also include arm instructions. Apart from that, 'call__parse()'
and 'mov__parse()' contain an #ifdef __arm__ directive. I've changed it
to if (!strcmp(norm_arch, NORM_ARM)). I don't have an arm machine to test
these changes.

Example:

  Record on powerpc:
  $ ./perf record -a

  Report -> Annotate on x86:
  $ ./perf report -i perf.data.powerpc --vmlinux vmlinux.powerpc

Changes in v4:
  - powerpc: Added support for branch instructions that includes 'ctr'
  - __maybe_unused was misplaced at a few locations. Corrected it.
  - Moved position of v3 last patch that define macro for each arch name

v3 link: https://lkml.org/lkml/2016/6/30/99

Naveen N. Rao (1):
  perf annotate: add powerpc support

Ravi Bangoria (2):
  perf: Define macro for normalized arch names
  perf annotate: Enable cross arch annotate

 tools/perf/arch/common.c   |  36 ++---
 tools/perf/arch/common.h   |  11 ++
 tools/perf/builtin-top.c   |   2 +-
 tools/perf/ui/browsers/annotate.c  |   3 +-
 tools/perf/ui/gtk/annotate.c   |   2 +-
 tools/perf/util/annotate.c | 273 ++---
 tools/perf/util/annotate.h |   6 +-
 tools/perf/util/unwind-libunwind.c |   4 +-
 8 files changed, 265 insertions(+), 72 deletions(-)

--
2.5.5



Re: [PATCH 1/2] HID: logitech-hidpp: add battery support for HID++ 2.0 devices

2016-07-07 Thread Peter Hutterer
On Fri, Jul 08, 2016 at 01:21:08AM +0200, Bastien Nocera wrote:
> On Wed, 2016-06-29 at 19:28 +1000, Peter Hutterer wrote:
> > If the 0x1000 Unified Battery Level Status feature exists, expose the
> > battery
> > level.
> > 
> > The main drawback is that while a device is plugged in its battery
> > level is 0.
> > To avoid exposing that as 0% charge we make up a number based on the
> > charging
> > status.
> 
> This will require changes in UPower, so that it doesn't try to access
> the Logitech unifying devices via user-space, and uses the data from
> the kernel. Did you already file a bug?

filed now: https://bugs.freedesktop.org/show_bug.cgi?id=96857

> Note that this would also mean losing the "lux" information, but I
> don't think that's something we're that interested in exposing.

Adding that HID++ request to the kernel would be easy enough but I don't 
see anything in the power_supply_property that would match this, do you? 
Also, I don't have such a device so testing would be tricky.

Cheers,
   Peter


> 
> For example, for a keyboard that recharges via solar panels, at night:
> 
> Device: /org/freedesktop/UPower/devices/keyboard_0003o046Do4002x0004
>   native-path:  
> /sys/devices/pci:00/:00:14.0/usb3/3-10/3-10:1.2/0003:046D:C52B.0003/0003:046D:4002.0004
>   vendor:   Logitech, Inc.
>   model:K750
>   serial:   197F3F23
>   power supply: no
>   updated:  Fri 08 Jul 2016 01:17:40 CEST (95 seconds ago)
>   has history:  yes
>   has statistics:   no
>   keyboard
> present: yes
> rechargeable:yes
> state:   discharging
> warning-level:   none
> luminosity:  16 lx
> percentage:  89%
> icon-name:  'battery-full-symbolic'


Re: [PATCH 1/2] HID: logitech-hidpp: add battery support for HID++ 2.0 devices

2016-07-07 Thread Peter Hutterer
On Fri, Jul 08, 2016 at 01:21:08AM +0200, Bastien Nocera wrote:
> On Wed, 2016-06-29 at 19:28 +1000, Peter Hutterer wrote:
> > If the 0x1000 Unified Battery Level Status feature exists, expose the
> > battery
> > level.
> > 
> > The main drawback is that while a device is plugged in its battery
> > level is 0.
> > To avoid exposing that as 0% charge we make up a number based on the
> > charging
> > status.
> 
> This will require changes in UPower, so that it doesn't try to access
> the Logitech unifying devices via user-space, and uses the data from
> the kernel. Did you already file a bug?

filed now: https://bugs.freedesktop.org/show_bug.cgi?id=96857

> Note that this would also mean losing the "lux" information, but I
> don't think that's something we're that interested in exposing.

Adding that HID++ request to the kernel would be easy enough but I don't 
see anything in the power_supply_property that would match this, do you? 
Also, I don't have such a device so testing would be tricky.

Cheers,
   Peter


> 
> For example, for a keyboard that recharges via solar panels, at night:
> 
> Device: /org/freedesktop/UPower/devices/keyboard_0003o046Do4002x0004
>   native-path:  
> /sys/devices/pci:00/:00:14.0/usb3/3-10/3-10:1.2/0003:046D:C52B.0003/0003:046D:4002.0004
>   vendor:   Logitech, Inc.
>   model:K750
>   serial:   197F3F23
>   power supply: no
>   updated:  Fri 08 Jul 2016 01:17:40 CEST (95 seconds ago)
>   has history:  yes
>   has statistics:   no
>   keyboard
> present: yes
> rechargeable:yes
> state:   discharging
> warning-level:   none
> luminosity:  16 lx
> percentage:  89%
> icon-name:  'battery-full-symbolic'


Re: linux-next: please clean up the lightnvm tree

2016-07-07 Thread Matias Bjørling

On 07/08/2016 06:22 AM, Stephen Rothwell wrote:

Hi Matias,

I noticed that the commits in the lightnvm tree have been applied
to the block tree as a series of patches (i.e. effectively rebased).
Could you please remove all the duplicate patches from the lightnvm tree
(which I think is the whole tree, currently) before they start causing
merge conflicts for me.

One way to do this would be to rebase your tree on top of the block tree.



Thanks Stephen. Updated the for-next. I will make sure they are taken 
off if applied directly to the block tree in the future.


Re: linux-next: please clean up the lightnvm tree

2016-07-07 Thread Matias Bjørling

On 07/08/2016 06:22 AM, Stephen Rothwell wrote:

Hi Matias,

I noticed that the commits in the lightnvm tree have been applied
to the block tree as a series of patches (i.e. effectively rebased).
Could you please remove all the duplicate patches from the lightnvm tree
(which I think is the whole tree, currently) before they start causing
merge conflicts for me.

One way to do this would be to rebase your tree on top of the block tree.



Thanks Stephen. Updated the for-next. I will make sure they are taken 
off if applied directly to the block tree in the future.


Re: [PATCH v7 07/11] powerpc/powernv: Add platform support for stop instruction

2016-07-07 Thread Michael Neuling

> > > 
> > > @@ -439,7 +540,18 @@ timebase_resync:
> > >    */
> > >   bne cr4,clear_lock
> > >  
> > > - /* Restore per core state */
> > > + /*
> > > +  * First thread in the core to wake up and its waking up
> > > with
> > > +  * complete hypervisor state loss. Restore per core
> > > hypervisor
> > > +  * state.
> > > +  */
> > > +BEGIN_FTR_SECTION
> > > + ld  r4,_PTCR(r1)
> > > + mtspr   SPRN_PTCR,r4
> > > + ld  r4,_RPR(r1)
> > > + mtspr   SPRN_RPR,r4
> > RPR looks wrong here.  This should be on POWER8 too.
> > 
> > This has changed since v6 and not noted in the v7 comments.  Why are
> > you
> > changing this now?
> > 
> RPR is a per-core resource in P9. So with this patch, RPR will continue
> to be restored per-subcore in P8 and will be restored once per core in P9.

Ok, thanks for the explanation.

Mikey


Re: [PATCH v7 07/11] powerpc/powernv: Add platform support for stop instruction

2016-07-07 Thread Michael Neuling

> > > 
> > > @@ -439,7 +540,18 @@ timebase_resync:
> > >    */
> > >   bne cr4,clear_lock
> > >  
> > > - /* Restore per core state */
> > > + /*
> > > +  * First thread in the core to wake up and its waking up
> > > with
> > > +  * complete hypervisor state loss. Restore per core
> > > hypervisor
> > > +  * state.
> > > +  */
> > > +BEGIN_FTR_SECTION
> > > + ld  r4,_PTCR(r1)
> > > + mtspr   SPRN_PTCR,r4
> > > + ld  r4,_RPR(r1)
> > > + mtspr   SPRN_RPR,r4
> > RPR looks wrong here.  This should be on POWER8 too.
> > 
> > This has changed since v6 and not noted in the v7 comments.  Why are
> > you
> > changing this now?
> > 
> RPR is a per-core resource in P9. So with this patch, RPR will continue
> to be restored per-subcore in P8 and will be restored once per core in P9.

Ok, thanks for the explanation.

Mikey


linux-next: please clean up the lightnvm tree

2016-07-07 Thread Stephen Rothwell
Hi Matias,

I noticed that the commits in the lightnvm tree have been applied
to the block tree as a series of patches (i.e. effectively rebased).
Could you please remove all the duplicate patches from the lightnvm tree
(which I think is the whole tree, currently) before they start causing
merge conflicts for me.

One way to do this would be to rebase your tree on top of the block tree.

-- 
Cheers,
Stephen Rothwell


linux-next: please clean up the lightnvm tree

2016-07-07 Thread Stephen Rothwell
Hi Matias,

I noticed that the commits in the lightnvm tree have been applied
to the block tree as a series of patches (i.e. effectively rebased).
Could you please remove all the duplicate patches from the lightnvm tree
(which I think is the whole tree, currently) before they start causing
merge conflicts for me.

One way to do this would be to rebase your tree on top of the block tree.

-- 
Cheers,
Stephen Rothwell


Re: [PATCH v7 07/11] powerpc/powernv: Add platform support for stop instruction

2016-07-07 Thread Shreyas B Prabhu


On 07/08/2016 07:50 AM, Michael Neuling wrote:
> 
>> diff --git a/arch/powerpc/include/asm/cpuidle.h 
>> b/arch/powerpc/include/asm/cpuidle.h
>> index d2f99ca..3d7fc06 100644
>> --- a/arch/powerpc/include/asm/cpuidle.h
>> +++ b/arch/powerpc/include/asm/cpuidle.h
>> @@ -13,6 +13,8 @@
>>  #ifndef __ASSEMBLY__
>>  extern u32 pnv_fastsleep_workaround_at_entry[];
>>  extern u32 pnv_fastsleep_workaround_at_exit[];
>> +
>> +extern u64 pnv_first_deep_stop_state;
> 
> mpe asked a question about this which you neither answered nor addressed.
> "Should this have some safe initial value?"
> 
> I'm thinking we could do this which is what you have in the init call.
>u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
> 

I missed the comment. I'll make the change.
> 
>> @@ -439,7 +540,18 @@ timebase_resync:
>>   */
>>  bne cr4,clear_lock
>>  
>> -/* Restore per core state */
>> +/*
>> + * First thread in the core to wake up and its waking up with
>> + * complete hypervisor state loss. Restore per core hypervisor
>> + * state.
>> + */
>> +BEGIN_FTR_SECTION
>> +ld  r4,_PTCR(r1)
>> +mtspr   SPRN_PTCR,r4
>> +ld  r4,_RPR(r1)
>> +mtspr   SPRN_RPR,r4
> 
> RPR looks wrong here.  This should be on POWER8 too.
> 
> This has changed since v6 and not noted in the v7 comments.  Why are you
> changing this now?
> 
RPR is a per-core resource in P9. So with this patch, RPR will continue
to be restored per-subcore in P8 and will be restored once per core in P9.

>> +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>> +
>>  ld  r4,_TSCR(r1)
>>  mtspr   SPRN_TSCR,r4
>>  ld  r4,_WORC(r1)
>> @@ -461,9 +573,7 @@ common_exit:
>>  
>>  /* Waking up from winkle */
>>  
>> -/* Restore per thread state */
>> -bl  __restore_cpu_power8
>> -
>> +BEGIN_MMU_FTR_SECTION
>>  /* Restore SLB  from PACA */
>>  ld  r8,PACA_SLBSHADOWPTR(r13)
>>  
>> @@ -477,6 +587,9 @@ common_exit:
>>  slbmte  r6,r5
>>  1:  addir8,r8,16
>>  .endr
>> +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX)
>> +
>> +/* Restore per thread state */
> 
> This FTR section is too big.  It ends up at 25 instructions with the loop.
> Probably better like this:
> 
> BEGIN_MMU_FTR_SECTION
>   b   no_segments
> END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
>   /* Restore SLB  from PACA */
>   ld  r8,PACA_SLBSHADOWPTR(r13)
> 
>   .rept   SLB_NUM_BOLTED
>   li  r3, SLBSHADOW_SAVEAREA
>   LDX_BE  r5, r8, r3
>   addir3, r3, 8
>   LDX_BE  r6, r8, r3
>   andis.  r7,r5,SLB_ESID_V@h
>   beq 1f
>   slbmte  r6,r5
> 1:addir8,r8,16
>   .endr
> 
> no_segments:
> 
Cool. Will make the change.

Thanks,
Shreyas



Re: [PATCH v7 07/11] powerpc/powernv: Add platform support for stop instruction

2016-07-07 Thread Shreyas B Prabhu


On 07/08/2016 07:50 AM, Michael Neuling wrote:
> 
>> diff --git a/arch/powerpc/include/asm/cpuidle.h 
>> b/arch/powerpc/include/asm/cpuidle.h
>> index d2f99ca..3d7fc06 100644
>> --- a/arch/powerpc/include/asm/cpuidle.h
>> +++ b/arch/powerpc/include/asm/cpuidle.h
>> @@ -13,6 +13,8 @@
>>  #ifndef __ASSEMBLY__
>>  extern u32 pnv_fastsleep_workaround_at_entry[];
>>  extern u32 pnv_fastsleep_workaround_at_exit[];
>> +
>> +extern u64 pnv_first_deep_stop_state;
> 
> mpe asked a question about this which you neither answered nor addressed.
> "Should this have some safe initial value?"
> 
> I'm thinking we could do this which is what you have in the init call.
>u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
> 

I missed the comment. I'll make the change.
> 
>> @@ -439,7 +540,18 @@ timebase_resync:
>>   */
>>  bne cr4,clear_lock
>>  
>> -/* Restore per core state */
>> +/*
>> + * First thread in the core to wake up and its waking up with
>> + * complete hypervisor state loss. Restore per core hypervisor
>> + * state.
>> + */
>> +BEGIN_FTR_SECTION
>> +ld  r4,_PTCR(r1)
>> +mtspr   SPRN_PTCR,r4
>> +ld  r4,_RPR(r1)
>> +mtspr   SPRN_RPR,r4
> 
> RPR looks wrong here.  This should be on POWER8 too.
> 
> This has changed since v6 and not noted in the v7 comments.  Why are you
> changing this now?
> 
RPR is a per-core resource in P9. So with this patch, RPR will continue
to be restored per-subcore in P8 and will be restored once per core in P9.

>> +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>> +
>>  ld  r4,_TSCR(r1)
>>  mtspr   SPRN_TSCR,r4
>>  ld  r4,_WORC(r1)
>> @@ -461,9 +573,7 @@ common_exit:
>>  
>>  /* Waking up from winkle */
>>  
>> -/* Restore per thread state */
>> -bl  __restore_cpu_power8
>> -
>> +BEGIN_MMU_FTR_SECTION
>>  /* Restore SLB  from PACA */
>>  ld  r8,PACA_SLBSHADOWPTR(r13)
>>  
>> @@ -477,6 +587,9 @@ common_exit:
>>  slbmte  r6,r5
>>  1:  addir8,r8,16
>>  .endr
>> +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX)
>> +
>> +/* Restore per thread state */
> 
> This FTR section is too big.  It ends up at 25 instructions with the loop.
> Probably better like this:
> 
> BEGIN_MMU_FTR_SECTION
>   b   no_segments
> END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
>   /* Restore SLB  from PACA */
>   ld  r8,PACA_SLBSHADOWPTR(r13)
> 
>   .rept   SLB_NUM_BOLTED
>   li  r3, SLBSHADOW_SAVEAREA
>   LDX_BE  r5, r8, r3
>   addir3, r3, 8
>   LDX_BE  r6, r8, r3
>   andis.  r7,r5,SLB_ESID_V@h
>   beq 1f
>   slbmte  r6,r5
> 1:addir8,r8,16
>   .endr
> 
> no_segments:
> 
Cool. Will make the change.

Thanks,
Shreyas



Re: perf bpf examples

2016-07-07 Thread Wangnan (F)



On 2016/7/8 1:58, Brendan Gregg wrote:

On Thu, Jul 7, 2016 at 10:54 AM, Brendan Gregg
 wrote:

On Wed, Jul 6, 2016 at 6:49 PM, Wangnan (F)  wrote:



On 2016/7/7 4:29, Brendan Gregg wrote:

G'Day,

Are perf bpf examples shared anywhere? I've seen many posted to lkml
(by Wang Nan), but don't see them in the linux source, or
documentation. Would be very handy to throw them all up somewhere for
searching/learning, if that hasn't already happened, eg, github.

I was also looking to see if perf bpf supports sampling yet, but I
don't think it does. Eg, imagine a:

perf record -F 99 -e bpf_process_samples.c -a -- sleep 10

which would require BPF attaching to perf_swevent_hrtimer()/etc, and
also emitting a map (eg, sampled instruction pointer counts). I don't
think perf currently does either, but was hoping for a collection of
examples to double check.


Currently perf-bpf doesn't support dumping resulting maps, but
we are working on it. I think you have read our uBPF approach:

http://article.gmane.org/gmane.linux.kernel/2203717

and

http://article.gmane.org/gmane.linux.kernel/2253579

in them we embedded a uBPF virtual machine to perf and give it
the ability to operate the result in maps.

Now we are trying another approach, introduce LLVM to perf,
compile data analysis and report to code. It would be much
more powerful.


Great, thanks!

But what about a set of examples covering the existing perf+bpf
capabilities so far? I know you've emailed them to lkml, but has
someone put them all in one place yet? If not, I can go through lkml
and at least put them on github so we can search and learn from them.


Great. Thanks a lot.


... Also, has anyone looked into perf sampling (-F 99) with bpf yet? Thanks,


Theoretically, a BPF program is an additional filter to
decide whether an event should be filtered out or passed to perf. -F 99
is another filter, which drops samples to ensure the frequency.
Filters work together. The full graph should be:

 BPF --> traditional filter --> proc (system wide or proc specific) --> 
period


See the example at the end of this mail. The BPF program returns 0 for 
half of
the events, and the result should be symmetrical. We can get similar 
result without

-F:

# ~/perf record -a --clang-opt '-DCATCH_ODD' -e ./sampling.c dd 
if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 11.9908 s, 358 MB/s
[ perf record: Woken up 28 times to write data ]
[ perf record: Captured and wrote 303.915 MB perf.data (4194449 samples) ]
#
root@wn-Lenovo-Product:~# ~/perf record -a --clang-opt '-DCATCH_EVEN' -e 
./sampling.c dd if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 12.1154 s, 355 MB/s
[ perf record: Woken up 54 times to write data ]
[ perf record: Captured and wrote 303.933 MB perf.data (4194347 samples) ]


With -F99 added:

# ~/perf record -F99 -a --clang-opt '-DCATCH_ODD' -e ./sampling.c dd 
if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 9.60126 s, 447 MB/s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.402 MB perf.data (35 samples) ]
# ~/perf record -F99 -a --clang-opt '-DCATCH_EVEN' -e ./sampling.c dd 
if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 9.76719 s, 440 MB/s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.399 MB perf.data (37 samples) ]

However, there must be something I don't understand. It takes nearly 10 
seconds to
finish the record, so we should get nearly 1000 samples. Sometimes I can 
get about 500 samples:


# ~/perf record -F99 -a --clang-opt '-DCATCH_ODD' -e ./sampling.c dd 
if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 9.60536 s, 447 MB/s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.431 MB perf.data (555 samples) ]

/
#include 
#define SEC(NAME) __attribute__((section(NAME), used))
struct bpf_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
unsigned int max_entries;
};
struct bpf_map_def SEC("maps") m = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 1,
};
static void *(*map_lookup_elem)(struct bpf_map_def *, void *) =
   (void *)BPF_FUNC_map_lookup_elem;
static int (*trace_printk)(const char *fmt, int fmt_size, ...) =
(void *)BPF_FUNC_trace_printk;
char _license[] SEC("license") = "GPL";
int _version SEC("version") = LINUX_VERSION_CODE;
#ifdef CATCH_ODD
# define RET_ODD  1
# define RET_EVEN 0
#endif
#ifdef CATCH_EVEN
# define RET_ODD  0
# define RET_EVEN 1
#endif
SEC("func=sys_read")
int 

Re: perf bpf examples

2016-07-07 Thread Wangnan (F)



On 2016/7/8 1:58, Brendan Gregg wrote:

On Thu, Jul 7, 2016 at 10:54 AM, Brendan Gregg
 wrote:

On Wed, Jul 6, 2016 at 6:49 PM, Wangnan (F)  wrote:



On 2016/7/7 4:29, Brendan Gregg wrote:

G'Day,

Are perf bpf examples shared anywhere? I've seen many posted to lkml
(by Wang Nan), but don't see them in the linux source, or
documentation. Would be very handy to throw them all up somewhere for
searching/learning, if that hasn't already happened, eg, github.

I was also looking to see if perf bpf supports sampling yet, but I
don't think it does. Eg, imagine a:

perf record -F 99 -e bpf_process_samples.c -a -- sleep 10

which would require BPF attaching to perf_swevent_hrtimer()/etc, and
also emitting a map (eg, sampled instruction pointer counts). I don't
think perf currently does either, but was hoping for a collection of
examples to double check.


Currently perf-bpf doesn't support dumping resulting maps, but
we are working on it. I think you have read our uBPF approach:

http://article.gmane.org/gmane.linux.kernel/2203717

and

http://article.gmane.org/gmane.linux.kernel/2253579

in them we embedded a uBPF virtual machine to perf and give it
the ability to operate the result in maps.

Now we are trying another approach, introduce LLVM to perf,
compile data analysis and report to code. It would be much
more powerful.


Great, thanks!

But what about a set of examples covering the existing perf+bpf
capabilities so far? I know you've emailed them to lkml, but has
someone put them all in one place yet? If not, I can go through lkml
and at least put them on github so we can search and learn from them.


Great. Thanks a lot.


... Also, has anyone looked into perf sampling (-F 99) with bpf yet? Thanks,


Theoretically, a BPF program is an additional filter to
decide whether an event should be filtered out or passed to perf. -F 99
is another filter, which drops samples to ensure the frequency.
Filters work together. The full graph should be:

 BPF --> traditional filter --> proc (system wide or proc specific) --> 
period


See the example at the end of this mail. The BPF program returns 0 for 
half of
the events, and the result should be symmetrical. We can get similar 
result without

-F:

# ~/perf record -a --clang-opt '-DCATCH_ODD' -e ./sampling.c dd 
if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 11.9908 s, 358 MB/s
[ perf record: Woken up 28 times to write data ]
[ perf record: Captured and wrote 303.915 MB perf.data (4194449 samples) ]
#
root@wn-Lenovo-Product:~# ~/perf record -a --clang-opt '-DCATCH_EVEN' -e 
./sampling.c dd if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 12.1154 s, 355 MB/s
[ perf record: Woken up 54 times to write data ]
[ perf record: Captured and wrote 303.933 MB perf.data (4194347 samples) ]


With -F99 added:

# ~/perf record -F99 -a --clang-opt '-DCATCH_ODD' -e ./sampling.c dd 
if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 9.60126 s, 447 MB/s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.402 MB perf.data (35 samples) ]
# ~/perf record -F99 -a --clang-opt '-DCATCH_EVEN' -e ./sampling.c dd 
if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 9.76719 s, 440 MB/s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.399 MB perf.data (37 samples) ]

However, there must be something I don't understand. It takes nearly 10 
seconds to
finish the record, so we should get nearly 1000 samples. Sometimes I can 
get about 500 samples:


# ~/perf record -F99 -a --clang-opt '-DCATCH_ODD' -e ./sampling.c dd 
if=/dev/zero of=/dev/null count=8388480

8388480+0 records in
8388480+0 records out
4294901760 bytes (4.3 GB) copied, 9.60536 s, 447 MB/s
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.431 MB perf.data (555 samples) ]

/
#include 
#define SEC(NAME) __attribute__((section(NAME), used))
struct bpf_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
unsigned int max_entries;
};
struct bpf_map_def SEC("maps") m = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 1,
};
static void *(*map_lookup_elem)(struct bpf_map_def *, void *) =
   (void *)BPF_FUNC_map_lookup_elem;
static int (*trace_printk)(const char *fmt, int fmt_size, ...) =
(void *)BPF_FUNC_trace_printk;
char _license[] SEC("license") = "GPL";
int _version SEC("version") = LINUX_VERSION_CODE;
#ifdef CATCH_ODD
# define RET_ODD  1
# define RET_EVEN 0
#endif
#ifdef CATCH_EVEN
# define RET_ODD  0
# define RET_EVEN 1
#endif
SEC("func=sys_read")
int func(void *ctx)
{
int key = 0, *v;
  

Re: linux-next: manual merge of the block tree with Linus' tree

2016-07-07 Thread Stephen Rothwell
Hi all,

On Fri, 8 Jul 2016 13:07:50 +1000 Stephen Rothwell  
wrote:
>
>  + * Get the bios in the request so we can re-queue them.
>  + */
> - if (shadow[j].request->cmd_flags &
> - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | 
> REQ_SECURE)) {
> ++if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
> ++req_op(shadow[j].request) == REQ_OP_DISCARD ||
> ++req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
> ++shadow[j].request->cmd_flags & REQ_FUA)) {
  ^
That is an extra ')' which I have now removed.

-- 
Cheers,
Stephen Rothwell


Re: linux-next: manual merge of the block tree with Linus' tree

2016-07-07 Thread Stephen Rothwell
Hi all,

On Fri, 8 Jul 2016 13:07:50 +1000 Stephen Rothwell  
wrote:
>
>  + * Get the bios in the request so we can re-queue them.
>  + */
> - if (shadow[j].request->cmd_flags &
> - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | 
> REQ_SECURE)) {
> ++if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
> ++req_op(shadow[j].request) == REQ_OP_DISCARD ||
> ++req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
> ++shadow[j].request->cmd_flags & REQ_FUA)) {
  ^
That is an extra ')' which I have now removed.

-- 
Cheers,
Stephen Rothwell


linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/volumes.h

between commit:

  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")

from the btrfs-kdave tree and commit:

  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/volumes.h
index dc219e259281,6613e6335ca2..
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@@ -384,13 -385,13 +384,13 @@@ int btrfs_map_sblock(struct btrfs_fs_in
  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 u64 chunk_start, u64 physical, u64 devid,
 u64 **logical, int *naddrs, int *stripe_len);
 -int btrfs_read_sys_array(struct btrfs_root *root);
 -int btrfs_read_chunk_tree(struct btrfs_root *root);
 +int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 +int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
  int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 -struct btrfs_root *extent_root, u64 type);
 +struct btrfs_fs_info *fs_info, u64 type);
  void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
  void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
- int btrfs_map_bio(struct btrfs_fs_info *fs_info, int rw, struct bio *bio,
 -int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
++int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
  int mirror_num, int async_submit);
  int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
   fmode_t flags, void *holder);


linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/volumes.h

between commit:

  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")

from the btrfs-kdave tree and commit:

  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/volumes.h
index dc219e259281,6613e6335ca2..
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@@ -384,13 -385,13 +384,13 @@@ int btrfs_map_sblock(struct btrfs_fs_in
  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 u64 chunk_start, u64 physical, u64 devid,
 u64 **logical, int *naddrs, int *stripe_len);
 -int btrfs_read_sys_array(struct btrfs_root *root);
 -int btrfs_read_chunk_tree(struct btrfs_root *root);
 +int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 +int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
  int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 -struct btrfs_root *extent_root, u64 type);
 +struct btrfs_fs_info *fs_info, u64 type);
  void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
  void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
- int btrfs_map_bio(struct btrfs_fs_info *fs_info, int rw, struct bio *bio,
 -int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
++int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
  int mirror_num, int async_submit);
  int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
   fmode_t flags, void *holder);


linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/volumes.c

between commit:

  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")

from the btrfs-kdave tree and commit:

  4e49ea4a3d27 ("block/fs/drivers: remove rw argument from submit_bio")
  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/volumes.c
index 64d2557fe7fc,14b2d19c842c..
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@@ -5984,10 -5999,10 +5989,10 @@@ static void btrfs_end_bio(struct bio *b
   * This will add one bio to the pending list for a device and make sure
   * the work struct is scheduled.
   */
 -static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 -  struct btrfs_device *device,
 +static noinline void btrfs_schedule_bio(struct btrfs_device *device,
-   int rw, struct bio *bio)
+   struct bio *bio)
  {
 +  struct btrfs_fs_info *fs_info = device->fs_info;
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
  
@@@ -6010,10 -6025,9 +6015,9 @@@
 * made progress against dirty pages when we've really just put it
 * on a queue for later
 */
 -  atomic_inc(&root->fs_info->nr_async_bios);
 +  atomic_inc(&fs_info->nr_async_bios);
WARN_ON(bio->bi_next);
bio->bi_next = NULL;
-   bio->bi_rw |= rw;
  
spin_lock(&device->io_lock);
if (bio->bi_rw & REQ_SYNC)
@@@ -6033,14 -6047,15 +6037,14 @@@
spin_unlock(&device->io_lock);
  
if (should_queue)
 -  btrfs_queue_work(root->fs_info->submit_workers,
 -   &device->work);
 +  btrfs_queue_work(fs_info->submit_workers, &device->work);
  }
  
 -static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 -struct bio *bio, u64 physical, int dev_nr,
 -int async)
 +static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
- u64 physical, int dev_nr, int rw, int async)
++u64 physical, int dev_nr, int async)
  {
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
 +  struct btrfs_fs_info *fs_info = bbio->fs_info;
  
bio->bi_private = bbio;
btrfs_io_bio(bio)->stripe_index = dev_nr;
@@@ -6061,12 -6076,12 +6065,12 @@@
  #endif
bio->bi_bdev = dev->bdev;
  
 -  btrfs_bio_counter_inc_noblocked(root->fs_info);
 +  btrfs_bio_counter_inc_noblocked(fs_info);
  
if (async)
-   btrfs_schedule_bio(dev, rw, bio);
 -  btrfs_schedule_bio(root, dev, bio);
++  btrfs_schedule_bio(dev, bio);
else
-   btrfsic_submit_bio(rw, bio);
+   btrfsic_submit_bio(bio);
  }
  
  static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
@@@ -6083,7 -6098,7 +6087,7 @@@
}
  }
  
- int btrfs_map_bio(struct btrfs_fs_info *fs_info, int rw, struct bio *bio,
 -int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
++int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
  int mirror_num, int async_submit)
  {
struct btrfs_device *dev;
@@@ -6099,11 -6114,11 +6103,11 @@@
length = bio->bi_iter.bi_size;
map_length = length;
  
 -  btrfs_bio_counter_inc_blocked(root->fs_info);
 -  ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical,
 -  &map_length, &bbio, mirror_num, 1);
 +  btrfs_bio_counter_inc_blocked(fs_info);
-   ret = __btrfs_map_block(fs_info, rw, logical, &map_length, &bbio,
-   mirror_num, 1);
++  ret = __btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
++  &bbio, mirror_num, 1);
if (ret) {
 -  btrfs_bio_counter_dec(root->fs_info);
 +  btrfs_bio_counter_dec(fs_info);
return ret;
}
  
@@@ -6115,18 -6130,17 +6119,18 @@@
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
  
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
-   ((rw & WRITE) || (mirror_num > 1))) {
+   ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
   a single stripe; not the whole write */
-   if (rw & WRITE) {
+   if (bio_op(bio) == REQ_OP_WRITE) {
 -

linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/volumes.c

between commit:

  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")

from the btrfs-kdave tree and commit:

  4e49ea4a3d27 ("block/fs/drivers: remove rw argument from submit_bio")
  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/volumes.c
index 64d2557fe7fc,14b2d19c842c..
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@@ -5984,10 -5999,10 +5989,10 @@@ static void btrfs_end_bio(struct bio *b
   * This will add one bio to the pending list for a device and make sure
   * the work struct is scheduled.
   */
 -static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 -  struct btrfs_device *device,
 +static noinline void btrfs_schedule_bio(struct btrfs_device *device,
-   int rw, struct bio *bio)
+   struct bio *bio)
  {
 +  struct btrfs_fs_info *fs_info = device->fs_info;
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
  
@@@ -6010,10 -6025,9 +6015,9 @@@
 * made progress against dirty pages when we've really just put it
 * on a queue for later
 */
 -  atomic_inc(&root->fs_info->nr_async_bios);
 +  atomic_inc(&fs_info->nr_async_bios);
WARN_ON(bio->bi_next);
bio->bi_next = NULL;
-   bio->bi_rw |= rw;
  
spin_lock(&device->io_lock);
if (bio->bi_rw & REQ_SYNC)
@@@ -6033,14 -6047,15 +6037,14 @@@
spin_unlock(&device->io_lock);
  
if (should_queue)
 -  btrfs_queue_work(root->fs_info->submit_workers,
 -   &device->work);
 +  btrfs_queue_work(fs_info->submit_workers, &device->work);
  }
  
 -static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 -struct bio *bio, u64 physical, int dev_nr,
 -int async)
 +static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
- u64 physical, int dev_nr, int rw, int async)
++u64 physical, int dev_nr, int async)
  {
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
 +  struct btrfs_fs_info *fs_info = bbio->fs_info;
  
bio->bi_private = bbio;
btrfs_io_bio(bio)->stripe_index = dev_nr;
@@@ -6061,12 -6076,12 +6065,12 @@@
  #endif
bio->bi_bdev = dev->bdev;
  
 -  btrfs_bio_counter_inc_noblocked(root->fs_info);
 +  btrfs_bio_counter_inc_noblocked(fs_info);
  
if (async)
-   btrfs_schedule_bio(dev, rw, bio);
 -  btrfs_schedule_bio(root, dev, bio);
++  btrfs_schedule_bio(dev, bio);
else
-   btrfsic_submit_bio(rw, bio);
+   btrfsic_submit_bio(bio);
  }
  
  static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
@@@ -6083,7 -6098,7 +6087,7 @@@
}
  }
  
- int btrfs_map_bio(struct btrfs_fs_info *fs_info, int rw, struct bio *bio,
 -int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
++int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
  int mirror_num, int async_submit)
  {
struct btrfs_device *dev;
@@@ -6099,11 -6114,11 +6103,11 @@@
length = bio->bi_iter.bi_size;
map_length = length;
  
 -  btrfs_bio_counter_inc_blocked(root->fs_info);
 -  ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical,
 -  &map_length, &bbio, mirror_num, 1);
 +  btrfs_bio_counter_inc_blocked(fs_info);
-   ret = __btrfs_map_block(fs_info, rw, logical, &map_length, &bbio,
-   mirror_num, 1);
++  ret = __btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
++  &bbio, mirror_num, 1);
if (ret) {
 -  btrfs_bio_counter_dec(root->fs_info);
 +  btrfs_bio_counter_dec(fs_info);
return ret;
}
  
@@@ -6115,18 -6130,17 +6119,18 @@@
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
  
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
-   ((rw & WRITE) || (mirror_num > 1))) {
+   ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
   a single stripe; not the whole write */
-   if (rw & WRITE) {
+   if (bio_op(bio) == REQ_OP_WRITE) {
 -

linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/inode.c

between commit:

  b286384aac32 ("btrfs: root->fs_info cleanup, add fs_info convenience 
variables")
  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")
  712518c27ed2 ("Btrfs: cleanup BUG_ON in merge_bio")

from the btrfs-kdave tree and commit:

  37226b2111b0 ("btrfs: use bio op accessors")
  b3d3fa519905 ("btrfs: update __btrfs_map_block for REQ_OP transition")
  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/inode.c
index 31b1195eb3d4,1323e4faa44c..
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -1835,12 -1822,8 +1835,12 @@@ static void btrfs_clear_bit_hook(struc
  /*
   * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
   * we don't create bios that span stripes or chunks
 + *
 + * return 1 if page cannot be merged to bio
 + * return 0 if page can be merged to bio
 + * return error otherwise
   */
- int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 size_t size, struct bio *bio,
 unsigned long bio_flags)
  {
@@@ -1856,9 -1838,10 +1856,10 @@@
  
length = bio->bi_iter.bi_size;
map_length = length;
-   ret = btrfs_map_block(fs_info, rw, logical, &map_length, NULL, 0);
 -  ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
 -&map_length, NULL, 0);
 -  /* Will always return 0 with map_multi == NULL */
 -  BUG_ON(ret < 0);
++  ret = btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
++NULL, 0);
 +  if (ret < 0)
 +  return ret;
if (map_length < length + size)
return 1;
return 0;
@@@ -1872,14 -1855,14 +1873,13 @@@
   * At IO completion time the cums attached on the ordered extent record
   * are inserted into the btree
   */
- static int __btrfs_submit_bio_start(struct inode *inode, int rw,
-   struct bio *bio, int mirror_num,
-   unsigned long bio_flags,
+ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+   int mirror_num, unsigned long bio_flags,
u64 bio_offset)
  {
 -  struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
  
 -  ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
 +  ret = btrfs_csum_one_bio(inode, bio, 0, 0);
BUG_ON(ret); /* -ENOMEM */
return 0;
  }
@@@ -1896,10 -1879,10 +1896,10 @@@ static int __btrfs_submit_bio_done(stru
  int mirror_num, unsigned long bio_flags,
  u64 bio_offset)
  {
 -  struct btrfs_root *root = BTRFS_I(inode)->root;
 +  struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
  
-   ret = btrfs_map_bio(fs_info, rw, bio, mirror_num, 1);
 -  ret = btrfs_map_bio(root, bio, mirror_num, 1);
++  ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@@ -1927,8 -1909,8 +1927,8 @@@ static int btrfs_submit_bio_hook(struc
if (btrfs_is_free_space_inode(inode))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
  
-   if (!(rw & REQ_WRITE)) {
+   if (bio_op(bio) != REQ_OP_WRITE) {
 -  ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
 +  ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
if (ret)
goto out;
  
@@@ -1948,7 -1930,8 +1948,7 @@@
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
-   ret = btrfs_wq_submit_bio(fs_info, inode, rw, bio, mirror_num,
 -  ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 - inode, bio, mirror_num,
++  ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
   bio_flags, bio_offset,
   __btrfs_submit_bio_start,
   __btrfs_submit_bio_done);
@@@ -1960,7 -1943,7 +1960,7 @@@
}
  
  mapit:
-   ret = btrfs_map_bio(fs_info, rw, bio, mirror_num, 0);
 -  ret = btrfs_map_bio(root, bio, 

linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/inode.c

between commit:

  b286384aac32 ("btrfs: root->fs_info cleanup, add fs_info convenience 
variables")
  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")
  712518c27ed2 ("Btrfs: cleanup BUG_ON in merge_bio")

from the btrfs-kdave tree and commit:

  37226b2111b0 ("btrfs: use bio op accessors")
  b3d3fa519905 ("btrfs: update __btrfs_map_block for REQ_OP transition")
  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/inode.c
index 31b1195eb3d4,1323e4faa44c..
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -1835,12 -1822,8 +1835,12 @@@ static void btrfs_clear_bit_hook(struc
  /*
   * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
   * we don't create bios that span stripes or chunks
 + *
 + * return 1 if page cannot be merged to bio
 + * return 0 if page can be merged to bio
 + * return error otherwise
   */
- int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 size_t size, struct bio *bio,
 unsigned long bio_flags)
  {
@@@ -1856,9 -1838,10 +1856,10 @@@
  
length = bio->bi_iter.bi_size;
map_length = length;
-   ret = btrfs_map_block(fs_info, rw, logical, &map_length, NULL, 0);
 -  ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
 -&map_length, NULL, 0);
 -  /* Will always return 0 with map_multi == NULL */
 -  BUG_ON(ret < 0);
++  ret = btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
++NULL, 0);
 +  if (ret < 0)
 +  return ret;
if (map_length < length + size)
return 1;
return 0;
@@@ -1872,14 -1855,14 +1873,13 @@@
   * At IO completion time the cums attached on the ordered extent record
   * are inserted into the btree
   */
- static int __btrfs_submit_bio_start(struct inode *inode, int rw,
-   struct bio *bio, int mirror_num,
-   unsigned long bio_flags,
+ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+   int mirror_num, unsigned long bio_flags,
u64 bio_offset)
  {
 -  struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
  
 -  ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
 +  ret = btrfs_csum_one_bio(inode, bio, 0, 0);
BUG_ON(ret); /* -ENOMEM */
return 0;
  }
@@@ -1896,10 -1879,10 +1896,10 @@@ static int __btrfs_submit_bio_done(stru
  int mirror_num, unsigned long bio_flags,
  u64 bio_offset)
  {
 -  struct btrfs_root *root = BTRFS_I(inode)->root;
 +  struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
  
-   ret = btrfs_map_bio(fs_info, rw, bio, mirror_num, 1);
 -  ret = btrfs_map_bio(root, bio, mirror_num, 1);
++  ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@@ -1927,8 -1909,8 +1927,8 @@@ static int btrfs_submit_bio_hook(struc
if (btrfs_is_free_space_inode(inode))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
  
-   if (!(rw & REQ_WRITE)) {
+   if (bio_op(bio) != REQ_OP_WRITE) {
 -  ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
 +  ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
if (ret)
goto out;
  
@@@ -1948,7 -1930,8 +1948,7 @@@
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
-   ret = btrfs_wq_submit_bio(fs_info, inode, rw, bio, mirror_num,
 -  ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 - inode, bio, mirror_num,
++  ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
   bio_flags, bio_offset,
   __btrfs_submit_bio_start,
   __btrfs_submit_bio_done);
@@@ -1960,7 -1943,7 +1960,7 @@@
}
  
  mapit:
-   ret = btrfs_map_bio(fs_info, rw, bio, mirror_num, 0);
 -  ret = btrfs_map_bio(root, bio, 

Re: [PATCH 4.4 00/32] 4.4.15-stable review

2016-07-07 Thread Shuah Khan
On 07/06/2016 07:19 PM, Greg Kroah-Hartman wrote:
> This is the start of the stable review cycle for the 4.4.15 release.
> There are 32 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Sat Jul  9 01:16:17 UTC 2016.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.15-rc1.gz
> or in the git tree and branch at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.4.y
> and the diffstat can be found below.
> 
> thanks,
> 
> greg k-h
> 

Compiled and booted on my test system. No dmesg regressions.

thanks,
-- Shuah






Re: [PATCH 4.4 00/32] 4.4.15-stable review

2016-07-07 Thread Shuah Khan
On 07/06/2016 07:19 PM, Greg Kroah-Hartman wrote:
> This is the start of the stable review cycle for the 4.4.15 release.
> There are 32 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Sat Jul  9 01:16:17 UTC 2016.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.15-rc1.gz
> or in the git tree and branch at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.4.y
> and the diffstat can be found below.
> 
> thanks,
> 
> greg k-h
> 

Compiled and booted on my test system. No dmesg regressions.

thanks,
-- Shuah






Re: [PATCH 4.6 00/31] 4.6.4-stable review

2016-07-07 Thread Shuah Khan
On 07/06/2016 07:18 PM, Greg Kroah-Hartman wrote:
> ---
> Note, I'm on vacation this week, so I only took a few "easy" patches for
> the stable trees, due to me not having much time to debug anything here,
> and because, well, I'm on vacation and supposed to be ignoring patches.
> So if you have marked patches for inclusion, or emailed asking for
> things to be included, and you don't see them here (which you almost
> certainly will not), just wait a week or so before panicking please.
> ---
> 
> This is the start of the stable review cycle for the 4.6.4 release.
> There are 31 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Sat Jul  9 01:15:40 UTC 2016.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.6.4-rc1.gz
> or in the git tree and branch at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.6.y
> and the diffstat can be found below.
> 

Compiled and booted on my test system. No dmesg regressions.

thanks,
-- Shuah



Re: [PATCH 4.6 00/31] 4.6.4-stable review

2016-07-07 Thread Shuah Khan
On 07/06/2016 07:18 PM, Greg Kroah-Hartman wrote:
> ---
> Note, I'm on vacation this week, so I only took a few "easy" patches for
> the stable trees, due to me not having much time to debug anything here,
> and because, well, I'm on vacation and supposed to be ignoring patches.
> So if you have marked patches for inclusion, or emailed asking for
> things to be included, and you don't see them here (which you almost
> certainly will not), just wait a week or so before panicking please.
> ---
> 
> This is the start of the stable review cycle for the 4.6.4 release.
> There are 31 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
> 
> Responses should be made by Sat Jul  9 01:15:40 UTC 2016.
> Anything received after that time might be too late.
> 
> The whole patch series can be found in one patch at:
>   kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.6.4-rc1.gz
> or in the git tree and branch at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.6.y
> and the diffstat can be found below.
> 

Compiled and booted on my test system. No dmesg regressions.

thanks,
-- Shuah



RE: [PATCH 2/4] soc: fsl: add GUTS driver for QorIQ platforms

2016-07-07 Thread Yangbo Lu
Hi Arnd,


> -----Original Message-----
> From: Arnd Bergmann [mailto:a...@arndb.de]
> Sent: Thursday, July 07, 2016 4:30 PM
> To: Yangbo Lu
> Cc: Scott Wood; linuxppc-...@lists.ozlabs.org; Mark Rutland; Ulf Hansson;
> linux-kernel@vger.kernel.org; linux-...@vger.kernel.org; linux-
> c...@vger.kernel.org; Qiang Zhao; Russell King; Bhupesh Sharma; Joerg
> Roedel; Claudiu Manoil; devicet...@vger.kernel.org; Kumar Gala; Rob
> Herring; Santosh Shilimkar; linux-arm-ker...@lists.infradead.org;
> net...@vger.kernel.org; linux-...@vger.kernel.org; Xiaobo Xie; Yang-Leo
> Li; io...@lists.linux-foundation.org
> Subject: Re: [PATCH 2/4] soc: fsl: add GUTS driver for QorIQ platforms
> 
> On Thursday, July 7, 2016 2:35:33 AM CEST Yangbo Lu wrote:
> > Hi Arnd,
> >
> > Could you reply when you see the email?
> > If your method doesn’t resolve the problem, we still want to use our
> old patchset.
> >
> > This guts driver had been discussed about one year and blocked many
> workaround upstream.
> > So please help to review and comment soon.
> >
> 
> I don't really see how more discussion is going to help us here. I think
> I've made it pretty clear that I don't want to see another platform
> specific way to read an SoC revision and I've even sent a proof-of-
> concept patch to show how the interface can work, now it's up to you to
> fit the guts hardware into that and send a new patch series.
> 
>   Arnd

I think your proof-of-concept patch is still under discussion. Some answers are
needed from you to address Scott's comments on your patchset. Have you reached
an agreement?

Thanks.

- Yangbo Lu





RE: [PATCH 2/4] soc: fsl: add GUTS driver for QorIQ platforms

2016-07-07 Thread Yangbo Lu
Hi Arnd,


> -----Original Message-----
> From: Arnd Bergmann [mailto:a...@arndb.de]
> Sent: Thursday, July 07, 2016 4:30 PM
> To: Yangbo Lu
> Cc: Scott Wood; linuxppc-...@lists.ozlabs.org; Mark Rutland; Ulf Hansson;
> linux-kernel@vger.kernel.org; linux-...@vger.kernel.org; linux-
> c...@vger.kernel.org; Qiang Zhao; Russell King; Bhupesh Sharma; Joerg
> Roedel; Claudiu Manoil; devicet...@vger.kernel.org; Kumar Gala; Rob
> Herring; Santosh Shilimkar; linux-arm-ker...@lists.infradead.org;
> net...@vger.kernel.org; linux-...@vger.kernel.org; Xiaobo Xie; Yang-Leo
> Li; io...@lists.linux-foundation.org
> Subject: Re: [PATCH 2/4] soc: fsl: add GUTS driver for QorIQ platforms
> 
> On Thursday, July 7, 2016 2:35:33 AM CEST Yangbo Lu wrote:
> > Hi Arnd,
> >
> > Could you reply when you see the email?
> > If your method doesn’t resolve the problem, we still want to use our
> old patchset.
> >
> > This guts driver had been discussed about one year and blocked many
> workaround upstream.
> > So please help to review and comment soon.
> >
> 
> I don't really see how more discussion is going to help us here. I think
> I've made it pretty clear that I don't want to see another platform
> specific way to read an SoC revision and I've even sent a proof-of-
> concept patch to show how the interface can work, now it's up to you to
> fit the guts hardware into that and send a new patch series.
> 
>   Arnd

I think your proof-of-concept patch is still under discussion. Some answers are
needed from you to address Scott's comments on your patchset. Have you reached
an agreement?

Thanks.

- Yangbo Lu





Re: [PATCH 1/1] arm64/hugetlb: clear PG_dcache_clean if the page is dirty when munmap

2016-07-07 Thread Leizhen (ThunderTown)


On 2016/7/7 23:37, Catalin Marinas wrote:
> On Thu, Jul 07, 2016 at 08:09:04PM +0800, Zhen Lei wrote:
>> At present, PG_dcache_clean is only cleared when the related huge page
>> is about to be freed. But sometimes a process may be in charge of
>> copying binary code into shared memory and then notifying other processes
>> to execute based on it. The first time, there is no problem, because
>> PG_dcache_clean is cleared in the default value of page->flags, so the cache
>> is maintained at the time of set_pte_at for the other processes. But if
>> the content of the shared memory is updated again, no cache operations
>> are performed, because PG_dcache_clean is still set.
>>
>> For example:
>> Process A
>>  open a hugetlbfs file
>>  mmap it as a shared memory
>>  copy some binary codes into it
>>  munmap
>>
>> Process B
>>  open the hugetlbfs file
>>  mmap it as a shared memory, executable
>>  invoke the functions in the shared memory
>>  munmap
>>
>> repeat the above steps.
> 
> Does this work as you would expect with small pages (and for example
> shared file mmap)? I don't want to have a different behaviour between
> small and huge pages.

The small pages also have this problem, I will try to fix it too.

> 



Re: [PATCH 1/1] arm64/hugetlb: clear PG_dcache_clean if the page is dirty when munmap

2016-07-07 Thread Leizhen (ThunderTown)


On 2016/7/7 23:37, Catalin Marinas wrote:
> On Thu, Jul 07, 2016 at 08:09:04PM +0800, Zhen Lei wrote:
>> At present, PG_dcache_clean is only cleared when the related huge page
>> is about to be freed. But sometimes a process may be in charge of
>> copying binary code into shared memory and then notifying other processes
>> to execute based on it. The first time, there is no problem, because
>> PG_dcache_clean is cleared in the default value of page->flags, so the cache
>> is maintained at the time of set_pte_at for the other processes. But if
>> the content of the shared memory is updated again, no cache operations
>> are performed, because PG_dcache_clean is still set.
>>
>> For example:
>> Process A
>>  open a hugetlbfs file
>>  mmap it as a shared memory
>>  copy some binary codes into it
>>  munmap
>>
>> Process B
>>  open the hugetlbfs file
>>  mmap it as a shared memory, executable
>>  invoke the functions in the shared memory
>>  munmap
>>
>> repeat the above steps.
> 
> Does this work as you would expect with small pages (and for example
> shared file mmap)? I don't want to have a different behaviour between
> small and huge pages.

The small pages also have this problem, I will try to fix it too.

> 



Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread Andrew Vagin
On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
> On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
> > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages) wrote:
> > > On 7 July 2016 at 17:01, James Bottomley
> > >  wrote:
> > [Serge already answered the parenting issue]
> > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
> > > > > Hm.  Probably best-effort based on the process hierarchy.  So 
> > > > > yeah you could probably get a tree into a state that would be 
> > > > > wrongly recreated. Create a new netns, bind mount it, exit;  Have 
> > > > > another task create a new user_ns, bind mount it, exit;  Third 
> > > > > task setns()s first to the new netns then to the new user_ns.  I 
> > > > > suspect criu will recreate that wrongly.
> > > > 
> > > > This is a bit pathological, and you have to be root to do it: so 
> > > > root can set up a nesting hierarchy, bind it and destroy the pids 
> > > > but I know of no current orchestration system which does this.
> > > > 
> > > > Actually, I have to back pedal a bit: the way I currently set up
> > > > architecture emulation containers does precisely this: I set up the
> > > > namespaces unprivileged with child mount namespaces, but then I ask
> > > > root to bind the userns and kill the process that created it so I 
> > > > have a permanent handle to enter the namespace by, so I suspect 
> > > > that when our current orchestration systems get more sophisticated, 
> > > > they might eventually want to do something like this as well.
> > > > 
> > > > In theory, we could get nsfs to show this information as an option
> > > > (just add a show_options entry to the superblock ops), but the 
> > > > problem is that although each namespace has a parent user_ns, 
> > > > there's no way to get it without digging in the namespace specific 
> > > > structure.  Probably we should restructure to move it into 
> > > > ns_common, then we could display it (and enforce all namespaces 
> > > > having owning user_ns) but it would be a
> > > 
> > > I'm missing something here. Is it not already the case that all
> > > namespaces have an owning user_ns?
> > 
> > Um, yes, I don't believe I said they don't.  The problem I thought you
> > were having is that there's no way of seeing what it is.
> > 
> > nsfs is the Namespace filesystem where bound namespaces appear to a cat
> > of /proc/self/mounts.  It can display any information that's in
> > ns_common (the common core of namespaces) but the owning user_ns
> > pointer currently isn't in this structure.  Every user namespace has a
> > pointer to it, but they're all privately embedded in the individual
> > namespace specific structures.  What I was proposing was that since
> > every current namespace has a pointer somewhere to the owning user
> > namespace, we could abstract this out into ns_common so it's now
> > accessible to be displayed by nsfs, probably as a mount option.
> 
> James, I am not sure that I understood you correctly. We have one
> file system for all namespace files, so how can we show per-file properties
> in mount options? I think we can show all required information in
> fdinfo. We open a namespace file (/proc/pid/ns/N) and then read
> /proc/pid/fdinfo/X for it.

Here is a proof-of-concept patch.

How it works:

In [1]: import os

In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)

In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
pos:0
flags:  010
mnt_id: 2
userns: 4026531837

In [4]: print "/proc/self/ns/user -> %s" % os.readlink("/proc/self/ns/user")
/proc/self/ns/user -> user:[4026531837]

> 
> > 
> > James
> > 
> > 
> > ___
> > CRIU mailing list
> > c...@openvz.org
> > https://lists.openvz.org/mailman/listinfo/criu
> ___
> CRIU mailing list
> c...@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d60..bfd5bde 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -8,8 +8,20 @@
 
 static struct vfsmount *nsfs_mnt;
 
+static void show_fdinfo(struct seq_file *m, struct file *f)
+{
+   struct dentry *dentry = f->f_path.dentry;
+   struct inode *inode = d_inode(dentry);
+   const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+   struct ns_common *ns = inode->i_private;
+
+   if (ns_ops->show_fdinfo)
+   ns_ops->show_fdinfo(m, ns);
+}
+
 static const struct file_operations ns_file_operations = {
.llseek = no_llseek,
+   .show_fdinfo= show_fdinfo,
 };
 
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index de0e771..fed276b 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -18,6 +18,7 @@ struct proc_ns_operations {
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common 

Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread Andrew Vagin
On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
> On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
> > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages) wrote:
> > > On 7 July 2016 at 17:01, James Bottomley
> > >  wrote:
> > [Serge already answered the parenting issue]
> > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
> > > > > Hm.  Probably best-effort based on the process hierarchy.  So 
> > > > > yeah you could probably get a tree into a state that would be 
> > > > > wrongly recreated. Create a new netns, bind mount it, exit;  Have 
> > > > > another task create a new user_ns, bind mount it, exit;  Third 
> > > > > task setns()s first to the new netns then to the new user_ns.  I 
> > > > > suspect criu will recreate that wrongly.
> > > > 
> > > > This is a bit pathological, and you have to be root to do it: so 
> > > > root can set up a nesting hierarchy, bind it and destroy the pids 
> > > > but I know of no current orchestration system which does this.
> > > > 
> > > > Actually, I have to back pedal a bit: the way I currently set up
> > > > architecture emulation containers does precisely this: I set up the
> > > > namespaces unprivileged with child mount namespaces, but then I ask
> > > > root to bind the userns and kill the process that created it so I 
> > > > have a permanent handle to enter the namespace by, so I suspect 
> > > > that when our current orchestration systems get more sophisticated, 
> > > > they might eventually want to do something like this as well.
> > > > 
> > > > In theory, we could get nsfs to show this information as an option
> > > > (just add a show_options entry to the superblock ops), but the 
> > > > problem is that although each namespace has a parent user_ns, 
> > > > there's no way to get it without digging in the namespace specific 
> > > > structure.  Probably we should restructure to move it into 
> > > > ns_common, then we could display it (and enforce all namespaces 
> > > > having owning user_ns) but it would be a
> > > 
> > > I'm missing something here. Is it not already the case that all
> > > namespaces have an owning user_ns?
> > 
> > Um, yes, I don't believe I said they don't.  The problem I thought you
> > were having is that there's no way of seeing what it is.
> > 
> > nsfs is the Namespace filesystem where bound namespaces appear to a cat
> > of /proc/self/mounts.  It can display any information that's in
> > ns_common (the common core of namespaces) but the owning user_ns
> > pointer currently isn't in this structure.  Every user namespace has a
> > pointer to it, but they're all privately embedded in the individual
> > namespace specific structures.  What I was proposing was that since
> > every current namespace has a pointer somewhere to the owning user
> > namespace, we could abstract this out into ns_common so it's now
> > accessible to be displayed by nsfs, probably as a mount option.
> 
> James, I am not sure that I understood you correctly. We have one
> file system for all namespace files, so how can we show per-file properties
> in mount options? I think we can show all required information in
> fdinfo. We open a namespace file (/proc/pid/ns/N) and then read
> /proc/pid/fdinfo/X for it.

Here is a proof-of-concept patch.

How it works:

In [1]: import os

In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)

In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
pos:0
flags:  010
mnt_id: 2
userns: 4026531837

In [4]: print "/proc/self/ns/user -> %s" % os.readlink("/proc/self/ns/user")
/proc/self/ns/user -> user:[4026531837]

> 
> > 
> > James
> > 
> > 
> > ___
> > CRIU mailing list
> > c...@openvz.org
> > https://lists.openvz.org/mailman/listinfo/criu
> ___
> CRIU mailing list
> c...@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d60..bfd5bde 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -8,8 +8,20 @@
 
 static struct vfsmount *nsfs_mnt;
 
+static void show_fdinfo(struct seq_file *m, struct file *f)
+{
+   struct dentry *dentry = f->f_path.dentry;
+   struct inode *inode = d_inode(dentry);
+   const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+   struct ns_common *ns = inode->i_private;
+
+   if (ns_ops->show_fdinfo)
+   ns_ops->show_fdinfo(m, ns);
+}
+
 static const struct file_operations ns_file_operations = {
.llseek = no_llseek,
+   .show_fdinfo= show_fdinfo,
 };
 
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index de0e771..fed276b 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -18,6 +18,7 @@ struct proc_ns_operations {
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
int (*install)(struct 

linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/extent-tree.c

between commit:

  b286384aac32 ("btrfs: root->fs_info cleanup, add fs_info convenience 
variables")
[again, no committer Signed-off-by]

from the btrfs-kdave tree and commit:

  b3d3fa519905 ("btrfs: update __btrfs_map_block for REQ_OP transition")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/extent-tree.c
index dd7e454d1cf5,5796c4a9eec6..
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@@ -2047,10 -2046,10 +2047,10 @@@ int btrfs_discard_extent(struct btrfs_f
 * Avoid races with device replace and make sure our bbio has devices
 * associated to its stripes that don't go away while we are discarding.
 */
 -  btrfs_bio_counter_inc_blocked(root->fs_info);
 +  btrfs_bio_counter_inc_blocked(fs_info);
/* Tell the block device(s) that the sectors can be discarded */
-   ret = btrfs_map_block(fs_info, REQ_DISCARD, bytenr, &num_bytes,
 -  ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD,
 -bytenr, &num_bytes, &bbio, 0);
++  ret = btrfs_map_block(fs_info, REQ_OP_DISCARD, bytenr, &num_bytes,
 +&bbio, 0);
/* Error condition is -ENOMEM */
if (!ret) {
struct btrfs_bio_stripe *stripe = bbio->stripes;


linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/extent-tree.c

between commit:

  b286384aac32 ("btrfs: root->fs_info cleanup, add fs_info convenience 
variables")
[again, no committer Signed-off-by]

from the btrfs-kdave tree and commit:

  b3d3fa519905 ("btrfs: update __btrfs_map_block for REQ_OP transition")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/extent-tree.c
index dd7e454d1cf5,5796c4a9eec6..
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@@ -2047,10 -2046,10 +2047,10 @@@ int btrfs_discard_extent(struct btrfs_f
 * Avoid races with device replace and make sure our bbio has devices
 * associated to its stripes that don't go away while we are discarding.
 */
 -  btrfs_bio_counter_inc_blocked(root->fs_info);
 +  btrfs_bio_counter_inc_blocked(fs_info);
/* Tell the block device(s) that the sectors can be discarded */
-   ret = btrfs_map_block(fs_info, REQ_DISCARD, bytenr, &num_bytes,
 -  ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD,
 -bytenr, &num_bytes, &bbio, 0);
++  ret = btrfs_map_block(fs_info, REQ_OP_DISCARD, bytenr, &num_bytes,
 +&bbio, 0);
/* Error condition is -ENOMEM */
if (!ret) {
struct btrfs_bio_stripe *stripe = bbio->stripes;


linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/disk-io.c

between commit:

  b286384aac32 ("btrfs: root->fs_info cleanup, add fs_info convenience 
variables")
  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")

from the btrfs-kdave tree and commit:

  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/disk-io.c
index 9d778665c9eb,e80ef6eb17e6..
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@@ -926,7 -921,7 +922,7 @@@ static int __btree_submit_bio_done(stru
 * when we're called for a write, we're already in the async
 * submission context.  Just jump into btrfs_map_bio
 */
-   ret = btrfs_map_bio(btrfs_sb(inode->i_sb), rw, bio, mirror_num, 1);
 -  ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
++  ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@@ -958,23 -952,24 +954,23 @@@ static int btree_submit_bio_hook(struc
 * called for a read, do the setup so that checksum validation
 * can happen in the async kernel threads
 */
 -  ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
 -bio, BTRFS_WQ_ENDIO_METADATA);
 +  ret = btrfs_bio_wq_end_io(fs_info, bio,
 +BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
-   ret = btrfs_map_bio(fs_info, rw, bio, mirror_num, 0);
 -  ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
++  ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
-   ret = btrfs_map_bio(fs_info, rw, bio, mirror_num, 0);
 -  ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
++  ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
} else {
/*
 * kthread helpers are used to submit writes so that
 * checksumming can happen in parallel across all CPUs
 */
-   ret = btrfs_wq_submit_bio(fs_info, inode, rw, bio,
 -  ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 -inode, bio, mirror_num, 0,
 -bio_offset,
++  ret = btrfs_wq_submit_bio(fs_info, inode, bio,
 +mirror_num, 0, bio_offset,
  __btree_submit_bio_start,
  __btree_submit_bio_done);
}


linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/disk-io.c

between commit:

  b286384aac32 ("btrfs: root->fs_info cleanup, add fs_info convenience 
variables")
  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")

from the btrfs-kdave tree and commit:

  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/disk-io.c
index 9d778665c9eb,e80ef6eb17e6..
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@@ -926,7 -921,7 +922,7 @@@ static int __btree_submit_bio_done(stru
 * when we're called for a write, we're already in the async
 * submission context.  Just jump into btrfs_map_bio
 */
-   ret = btrfs_map_bio(btrfs_sb(inode->i_sb), rw, bio, mirror_num, 1);
 -  ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
++  ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@@ -958,23 -952,24 +954,23 @@@ static int btree_submit_bio_hook(struc
 * called for a read, do the setup so that checksum validation
 * can happen in the async kernel threads
 */
 -  ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
 -bio, BTRFS_WQ_ENDIO_METADATA);
 +  ret = btrfs_bio_wq_end_io(fs_info, bio,
 +BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
-   ret = btrfs_map_bio(fs_info, rw, bio, mirror_num, 0);
 -  ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
++  ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
-   ret = btrfs_map_bio(fs_info, rw, bio, mirror_num, 0);
 -  ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
++  ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
} else {
/*
 * kthread helpers are used to submit writes so that
 * checksumming can happen in parallel across all CPUs
 */
-   ret = btrfs_wq_submit_bio(fs_info, inode, rw, bio,
 -  ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 -inode, bio, mirror_num, 0,
 -bio_offset,
++  ret = btrfs_wq_submit_bio(fs_info, inode, bio,
 +mirror_num, 0, bio_offset,
  __btree_submit_bio_start,
  __btree_submit_bio_done);
}


Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread James Bottomley
On Thu, 2016-07-07 at 20:00 -0700, Andrew Vagin wrote:
> On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
> > On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
> > > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages)
> > > wrote:
> > > > On 7 July 2016 at 17:01, James Bottomley
> > > >  wrote:
> > > [Serge already answered the parenting issue]
> > > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
> > > > > > Hm.  Probably best-effort based on the process hierarchy. 
> > > > > >  So 
> > > > > > yeah you could probably get a tree into a state that would
> > > > > > be 
> > > > > > wrongly recreated. Create a new netns, bind mount it, exit;
> > > > > >   Have 
> > > > > > another task create a new user_ns, bind mount it, exit; 
> > > > > >  Third 
> > > > > > task setns()s first to the new netns then to the new
> > > > > > user_ns.  I 
> > > > > > suspect criu will recreate that wrongly.
> > > > > 
> > > > > This is a bit pathological, and you have to be root to do it:
> > > > > so 
> > > > > root can set up a nesting hierarchy, bind it and destroy the
> > > > > pids 
> > > > > but I know of no current orchestration system which does
> > > > > this.
> > > > > 
> > > > > Actually, I have to back pedal a bit: the way I currently set
> > > > > up
> > > > > architecture emulation containers does precisely this: I set
> > > > > up the
> > > > > namespaces unprivileged with child mount namespaces, but then
> > > > > I ask
> > > > > root to bind the userns and kill the process that created it
> > > > > so I 
> > > > > have a permanent handle to enter the namespace by, so I
> > > > > suspect 
> > > > > that when our current orchestration systems get more
> > > > > sophisticated, 
> > > > > they might eventually want to do something like this as well.
> > > > > 
> > > > > In theory, we could get nsfs to show this information as an
> > > > > option
> > > > > (just add a show_options entry to the superblock ops), but
> > > > > the 
> > > > > problem is that although each namespace has a parent user_ns,
> > > > > there's no way to get it without digging in the namespace
> > > > > specific 
> > > > > structure.  Probably we should restructure to move it into 
> > > > > ns_common, then we could display it (and enforce all
> > > > > namespaces 
> > > > > having owning user_ns) but it would be a
> > > > 
> > > > I'm missing something here. Is it not already the case that all
> > > > namespaces have an owning user_ns?
> > > 
> > > Um, yes, I don't believe I said they don't.  The problem I
> > > thought you
> > > were having is that there's no way of seeing what it is.
> > > 
> > > nsfs is the Namespace filesystem where bound namespaces appear to
> > > a cat
> > > of /proc/self/mounts.  It can display any information that's in
> > > ns_common (the common core of namespaces) but the owning user_ns
> > > pointer currently isn't in this structure.  Every user namespace
> > > has a
> > > pointer to it, but they're all privately embedded in the
> > > individual
> > > namespace specific structures.  What I was proposing was that
> > > since
> > > every current namespace has a pointer somewhere to the owning
> > > user
> > > namespace, we could abstract this out into ns_common so it's now
> > > accessible to be displayed by nsfs, probably as a mount option.
> > 
> > James, I am not sure that I understood you correctly. We have one
> > file system for all namespace files, how we can show per-file
> > properties
> > in mount options. I think we can show all required information in
> > fdinfo. We open a namespaces file (/proc/pid/ns/N) and then read
> > /proc/pid/fdinfo/X for it.
> 
> Here is a proof-of-concept patch.
> 
> How it works:
> 
> In [1]: import os
> 
> In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)
> 
> In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
> pos:  0
> flags:010
> mnt_id:   2
> userns: 4026531837
> 
> In [4]: print "/proc/self/ns/user -> %s" %
> os.readlink("/proc/self/ns/user")
> /proc/self/ns/user -> user:[4026531837]

can't you just do

readlink /proc/self/ns/user | sed 's/.*\[\(.*\)\]/\1/'

?

But what Michael was asking about was the parent user_ns of all the
other namespaces ... I don't think there's any way we can get that out
of any information in /proc/self/

James




Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread James Bottomley
On Thu, 2016-07-07 at 20:00 -0700, Andrew Vagin wrote:
> On Thu, Jul 07, 2016 at 07:16:18PM -0700, Andrew Vagin wrote:
> > On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
> > > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages)
> > > wrote:
> > > > On 7 July 2016 at 17:01, James Bottomley
> > > >  wrote:
> > > [Serge already answered the parenting issue]
> > > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
> > > > > > Hm.  Probably best-effort based on the process hierarchy. 
> > > > > >  So 
> > > > > > yeah you could probably get a tree into a state that would
> > > > > > be 
> > > > > > wrongly recreated. Create a new netns, bind mount it, exit;
> > > > > >   Have 
> > > > > > another task create a new user_ns, bind mount it, exit; 
> > > > > >  Third 
> > > > > > task setns()s first to the new netns then to the new
> > > > > > user_ns.  I 
> > > > > > suspect criu will recreate that wrongly.
> > > > > 
> > > > > This is a bit pathological, and you have to be root to do it:
> > > > > so 
> > > > > root can set up a nesting hierarchy, bind it and destroy the
> > > > > pids 
> > > > > but I know of no current orchestration system which does
> > > > > this.
> > > > > 
> > > > > Actually, I have to back pedal a bit: the way I currently set
> > > > > up
> > > > > architecture emulation containers does precisely this: I set
> > > > > up the
> > > > > namespaces unprivileged with child mount namespaces, but then
> > > > > I ask
> > > > > root to bind the userns and kill the process that created it
> > > > > so I 
> > > > > have a permanent handle to enter the namespace by, so I
> > > > > suspect 
> > > > > that when our current orchestration systems get more
> > > > > sophisticated, 
> > > > > they might eventually want to do something like this as well.
> > > > > 
> > > > > In theory, we could get nsfs to show this information as an
> > > > > option
> > > > > (just add a show_options entry to the superblock ops), but
> > > > > the 
> > > > > problem is that although each namespace has a parent user_ns,
> > > > > there's no way to get it without digging in the namespace
> > > > > specific 
> > > > > structure.  Probably we should restructure to move it into 
> > > > > ns_common, then we could display it (and enforce all
> > > > > namespaces 
> > > > > having owning user_ns) but it would be a
> > > > 
> > > > I'm missing something here. Is it not already the case that all
> > > > namespaces have an owning user_ns?
> > > 
> > > Um, yes, I don't believe I said they don't.  The problem I
> > > thought you
> > > were having is that there's no way of seeing what it is.
> > > 
> > > nsfs is the Namespace filesystem where bound namespaces appear to
> > > a cat
> > > of /proc/self/mounts.  It can display any information that's in
> > > ns_common (the common core of namespaces) but the owning user_ns
> > > pointer currently isn't in this structure.  Every user namespace
> > > has a
> > > pointer to it, but they're all privately embedded in the
> > > individual
> > > namespace specific structures.  What I was proposing was that
> > > since
> > > every current namespace has a pointer somewhere to the owning
> > > user
> > > namespace, we could abstract this out into ns_common so it's now
> > > accessible to be displayed by nsfs, probably as a mount option.
> > 
> > James, I am not sure that I understood you correctly. We have one
> > file system for all namespace files, how we can show per-file
> > properties
> > in mount options. I think we can show all required information in
> > fdinfo. We open a namespaces file (/proc/pid/ns/N) and then read
> > /proc/pid/fdinfo/X for it.
> 
> Here is a proof-of-concept patch.
> 
> How it works:
> 
> In [1]: import os
> 
> In [2]: fd = os.open("/proc/self/ns/pid", os.O_RDONLY)
> 
> In [3]: print open("/proc/self/fdinfo/%d" % fd).read()
> pos:  0
> flags:010
> mnt_id:   2
> userns: 4026531837
> 
> In [4]: print "/proc/self/ns/user -> %s" %
> os.readlink("/proc/self/ns/user")
> /proc/self/ns/user -> user:[4026531837]

can't you just do

readlink /proc/self/ns/user | sed 's/.*\[\(.*\)\]/\1/'

?

But what Michael was asking about was the parent user_ns of all the
other namespaces ... I don't think there's any way we can get that out
of any information in /proc/self/

James




linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/compression.c

between commit:

  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")
[This commit has no Signed-off-by from its committer :-(]
  d42b410c1511 ("Btrfs: fix BUG_ON in btrfs_submit_compressed_write")

from the btrfs-kdave tree and commit:

  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/compression.c
index 36ef2e8f3c87,cefedabf0a92..
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@@ -400,11 -402,8 +401,11 @@@ int btrfs_submit_compressed_write(struc
BUG_ON(ret); /* -ENOMEM */
}
  
-   ret = btrfs_map_bio(fs_info, WRITE, bio, 0, 1);
 -  ret = btrfs_map_bio(root, bio, 0, 1);
 -  BUG_ON(ret); /* -ENOMEM */
++  ret = btrfs_map_bio(fs_info, bio, 0, 1);
 +  if (ret) {
 +  bio->bi_error = ret;
 +  bio_endio(bio);
 +  }
  
bio_put(bio);
  
@@@ -433,11 -433,8 +435,11 @@@
BUG_ON(ret); /* -ENOMEM */
}
  
-   ret = btrfs_map_bio(fs_info, WRITE, bio, 0, 1);
 -  ret = btrfs_map_bio(root, bio, 0, 1);
 -  BUG_ON(ret); /* -ENOMEM */
++  ret = btrfs_map_bio(fs_info, bio, 0, 1);
 +  if (ret) {
 +  bio->bi_error = ret;
 +  bio_endio(bio);
 +  }
  
bio_put(bio);
return 0;
@@@ -690,10 -688,9 +693,9 @@@ int btrfs_submit_compressed_read(struc
BUG_ON(ret); /* -ENOMEM */
}
sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 -   root->sectorsize);
 +   fs_info->sectorsize);
  
-   ret = btrfs_map_bio(fs_info, READ, comp_bio,
-   mirror_num, 0);
 -  ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
++  ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
bio->bi_error = ret;
bio_endio(comp_bio);
@@@ -721,7 -720,7 +724,7 @@@
BUG_ON(ret); /* -ENOMEM */
}
  
-   ret = btrfs_map_bio(fs_info, READ, comp_bio, mirror_num, 0);
 -  ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
++  ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
bio->bi_error = ret;
bio_endio(comp_bio);


linux-next: manual merge of the block tree with the btrfs-kdave tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  fs/btrfs/compression.c

between commit:

  26112f7f4726 ("btrfs: take an fs_info parameter directly when the root is not 
used otherwise")
[This commit has no Signed-off-by from its committer :-(]
  d42b410c1511 ("Btrfs: fix BUG_ON in btrfs_submit_compressed_write")

from the btrfs-kdave tree and commit:

  81a75f6781de ("btrfs: use bio fields for op and flags")

from the block tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc fs/btrfs/compression.c
index 36ef2e8f3c87,cefedabf0a92..
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@@ -400,11 -402,8 +401,11 @@@ int btrfs_submit_compressed_write(struc
BUG_ON(ret); /* -ENOMEM */
}
  
-   ret = btrfs_map_bio(fs_info, WRITE, bio, 0, 1);
 -  ret = btrfs_map_bio(root, bio, 0, 1);
 -  BUG_ON(ret); /* -ENOMEM */
++  ret = btrfs_map_bio(fs_info, bio, 0, 1);
 +  if (ret) {
 +  bio->bi_error = ret;
 +  bio_endio(bio);
 +  }
  
bio_put(bio);
  
@@@ -433,11 -433,8 +435,11 @@@
BUG_ON(ret); /* -ENOMEM */
}
  
-   ret = btrfs_map_bio(fs_info, WRITE, bio, 0, 1);
 -  ret = btrfs_map_bio(root, bio, 0, 1);
 -  BUG_ON(ret); /* -ENOMEM */
++  ret = btrfs_map_bio(fs_info, bio, 0, 1);
 +  if (ret) {
 +  bio->bi_error = ret;
 +  bio_endio(bio);
 +  }
  
bio_put(bio);
return 0;
@@@ -690,10 -688,9 +693,9 @@@ int btrfs_submit_compressed_read(struc
BUG_ON(ret); /* -ENOMEM */
}
sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 -   root->sectorsize);
 +   fs_info->sectorsize);
  
-   ret = btrfs_map_bio(fs_info, READ, comp_bio,
-   mirror_num, 0);
 -  ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
++  ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
bio->bi_error = ret;
bio_endio(comp_bio);
@@@ -721,7 -720,7 +724,7 @@@
BUG_ON(ret); /* -ENOMEM */
}
  
-   ret = btrfs_map_bio(fs_info, READ, comp_bio, mirror_num, 0);
 -  ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
++  ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
bio->bi_error = ret;
bio_endio(comp_bio);


Re: [PATCH v2] clkdev: add devm_of_clk_get()

2016-07-07 Thread Kuninori Morimoto

Hi Michael

Thank you for your feedback

> > struct clk *clk_get(struct device *dev, const char *con_id)
> > {
> > ...
> > if (dev) {
> > clk = __of_clk_get_by_name(dev->of_node, dev_id, con_id);
> >
> > ...
> > }
> > }
> > 
> > I would like to select specific device_node.
> 
> Do you have access to the struct device that you want to target? Can you
> pass that device into either clk_get or devm_clk_get?

If my understanding is correct, I think I can't.
In the case below, "sound_soc" has its *dev, but "cpu" and "codec" don't
have a *dev; they have a node only. Thus, we are using of_clk_get() for these now.

clk = of_clk_get(cpu, xxx);
clk = of_clk_get(codec, xxx);

sound_soc {
...
cpu {
...
=>  clocks = <>;
};
codec {
...
=>  clocks = <>;
};
};


Re: [PATCH v2] clkdev: add devm_of_clk_get()

2016-07-07 Thread Kuninori Morimoto

Hi Michael

Thank you for your feedback

> > struct clk *clk_get(struct device *dev, const char *con_id)
> > {
> > ...
> > if (dev) {
> > clk = __of_clk_get_by_name(dev->of_node, dev_id, con_id);
> >
> > ...
> > }
> > }
> > 
> > I would like to select specific device_node.
> 
> Do you have access to the struct device that you want to target? Can you
> pass that device into either clk_get or devm_clk_get?

If my understanding is correct, I think I can't.
In the case below, "sound_soc" has its *dev, but "cpu" and "codec" don't
have a *dev; they have a node only. Thus, we are using of_clk_get() for these now.

clk = of_clk_get(cpu, xxx);
clk = of_clk_get(codec, xxx);

sound_soc {
...
cpu {
...
=>  clocks = <>;
};
codec {
...
=>  clocks = <>;
};
};


Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread James Bottomley
On Thu, 2016-07-07 at 19:16 -0700, Andrew Vagin wrote:
> On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
> > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages)
> > wrote:
> > > On 7 July 2016 at 17:01, James Bottomley
> > >  wrote:
> > [Serge already answered the parenting issue]
> > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
> > > > > Hm.  Probably best-effort based on the process hierarchy.  So
> > > > > yeah you could probably get a tree into a state that would be
> > > > > wrongly recreated. Create a new netns, bind mount it, exit; 
> > > > >  Have 
> > > > > another task create a new user_ns, bind mount it, exit; 
> > > > >  Third 
> > > > > task setns()s first to the new netns then to the new user_ns.
> > > > >   I 
> > > > > suspect criu will recreate that wrongly.
> > > > 
> > > > This is a bit pathological, and you have to be root to do it:
> > > > so 
> > > > root can set up a nesting hierarchy, bind it and destroy the
> > > > pids 
> > > > but I know of no current orchestration system which does this.
> > > > 
> > > > Actually, I have to back pedal a bit: the way I currently set
> > > > up
> > > > architecture emulation containers does precisely this: I set up
> > > > the
> > > > namespaces unprivileged with child mount namespaces, but then I
> > > > ask
> > > > root to bind the userns and kill the process that created it so
> > > > I 
> > > > have a permanent handle to enter the namespace by, so I suspect
> > > > that when our current orchestration systems get more
> > > > sophisticated, 
> > > > they might eventually want to do something like this as well.
> > > > 
> > > > In theory, we could get nsfs to show this information as an
> > > > option
> > > > (just add a show_options entry to the superblock ops), but the 
> > > > problem is that although each namespace has a parent user_ns, 
> > > > there's no way to get it without digging in the namespace
> > > > specific 
> > > > structure.  Probably we should restructure to move it into 
> > > > ns_common, then we could display it (and enforce all namespaces
> > > > having owning user_ns) but it would be a
> > > 
> > > I'm missing something here. Is it not already the case that all
> > > namespaces have an owning user_ns?
> > 
> > Um, yes, I don't believe I said they don't.  The problem I thought
> > you
> > were having is that there's no way of seeing what it is.
> > 
> > nsfs is the Namespace filesystem where bound namespaces appear to a
> > cat
> > of /proc/self/mounts.  It can display any information that's in
> > ns_common (the common core of namespaces) but the owning user_ns
> > pointer currently isn't in this structure.  Every user namespace
> > has a
> > pointer to it, but they're all privately embedded in the individual
> > namespace specific structures.  What I was proposing was that since
> > every current namespace has a pointer somewhere to the owning user
> > namespace, we could abstract this out into ns_common so it's now
> > accessible to be displayed by nsfs, probably as a mount option.
> 
> James, I am not sure that I understood you correctly. We have one
> file system for all namespace files, how we can show per-file 
> properties in mount options.

We have two ways of getting information.  For a namespace that only
exists as a bind mount we only have what the mount/mountinfo shows, so
you see something like this:

jejb@jarvis:~> mount|grep nsfs
nsfs on /run/build-container/userns type nsfs (rw)
nsfs on /run/build-container/ppc64 type nsfs (rw)

the (rw) are the mount options.  We could add the ability to add other
mount options to this via the superblock .show_options callback.  We
could make it show the type and parent user namespace.

>  I think we can show all required information in fdinfo. We open a
> namespaces file (/proc/pid/ns/N) and then read /proc/pid/fdinfo/X for
> it.

Not if we don't have an extant process in the namespace, we can't use
these files because they don't exist, plus fdinfo on the
/proc/<pid>/ns/X doesn't tell you what the parent user_ns of X is
(again, we could add this information somewhere ... not sure where
yet).

James



Re: [CRIU] Introspecting userns relationships to other namespaces?

2016-07-07 Thread James Bottomley
On Thu, 2016-07-07 at 19:16 -0700, Andrew Vagin wrote:
> On Thu, Jul 07, 2016 at 12:17:35PM -0700, James Bottomley wrote:
> > On Thu, 2016-07-07 at 20:21 +0200, Michael Kerrisk (man-pages)
> > wrote:
> > > On 7 July 2016 at 17:01, James Bottomley
> > >  wrote:
> > [Serge already answered the parenting issue]
> > > > On Thu, 2016-07-07 at 08:36 -0500, Serge E. Hallyn wrote:
> > > > > Hm.  Probably best-effort based on the process hierarchy.  So
> > > > > yeah you could probably get a tree into a state that would be
> > > > > wrongly recreated. Create a new netns, bind mount it, exit; 
> > > > >  Have 
> > > > > another task create a new user_ns, bind mount it, exit; 
> > > > >  Third 
> > > > > task setns()s first to the new netns then to the new user_ns.
> > > > >   I 
> > > > > suspect criu will recreate that wrongly.
> > > > 
> > > > This is a bit pathological, and you have to be root to do it:
> > > > so 
> > > > root can set up a nesting hierarchy, bind it and destroy the
> > > > pids 
> > > > but I know of no current orchestration system which does this.
> > > > 
> > > > Actually, I have to back pedal a bit: the way I currently set
> > > > up
> > > > architecture emulation containers does precisely this: I set up
> > > > the
> > > > namespaces unprivileged with child mount namespaces, but then I
> > > > ask
> > > > root to bind the userns and kill the process that created it so
> > > > I 
> > > > have a permanent handle to enter the namespace by, so I suspect
> > > > that when our current orchestration systems get more
> > > > sophisticated, 
> > > > they might eventually want to do something like this as well.
> > > > 
> > > > In theory, we could get nsfs to show this information as an
> > > > option
> > > > (just add a show_options entry to the superblock ops), but the 
> > > > problem is that although each namespace has a parent user_ns, 
> > > > there's no way to get it without digging in the namespace
> > > > specific 
> > > > structure.  Probably we should restructure to move it into 
> > > > ns_common, then we could display it (and enforce all namespaces
> > > > having owning user_ns) but it would be a
> > > 
> > > I'm missing something here. Is it not already the case that all
> > > namespaces have an owning user_ns?
> > 
> > Um, yes, I don't believe I said they don't.  The problem I thought
> > you
> > were having is that there's no way of seeing what it is.
> > 
> > nsfs is the Namespace filesystem where bound namespaces appear to a
> > cat
> > of /proc/self/mounts.  It can display any information that's in
> > ns_common (the common core of namespaces) but the owning user_ns
> > pointer currently isn't in this structure.  Every user namespace
> > has a
> > pointer to it, but they're all privately embedded in the individual
> > namespace specific structures.  What I was proposing was that since
> > every current namespace has a pointer somewhere to the owning user
> > namespace, we could abstract this out into ns_common so it's now
> > accessible to be displayed by nsfs, probably as a mount option.
> 
> James, I am not sure that I understood you correctly. We have one
> file system for all namespace files, how we can show per-file 
> properties in mount options.

We have two ways of getting information.  For a namespace that only
exists as a bind mount we only have what the mount/mountinfo shows, so
you see something like this:

jejb@jarvis:~> mount|grep nsfs
nsfs on /run/build-container/userns type nsfs (rw)
nsfs on /run/build-container/ppc64 type nsfs (rw)

the (rw) are the mount options.  We could add the ability to add other
mount options to this via the superblock .show_options callback.  We
could make it show the type and parent user namespace.

>  I think we can show all required information in fdinfo. We open a
> namespaces file (/proc/pid/ns/N) and then read /proc/pid/fdinfo/X for
> it.

Not if we don't have an extant process in the namespace, we can't use
these files because they don't exist, plus fdinfo on the
/proc/<pid>/ns/X doesn't tell you what the parent user_ns of X is
(again, we could add this information somewhere ... not sure where
yet).

James



Re: [PATCH v3] f2fs: fix to avoid data update racing between GC and DIO

2016-07-07 Thread Jaegeuk Kim
Hi Chao,

Could you take a look at this in xfstests/generic/013?

[  502.480850] ==
[  502.480864] [ INFO: possible circular locking dependency detected ]
[  502.480877] 4.7.0-rc1+ #124 Tainted: G   OE  
[  502.480886] ---
[  502.480897] fsstress/10729 is trying to acquire lock:
[  502.480906]  (>s_type->i_mutex_key#18){+.+.+.}, at: [] 
do_blockdev_direct_IO+0x1db/0x2310
[  502.480948] 
[  502.480948] but task is already holding lock:
[  502.480959]  (>dio_rwsem){.+.+.+}, at: [] 
f2fs_direct_IO+0xd1/0x3d0 [f2fs]
[  502.481003] 
[  502.481003] which lock already depends on the new lock.
[  502.481003] 
[  502.481018] 
[  502.481018] the existing dependency chain (in reverse order) is:
[  502.481030] 
[  502.481030] -> #1 (>dio_rwsem){.+.+.+}:
[  502.481054][] lock_acquire+0xd3/0x220
[  502.481071][] down_read+0x51/0xa0
[  502.481089][] f2fs_direct_IO+0xd1/0x3d0 [f2fs]
[  502.481114][] generic_file_direct_write+0xa7/0x160
[  502.481133][] __generic_file_write_iter+0xbd/0x1e0
[  502.481149][] f2fs_file_write_iter+0xdb/0x100 
[f2fs]
[  502.481173][] __vfs_write+0xc8/0x140
[  502.481190][] vfs_write+0xb5/0x1b0
[  502.481205][] SyS_write+0x49/0xa0
[  502.481220][] entry_SYSCALL_64_fastpath+0x23/0xc1
[  502.481236] 
[  502.481236] -> #0 (>s_type->i_mutex_key#18){+.+.+.}:
[  502.481264][] __lock_acquire+0x161c/0x1940
[  502.481280][] lock_acquire+0xd3/0x220
[  502.481296][] down_write+0x5a/0xc0
[  502.481312][] do_blockdev_direct_IO+0x1db/0x2310
[  502.481328][] __blockdev_direct_IO+0x3a/0x40
[  502.481344][] f2fs_direct_IO+0x104/0x3d0 [f2fs]
[  502.481368][] generic_file_read_iter+0x689/0x7e0
[  502.481384][] __vfs_read+0xc1/0x130
[  502.481399][] vfs_read+0x91/0x140
[  502.481414][] SyS_read+0x49/0xa0
[  502.481429][] entry_SYSCALL_64_fastpath+0x23/0xc1
[  502.481445] 
[  502.481445] other info that might help us debug this:
[  502.481445] 
[  502.481459]  Possible unsafe locking scenario:
[  502.481459] 
[  502.481726]CPU0CPU1
[  502.481987]
[  502.482242]   lock(>dio_rwsem);
[  502.482501]lock(>s_type->i_mutex_key#18);
[  502.482765]lock(>dio_rwsem);
[  502.483025]   lock(>s_type->i_mutex_key#18);
[  502.483285] 
[  502.483285]  *** DEADLOCK ***
[  502.483285] 
[  502.484018] 1 lock held by fsstress/10729:
[  502.484262]  #0:  (>dio_rwsem){.+.+.+}, at: [] 
f2fs_direct_IO+0xd1/0x3d0 [f2fs]

Thanks,

On Thu, Jul 07, 2016 at 12:49:12PM +0800, Chao Yu wrote:
> From: Chao Yu 
> 
> Data in a file can be operated on by GC and DIO simultaneously, so we can
> hit the race cases below:
> 
> For write case:
> Thread A  Thread B
> - generic_file_direct_write
>  - invalidate_inode_pages2_range
>  - f2fs_direct_IO
>   - do_blockdev_direct_IO
>- do_direct_IO
> - get_more_blocks
>   - f2fs_gc
>- do_garbage_collect
> - gc_data_segment
>  - move_data_page
>   - do_write_data_page
>   migrate data block to new block 
> address
>- dio_bio_submit
>update user data to old block address
> 
> For read case:
> Thread AThread B
> - generic_file_direct_write
>  - invalidate_inode_pages2_range
>  - f2fs_direct_IO
>   - do_blockdev_direct_IO
>- do_direct_IO
> - get_more_blocks
>   - f2fs_balance_fs
>- f2fs_gc
> - do_garbage_collect
>  - gc_data_segment
>   - move_data_page
>- do_write_data_page
>migrate data block to new block 
> address
> - write_checkpoint
>  - do_checkpoint
>   - clear_prefree_segments
>- f2fs_issue_discard
>  discard old block address
>- dio_bio_submit
>update user buffer from obsolete block address
> 
> In order to fix this, for any one file, we should make DIO and GC mutually
> exclusive.
> 
> Signed-off-by: Chao Yu 
> ---
> v3: use semaphore to avoid racing in between read dio and write dio.
>  fs/f2fs/data.c  |  4 
>  fs/f2fs/f2fs.h  |  1 +
>  

Re: [PATCH v3] f2fs: fix to avoid data update racing between GC and DIO

2016-07-07 Thread Jaegeuk Kim
Hi Chao,

Could you take a look at this in xfstests/generic/013?

[  502.480850] ==
[  502.480864] [ INFO: possible circular locking dependency detected ]
[  502.480877] 4.7.0-rc1+ #124 Tainted: G   OE  
[  502.480886] ---
[  502.480897] fsstress/10729 is trying to acquire lock:
[  502.480906]  (>s_type->i_mutex_key#18){+.+.+.}, at: [] 
do_blockdev_direct_IO+0x1db/0x2310
[  502.480948] 
[  502.480948] but task is already holding lock:
[  502.480959]  (>dio_rwsem){.+.+.+}, at: [] 
f2fs_direct_IO+0xd1/0x3d0 [f2fs]
[  502.481003] 
[  502.481003] which lock already depends on the new lock.
[  502.481003] 
[  502.481018] 
[  502.481018] the existing dependency chain (in reverse order) is:
[  502.481030] 
[  502.481030] -> #1 (>dio_rwsem){.+.+.+}:
[  502.481054][] lock_acquire+0xd3/0x220
[  502.481071][] down_read+0x51/0xa0
[  502.481089][] f2fs_direct_IO+0xd1/0x3d0 [f2fs]
[  502.481114][] generic_file_direct_write+0xa7/0x160
[  502.481133][] __generic_file_write_iter+0xbd/0x1e0
[  502.481149][] f2fs_file_write_iter+0xdb/0x100 
[f2fs]
[  502.481173][] __vfs_write+0xc8/0x140
[  502.481190][] vfs_write+0xb5/0x1b0
[  502.481205][] SyS_write+0x49/0xa0
[  502.481220][] entry_SYSCALL_64_fastpath+0x23/0xc1
[  502.481236] 
[  502.481236] -> #0 (>s_type->i_mutex_key#18){+.+.+.}:
[  502.481264][] __lock_acquire+0x161c/0x1940
[  502.481280][] lock_acquire+0xd3/0x220
[  502.481296][] down_write+0x5a/0xc0
[  502.481312][] do_blockdev_direct_IO+0x1db/0x2310
[  502.481328][] __blockdev_direct_IO+0x3a/0x40
[  502.481344][] f2fs_direct_IO+0x104/0x3d0 [f2fs]
[  502.481368][] generic_file_read_iter+0x689/0x7e0
[  502.481384][] __vfs_read+0xc1/0x130
[  502.481399][] vfs_read+0x91/0x140
[  502.481414][] SyS_read+0x49/0xa0
[  502.481429][] entry_SYSCALL_64_fastpath+0x23/0xc1
[  502.481445] 
[  502.481445] other info that might help us debug this:
[  502.481445] 
[  502.481459]  Possible unsafe locking scenario:
[  502.481459] 
[  502.481726]CPU0CPU1
[  502.481987]
[  502.482242]   lock(>dio_rwsem);
[  502.482501]lock(>s_type->i_mutex_key#18);
[  502.482765]lock(>dio_rwsem);
[  502.483025]   lock(>s_type->i_mutex_key#18);
[  502.483285] 
[  502.483285]  *** DEADLOCK ***
[  502.483285] 
[  502.484018] 1 lock held by fsstress/10729:
[  502.484262]  #0:  (>dio_rwsem){.+.+.+}, at: [] 
f2fs_direct_IO+0xd1/0x3d0 [f2fs]

Thanks,

On Thu, Jul 07, 2016 at 12:49:12PM +0800, Chao Yu wrote:
> From: Chao Yu 
> 
> Data in a file can be operated on by GC and DIO simultaneously, so we will
> face the race cases below:
> 
> For write case:
> Thread A  Thread B
> - generic_file_direct_write
>  - invalidate_inode_pages2_range
>  - f2fs_direct_IO
>   - do_blockdev_direct_IO
>- do_direct_IO
> - get_more_blocks
>   - f2fs_gc
>- do_garbage_collect
> - gc_data_segment
>  - move_data_page
>   - do_write_data_page
>   migrate data block to new block 
> address
>- dio_bio_submit
>update user data to old block address
> 
> For read case:
> Thread AThread B
> - generic_file_direct_write
>  - invalidate_inode_pages2_range
>  - f2fs_direct_IO
>   - do_blockdev_direct_IO
>- do_direct_IO
> - get_more_blocks
>   - f2fs_balance_fs
>- f2fs_gc
> - do_garbage_collect
>  - gc_data_segment
>   - move_data_page
>- do_write_data_page
>migrate data block to new block 
> address
> - write_checkpoint
>  - do_checkpoint
>   - clear_prefree_segments
>- f2fs_issue_discard
>  discard old block address
>- dio_bio_submit
>update user buffer from obsolete block address
> 
> In order to fix this, for one file, we should make DIO and GC exclude
> each other.
> 
> Signed-off-by: Chao Yu 
> ---
> v3: use semaphore to avoid racing in between read dio and write dio.
>  fs/f2fs/data.c  |  4 
>  fs/f2fs/f2fs.h  |  1 +
>  fs/f2fs/gc.c| 13 +
>  

RE: [PATCH v2 0/3] Cache id

2016-07-07 Thread Yu, Fenghua
> From: Borislav Petkov [mailto:b...@suse.de]
> Sent: Thursday, July 07, 2016 9:21 AM
> On Wed, Jul 06, 2016 at 03:07:15PM -0700, Fenghua Yu wrote:
> > From: Fenghua Yu 
> >
> > This patch set introduces cache id to identify a cache in platform. It
> > can be useful in such areas as Cache Allocation Technology (CAT) where
> > user needs to specify how much cache is allocated on which cache.
> > Cache id provides a concise way to identify the cache. CAT patches
> > will be released separately.
> >
> > Changes:
> > v2: Split one patch into three patches and add ABI documentation.
> >
> > Fenghua Yu (3):
> >   cacheinfo: Introduce cache id
> >   Documentation, ABI: Add a document entry for cache id
> >   x86, intel_cacheinfo: Enable cache id in x86
> >
> >  Documentation/ABI/testing/sysfs-devices-system-cpu | 13
> +
> >  arch/x86/kernel/cpu/intel_cacheinfo.c  | 20
> 
> >  drivers/base/cacheinfo.c   |  5 +
> >  include/linux/cacheinfo.h  |  3 +++
> >  4 files changed, 41 insertions(+)
> 
> All 3:
> 
> Acked-by: Borislav Petkov 

That's great!

Is it possible to merge the patches to 4.8? Then I don't need to carry these 
patches with upcoming CAT enabling patches:)

Thanks.

-Fenghua


RE: [PATCH v2 0/3] Cache id

2016-07-07 Thread Yu, Fenghua
> From: Borislav Petkov [mailto:b...@suse.de]
> Sent: Thursday, July 07, 2016 9:21 AM
> On Wed, Jul 06, 2016 at 03:07:15PM -0700, Fenghua Yu wrote:
> > From: Fenghua Yu 
> >
> > This patch set introduces cache id to identify a cache in platform. It
> > can be useful in such areas as Cache Allocation Technology (CAT) where
> > user needs to specify how much cache is allocated on which cache.
> > Cache id provides a concise way to identify the cache. CAT patches
> > will be released separately.
> >
> > Changes:
> > v2: Split one patch into three patches and add ABI documentation.
> >
> > Fenghua Yu (3):
> >   cacheinfo: Introduce cache id
> >   Documentation, ABI: Add a document entry for cache id
> >   x86, intel_cacheinfo: Enable cache id in x86
> >
> >  Documentation/ABI/testing/sysfs-devices-system-cpu | 13
> +
> >  arch/x86/kernel/cpu/intel_cacheinfo.c  | 20
> 
> >  drivers/base/cacheinfo.c   |  5 +
> >  include/linux/cacheinfo.h  |  3 +++
> >  4 files changed, 41 insertions(+)
> 
> All 3:
> 
> Acked-by: Borislav Petkov 

That's great!

Is it possible to merge the patches to 4.8? Then I don't need to carry these 
patches with upcoming CAT enabling patches:)

Thanks.

-Fenghua


linux-next: manual merge of the block tree with Linus' tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  drivers/block/xen-blkfront.c

between commit:

  7b427a59538a ("xen-blkfront: save uncompleted reqs in blkfront_resume()")

from Linus' tree and commit:

  c2df40dfb8c0 ("drivers: use req op accessor")
  3a5e02ced11e ("block, drivers: add REQ_OP_FLUSH operation")
  288dab8a35a0 ("block: add a separate operation type for secure erase")

from the block tree.

I fixed it up (I *think* - see below) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/block/xen-blkfront.c
index fcc5b4e0aef2,10711292da2c..
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@@ -2093,38 -2143,6 +2089,43 @@@ static int blkfront_resume(struct xenbu
  
dev_dbg(>dev, "blkfront_resume: %s\n", dev->nodename);
  
 +  bio_list_init(>bio_list);
 +  INIT_LIST_HEAD(>requests);
 +  for (i = 0; i < info->nr_rings; i++) {
 +  struct blkfront_ring_info *rinfo = >rinfo[i];
 +  struct bio_list merge_bio;
 +  struct blk_shadow *shadow = rinfo->shadow;
 +
 +  for (j = 0; j < BLK_RING_SIZE(info); j++) {
 +  /* Not in use? */
 +  if (!shadow[j].request)
 +  continue;
 +
 +  /*
 +   * Get the bios in the request so we can re-queue them.
 +   */
-   if (shadow[j].request->cmd_flags &
-   (REQ_FLUSH | REQ_FUA | REQ_DISCARD | 
REQ_SECURE)) {
++  if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
++  req_op(shadow[j].request) == REQ_OP_DISCARD ||
++  req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
++  shadow[j].request->cmd_flags & REQ_FUA)) {
 +  /*
 +   * Flush operations don't contain bios, so
 +   * we need to requeue the whole request
++   *
++   * XXX: but this doesn't make any sense for a
++   * write with the FUA flag set..
 +   */
 +  list_add([j].request->queuelist, 
>requests);
 +  continue;
 +  }
 +  merge_bio.head = shadow[j].request->bio;
 +  merge_bio.tail = shadow[j].request->biotail;
 +  bio_list_merge(>bio_list, _bio);
 +  shadow[j].request->bio = NULL;
 +  blk_mq_end_request(shadow[j].request, 0);
 +  }
 +  }
 +
blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
  
err = negotiate_mq(info);


linux-next: manual merge of the block tree with Linus' tree

2016-07-07 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  drivers/block/xen-blkfront.c

between commit:

  7b427a59538a ("xen-blkfront: save uncompleted reqs in blkfront_resume()")

from Linus' tree and commit:

  c2df40dfb8c0 ("drivers: use req op accessor")
  3a5e02ced11e ("block, drivers: add REQ_OP_FLUSH operation")
  288dab8a35a0 ("block: add a separate operation type for secure erase")

from the block tree.

I fixed it up (I *think* - see below) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/block/xen-blkfront.c
index fcc5b4e0aef2,10711292da2c..
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@@ -2093,38 -2143,6 +2089,43 @@@ static int blkfront_resume(struct xenbu
  
dev_dbg(>dev, "blkfront_resume: %s\n", dev->nodename);
  
 +  bio_list_init(>bio_list);
 +  INIT_LIST_HEAD(>requests);
 +  for (i = 0; i < info->nr_rings; i++) {
 +  struct blkfront_ring_info *rinfo = >rinfo[i];
 +  struct bio_list merge_bio;
 +  struct blk_shadow *shadow = rinfo->shadow;
 +
 +  for (j = 0; j < BLK_RING_SIZE(info); j++) {
 +  /* Not in use? */
 +  if (!shadow[j].request)
 +  continue;
 +
 +  /*
 +   * Get the bios in the request so we can re-queue them.
 +   */
-   if (shadow[j].request->cmd_flags &
-   (REQ_FLUSH | REQ_FUA | REQ_DISCARD | 
REQ_SECURE)) {
++  if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
++  req_op(shadow[j].request) == REQ_OP_DISCARD ||
++  req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
++  shadow[j].request->cmd_flags & REQ_FUA)) {
 +  /*
 +   * Flush operations don't contain bios, so
 +   * we need to requeue the whole request
++   *
++   * XXX: but this doesn't make any sense for a
++   * write with the FUA flag set..
 +   */
 +  list_add([j].request->queuelist, 
>requests);
 +  continue;
 +  }
 +  merge_bio.head = shadow[j].request->bio;
 +  merge_bio.tail = shadow[j].request->biotail;
 +  bio_list_merge(>bio_list, _bio);
 +  shadow[j].request->bio = NULL;
 +  blk_mq_end_request(shadow[j].request, 0);
 +  }
 +  }
 +
blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
  
err = negotiate_mq(info);


Re: [PATCH v11 00/22] Add HiSilicon RoCE driver

2016-07-07 Thread oulijun
在 2016/7/2 17:39, Lijun Ou 写道:
> The HiSilicon Network Subsystem is a long term evolution IP which is
> supposed to be used in HiSilicon ICT SoCs. HNS (HiSilicon Network
> Subsystem) also has hardware support for performing RDMA with
> RoCEE.
> The driver for HiSilicon RoCEE (RoCE Engine) is a platform driver and
> will support multiple versions of SoCs in future. This version of the driver
> is meant to support the Hip06 SoC (which conforms to RoCEv1 hardware
> specifications).
> 
> Changes v10 -> v11:
> [1/22]:
> 1. modify the print description of chip don't support roce
> 2. remove explicit values for enums for patch series
> [3/22]:
> 3. remove non-essential headers for patch series
> 4. add judgement for port_cnt is zero
> 5. Keep unified print style for "set mask..." vs. "No usable
>..."
> 6. modify the MODULE_LICENSE
> 7. remove MODULE_ALIAS
> [4/22]:
> 8. Move this line out of if-else and leave "if (enable)" part only
> 9. renaming the meaningful definition to 20 for patch series
> 10. delete extern keyword for hns_dsaf_roce_reset function
> 11. delete void keyword for hr_dev->hw->reset when driver removed
> [5/22]:
> 12. remove few unnecessary variables and some lines.
> 13. remove the function for one line of code which will be called
> once only for patch series
> [6/22]:
> 14. redesign the method for calculating token_mask' value
> [7/22]:
> 15. delete hns_roce_status_to_errno
> 16. modify the one enum only for all patches
> 17. remove the spin_lock in hns_roce_cq_event function
> 18. add comment here that 0x10 and 0x11 in hns_roce_event struct
> 19. refactor hns_roce_aeq_int function and It has switch in switch
> and it is almost 200 LOCs
> 20. simplify the lines for err_out_free_pages branch
> [8/22]:
> 21. remove icm and redesign it for patch series
> 
> Changes v9 -> v10:
> 1. delete redundant lines which it is netdevice.h in hns_roce_main.c
> 2. adjust the indentation for HNS_ROCE_V1_NUM_ASYNC_EQE
> 3. simplify the lines in hns_roce_init_qp_table function
> 4. add static type for hns_roce_unregister_device
> 5. move the call with hns_roce_unregister_device from the tenth patch to
>the eleventh patch in hns_roce_remove function
> 6. readjust the alphabetic order in MAINTAINERS
> 7. redesigned the way for getting irq names
> 8. avoid the memory leakage because mr->pbl is not free in
>hns_roce_mr function
> 9. avoid the memory leakage because not kfree table->icm when exception
> 10. add the link from LKML as well whose comment in all
> 
> Changes v8 -> v9:
> 1. delete the definition of ADDR_SHIFT_n, use literal 12, 32 and 44 and
>add comments
> 2. use roce_read/roce_write/readl/write instead of roce_readl/roce_writel
> 3. delete the print error/debug messages for memory allocation errors
> 4. use exit instead of uninit, for example hw->uninit -> hw->exit
> 5. use roce_raw_write instead of _raw_writel in eq_set_cons_index
> 6. modify the label with underscore
> 7. adjust the indentation for the macro definitions in hns_roce_hw_v1.c
> 8. simplify some lines in few functions and structures
> 9. adjust the alphabetic order in MAINTAINERS
> 
> Changes v7 -> v8:
> 1. add a verbs operation named get_port_immutable. It is an 
>independent patch
> 2. add a comment for the definition of ADDR_SHIFT_n, n are 12,32
>and 44
> 3. restructures the code to align with naming convention of the Linux
>according to the review of Doug Ledford
> 4. modify the state for all .c and .h files
> 
> Changes v6 -> v7:
> 1. modify some type of parameter, use bool replace the original type
> 2. add the Signed-off-by signatures in the first patch
> 3. delete the improper print sentence in hns_roce_create_eq.
> 
> Changes v5 -> v6:
> 1. modify the type of obj for unsigned long according the reviews, and
>modify the same questions in RoCE module
> 2. fix the spelling error
> 3. fix the Signed-off-by signatures
> 
> Changes v4 -> v5:
> 1. redesign the patchset for RoCE modules in order to split the huge
>patch into small patches
> 2. fix the directory path for RoCE module. Delete the hisilicon level.
> 3. modify the name of roce_v1_hw into roce_hw_v1
> 
> Changes v3 -> v4:
> 1. modify roce.o into hns-roce.o in Makefile and Kconfig file
> 
> Changes v2 -> v3:
> 1. modify the formats of RoCE driver code base v2 by the experts 
>reviewing. also, it used kmalloc_array instead of kmalloc, kcalloc
>instead of kzalloc, when refer to memory allocation for array
> 2. remove some functions without use and unconnected macros
> 3. modify the binding document with RoCE DT base v2 which added
>interrupt-names
> 4. redesign the port_map and si_map in hns_dsaf_roce_reset
> 5. add HiSilicon RoCE driver maintainers introduction in MAINTAINERS
>document
> 
> Changes v1 -> v2:
> 1. modify the formats of roce driver code by the experts reviewing
> 2. modify the bindings file with roce dts. add the attribute named 
>interrupt-names.
> 3. modify the way of defining port mode in hns_dsaf_main.c
> 

Re: [PATCH v11 00/22] Add HiSilicon RoCE driver

2016-07-07 Thread oulijun
在 2016/7/2 17:39, Lijun Ou 写道:
> The HiSilicon Network Subsystem is a long term evolution IP which is
> supposed to be used in HiSilicon ICT SoCs. HNS (HiSilicon Network
> Subsystem) also has hardware support for performing RDMA with
> RoCEE.
> The driver for HiSilicon RoCEE (RoCE Engine) is a platform driver and
> will support multiple versions of SoCs in future. This version of the driver
> is meant to support the Hip06 SoC (which conforms to RoCEv1 hardware
> specifications).
> 
> Changes v10 -> v11:
> [1/22]:
> 1. modify the print description of chip don't support roce
> 2. remove explicit values for enums for patch series
> [3/22]:
> 3. remove non-essential headers for patch series
> 4. add judgement for port_cnt is zero
> 5. Keep unified print style for "set mask..." vs. "No usable
>..."
> 6. modify the MODULE_LICENSE
> 7. remove MODULE_ALIAS
> [4/22]:
> 8. Move this line out of if-else and leave "if (enable)" part only
> 9. renaming the meaningful definition to 20 for patch series
> 10. delete extern keyword for hns_dsaf_roce_reset function
> 11. delete void keyword for hr_dev->hw->reset when driver removed
> [5/22]:
> 12. remove few unnecessary variables and some lines.
> 13. remove the function for one line of code which will be called
> once only for patch series
> [6/22]:
> 14. redesign the method for calculating token_mask' value
> [7/22]:
> 15. delete hns_roce_status_to_errno
> 16. modify the one enum only for all patches
> 17. remove the spin_lock in hns_roce_cq_event function
> 18. add comment here that 0x10 and 0x11 in hns_roce_event struct
> 19. refactor hns_roce_aeq_int function and It has switch in switch
> and it is almost 200 LOCs
> 20. simplify the lines for err_out_free_pages branch
> [8/22]:
> 21. remove icm and redesign it for patch series
> 
> Changes v9 -> v10:
> 1. delete redundant lines which it is netdevice.h in hns_roce_main.c
> 2. adjust the indentation for HNS_ROCE_V1_NUM_ASYNC_EQE
> 3. simplify the lines in hns_roce_init_qp_table function
> 4. add static type for hns_roce_unregister_device
> 5. move the call with hns_roce_unregister_device from the tenth patch to
>the eleventh patch in hns_roce_remove function
> 6. readjust the alphabetic order in MAINTAINERS
> 7. redesigned the way for getting irq names
> 8. avoid the memory leakage because mr->pbl is not free in
>hns_roce_mr function
> 9. avoid the memory leakage because not kfree table->icm when exception
> 10. add the link from LKML as well whose comment in all
> 
> Changes v8 -> v9:
> 1. delete the definition of ADDR_SHIFT_n, use literal 12, 32 and 44 and
>add comments
> 2. use roce_read/roce_write/readl/write instead of roce_readl/roce_writel
> 3. delete the print error/debug messages for memory allocation errors
> 4. use exit instead of uninit, for example hw->uninit -> hw->exit
> 5. use roce_raw_write instead of _raw_writel in eq_set_cons_index
> 6. modify the label with underscore
> 7. adjust the indentation for the macro definitions in hns_roce_hw_v1.c
> 8. simplify some lines in few functions and structures
> 9. adjust the alphabetic order in MAINTAINERS
> 
> Changes v7 -> v8:
> 1. add a verbs operation named get_port_immutable. It is an 
>independent patch
> 2. add a comment for the definition of ADDR_SHIFT_n, n are 12,32
>and 44
> 3. restructures the code to align with naming convention of the Linux
>according to the review of Doug Ledford
> 4. modify the state for all .c and .h files
> 
> Changes v6 -> v7:
> 1. modify some type of parameter, use bool replace the original type
> 2. add the Signed-off-by signatures in the first patch
> 3. delete the improper print sentence in hns_roce_create_eq.
> 
> Changes v5 -> v6:
> 1. modify the type of obj for unsigned long according the reviews, and
>modify the same questions in RoCE module
> 2. fix the spelling error
> 3. fix the Signed-off-by signatures
> 
> Changes v4 -> v5:
> 1. redesign the patchset for RoCE modules in order to split the huge
>patch into small patches
> 2. fix the directory path for RoCE module. Delete the hisilicon level.
> 3. modify the name of roce_v1_hw into roce_hw_v1
> 
> Changes v3 -> v4:
> 1. modify roce.o into hns-roce.o in Makefile and Kconfig file
> 
> Changes v2 -> v3:
> 1. modify the formats of RoCE driver code base v2 by the experts 
>reviewing. also, it used kmalloc_array instead of kmalloc, kcalloc
>instead of kzalloc, when refer to memory allocation for array
> 2. remove some functions without use and unconnected macros
> 3. modify the binding document with RoCE DT base v2 which added
>interrupt-names
> 4. redesign the port_map and si_map in hns_dsaf_roce_reset
> 5. add HiSilicon RoCE driver maintainers introduction in MAINTAINERS
>document
> 
> Changes v1 -> v2:
> 1. modify the formats of roce driver code by the experts reviewing
> 2. modify the bindings file with roce dts. add the attribute named 
>interrupt-names.
> 3. modify the way of defining port mode in hns_dsaf_main.c
> 

Re: [PATCH v7 00/11] powerpc/powernv/cpuidle: Add support for POWER ISA v3 idle states

2016-07-07 Thread Michael Neuling
Except for the issue with patch 7 that I've already commented on, the rest
of this series is good with me.  FWIW:

Acked-by: Michael Neuling 

Thanks.

On Fri, 2016-07-08 at 02:17 +0530, Shreyas B. Prabhu wrote:
> POWER ISA v3 defines a new idle processor core mechanism. In summary,
>  a) new instruction named stop is added. This instruction replaces
>   instructions like nap, sleep, rvwinkle.
>  b) new per thread SPR named PSSCR is added which controls the behavior
>   of stop instruction. 
>   
> PSSCR has following key fields
>   Bits 0:3  - Power-Saving Level Status. This field indicates the
>   lowest power-saving state the thread entered since stop
>   instruction was last executed.
>   
>   Bit 42 - Enable State Loss  
>   0 - No state is lost irrespective of other fields  
>   1 - Allows state loss
>   
>   Bits 44:47 - Power-Saving Level Limit  
>   This limits the power-saving level that can be entered into.
>   
>   Bits 60:63 - Requested Level  
>   Used to specify which power-saving level must be entered on
>   executing stop instruction
>   
> Stop idle states and their properties like name, latency, target
> residency, psscr value are exposed via device tree.
> 
> This patch series adds support for this new mechanism.
> 
> Patches 1-6 are cleanups and code movement.
> Patch 7 adds platform specific support for stop and psscr handling.
> Patch 8 and 9 are minor cleanup in cpuidle driver.
> Patch 10 adds cpuidle driver support.
> Patch 11 makes offlined cpu use deepest stop state.
> 
> Note: Documentation for the device tree bindings is posted here-
> http://patchwork.ozlabs.org/patch/629125/
> 
> Changes in v7
> =
>  - File renamed to idle_book3s.S instead of idle_power_common.S
>  - Comment changes
>  - power_stop0, power_stop renamed to power9_idle and power_idle_stop
>  - PSSCR template is now a macro instead of storing in paca
>  - power9_idle in C file instead of assembly
>  - Fixed TOC related bug
>  - Handling subcore within FTR section
>  - Functions in idle.c reordered and broken into multiple functions
>  - calling __restore_cpu_power8/9 via cur_cpu_spec->cpu_restore 
>  - Added a minor patch with minor cleanups in cpuidle-powernv.c . This
>    was mainly to make the existing code consistent with the review
>    comments for new code
>  - Using stack for variables while probing for idle states instead of
>    kzalloc/kcalloc
> 
> Changes in v6
> =
>  - Restore new POWER ISA v3 SPRS when waking up from deep idle
> 
> Changes in v5
> =
>  - Use generic cpuidle constant CPUIDLE_NAME_LEN
>  - Fix return code handling for of_property_read_string_array
>  - Use DT flags to determine if are using stop instruction, instead of
>    cpu_has_feature
>  - Removed unnecessary cast with names
>  - _loop -> stop_loop
>  - Added POWERNV_THRESHOLD_LATENCY_NS to filter out idle states with high 
> latency
> 
> Changes in v4
> =
>  - Added a patch to use PNV_THREAD_WINKLE macro while requesting for winkle
>  - Moved power7_powersave_common rename to more appropriate patch
>  - renaming power7_enter_nap_mode to pnv_enter_arch207_idle_mode
>  - Added PSSCR layout to Patch 7's commit message
>  - Improved / Fixed comments
>  - Fixed whitespace error in paca.h
>  - Using MAX_POSSIBLE_STOP_STATE macro instead of hardcoding 0xF as
>    max possible stop state
> 
> Changes in v3
> =
>  - Rebased on powerpc-next
>  - Dropping patch 1 since we are not adding a new file for P9 idle support
>  - Improved comments in multiple places
>  - Moved GET_PACA from power7_restore_hyp_resource to System Reset
>  - Instead of moving few functions from idle_power7 to idle_power_common,
>    renaming idle_power7.S to idle_power_common.S
>  - Moved HSTATE_HWTHREAD_STATE updation to power_powersave_common
>  - Dropped earlier patch 5 which moved few macros from idle_power_common to
>    asm/cpuidle.h. 
>  - Added a patch to rename reusable power7_* idle functions to pnv_*
>  - Added new patch that creates abstraction for saving SPRs before
>    entering deep idle states
>  - Instead of introducing new file idle_power_stop.S, P9 idle support
>    is added to idle_power_common.S using CPU_FTR sections.
>  - Fixed r4 reg clobbering in power_stop0
> 
> Changes in v2
> =
>  - Rebased on v4.6-rc6
>  - Using CPU_FTR_ARCH_300 bit instead of CPU_FTR_STOP_INST
> 
> Cc: Rafael J. Wysocki 
> Cc: Daniel Lezcano 
> Cc: linux...@vger.kernel.org
> Cc: Benjamin Herrenschmidt 
> Cc: Michael Ellerman 
> Cc: Paul Mackerras 
> Cc: Michael Neuling 
> Cc: linuxppc-...@lists.ozlabs.org
> Cc: Rob Herring 
> Cc: Lorenzo Pieralisi 
> 
> Shreyas 

Re: [PATCH v7 00/11] powerpc/powernv/cpuidle: Add support for POWER ISA v3 idle states

2016-07-07 Thread Michael Neuling
Except for the issue with patch 7 that I've already commented on, the rest
of this series is good with me.  FWIW:

Acked-by: Michael Neuling 

Thanks.

On Fri, 2016-07-08 at 02:17 +0530, Shreyas B. Prabhu wrote:
> POWER ISA v3 defines a new idle processor core mechanism. In summary,
>  a) new instruction named stop is added. This instruction replaces
>   instructions like nap, sleep, rvwinkle.
>  b) new per thread SPR named PSSCR is added which controls the behavior
>   of stop instruction. 
>   
> PSSCR has following key fields
>   Bits 0:3  - Power-Saving Level Status. This field indicates the
>   lowest power-saving state the thread entered since stop
>   instruction was last executed.
>   
>   Bit 42 - Enable State Loss  
>   0 - No state is lost irrespective of other fields  
>   1 - Allows state loss
>   
>   Bits 44:47 - Power-Saving Level Limit  
>   This limits the power-saving level that can be entered into.
>   
>   Bits 60:63 - Requested Level  
>   Used to specify which power-saving level must be entered on
>   executing stop instruction
>   
> Stop idle states and their properties like name, latency, target
> residency, psscr value are exposed via device tree.
> 
> This patch series adds support for this new mechanism.
> 
> Patches 1-6 are cleanups and code movement.
> Patch 7 adds platform specific support for stop and psscr handling.
> Patch 8 and 9 are minor cleanup in cpuidle driver.
> Patch 10 adds cpuidle driver support.
> Patch 11 makes offlined cpu use deepest stop state.
> 
> Note: Documentation for the device tree bindings is posted here-
> http://patchwork.ozlabs.org/patch/629125/
> 
> Changes in v7
> =
>  - File renamed to idle_book3s.S instead of idle_power_common.S
>  - Comment changes
>  - power_stop0, power_stop renamed to power9_idle and power_idle_stop
>  - PSSCR template is now a macro instead of storing in paca
>  - power9_idle in C file instead of assembly
>  - Fixed TOC related bug
>  - Handling subcore within FTR section
>  - Functions in idle.c reordered and broken into multiple functions
>  - calling __restore_cpu_power8/9 via cur_cpu_spec->cpu_restore 
>  - Added a minor patch with minor cleanups in cpuidle-powernv.c . This
>    was mainly to make the existing code consistent with the review
>    comments for new code
>  - Using stack for variables while probing for idle states instead of
>    kzalloc/kcalloc
> 
> Changes in v6
> =
>  - Restore new POWER ISA v3 SPRS when waking up from deep idle
> 
> Changes in v5
> =
>  - Use generic cpuidle constant CPUIDLE_NAME_LEN
>  - Fix return code handling for of_property_read_string_array
>  - Use DT flags to determine if are using stop instruction, instead of
>    cpu_has_feature
>  - Removed unnecessary cast with names
>  - _loop -> stop_loop
>  - Added POWERNV_THRESHOLD_LATENCY_NS to filter out idle states with high 
> latency
> 
> Changes in v4
> =
>  - Added a patch to use PNV_THREAD_WINKLE macro while requesting for winkle
>  - Moved power7_powersave_common rename to more appropriate patch
>  - renaming power7_enter_nap_mode to pnv_enter_arch207_idle_mode
>  - Added PSSCR layout to Patch 7's commit message
>  - Improved / Fixed comments
>  - Fixed whitespace error in paca.h
>  - Using MAX_POSSIBLE_STOP_STATE macro instead of hardcoding 0xF as
>    max possible stop state
> 
> Changes in v3
> =
>  - Rebased on powerpc-next
>  - Dropping patch 1 since we are not adding a new file for P9 idle support
>  - Improved comments in multiple places
>  - Moved GET_PACA from power7_restore_hyp_resource to System Reset
>  - Instead of moving few functions from idle_power7 to idle_power_common,
>    renaming idle_power7.S to idle_power_common.S
>  - Moved HSTATE_HWTHREAD_STATE updation to power_powersave_common
>  - Dropped earlier patch 5 which moved few macros from idle_power_common to
>    asm/cpuidle.h. 
>  - Added a patch to rename reusable power7_* idle functions to pnv_*
>  - Added new patch that creates abstraction for saving SPRs before
>    entering deep idle states
>  - Instead of introducing new file idle_power_stop.S, P9 idle support
>    is added to idle_power_common.S using CPU_FTR sections.
>  - Fixed r4 reg clobbering in power_stop0
> 
> Changes in v2
> =
>  - Rebased on v4.6-rc6
>  - Using CPU_FTR_ARCH_300 bit instead of CPU_FTR_STOP_INST
> 
> Cc: Rafael J. Wysocki 
> Cc: Daniel Lezcano 
> Cc: linux...@vger.kernel.org
> Cc: Benjamin Herrenschmidt 
> Cc: Michael Ellerman 
> Cc: Paul Mackerras 
> Cc: Michael Neuling 
> Cc: linuxppc-...@lists.ozlabs.org
> Cc: Rob Herring 
> Cc: Lorenzo Pieralisi 
> 
> Shreyas B. Prabhu (11):
>   powerpc/powernv: Use PNV_THREAD_WINKLE macro while requesting for
> winkle
>   powerpc/kvm: make hypervisor state restore a function
>   powerpc/powernv: Rename idle_power7.S 

Re: [PATCH v7 09/11] cpuidle/powernv: cleanup powernv_add_idle_states

2016-07-07 Thread Michael Neuling
>   /*
> @@ -230,7 +238,7 @@ static int powernv_add_idle_states(void)
>   strcpy(powernv_states[nr_idle_states].desc, 
> "FastSleep");
>   powernv_states[nr_idle_states].flags = 
> CPUIDLE_FLAG_TIMER_STOP;
>   powernv_states[nr_idle_states].target_residency = 
> 30;
> - powernv_states[nr_idle_states].enter = _loop;
> + powernv_states[nr_idle_states].enter = fastsleep_loop;

You can change this code too with the same thing.

static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
.exit_latency = 0,
.target_residency = 0,
.enter = _loop },
};

Mikey


Re: [PATCH v7 09/11] cpuidle/powernv: cleanup powernv_add_idle_states

2016-07-07 Thread Michael Neuling
>   /*
> @@ -230,7 +238,7 @@ static int powernv_add_idle_states(void)
>   strcpy(powernv_states[nr_idle_states].desc, 
> "FastSleep");
>   powernv_states[nr_idle_states].flags = 
> CPUIDLE_FLAG_TIMER_STOP;
>   powernv_states[nr_idle_states].target_residency = 
> 30;
> - powernv_states[nr_idle_states].enter = _loop;
> + powernv_states[nr_idle_states].enter = fastsleep_loop;

You can change this code too with the same thing.

static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
.exit_latency = 0,
.target_residency = 0,
.enter = &snooze_loop },
};

Mikey


Re: [PATCH 08/31] mm, vmscan: simplify the logic deciding whether kswapd sleeps

2016-07-07 Thread Joonsoo Kim
On Thu, Jul 07, 2016 at 11:17:01AM +0100, Mel Gorman wrote:
> On Thu, Jul 07, 2016 at 10:20:39AM +0900, Joonsoo Kim wrote:
> > > @@ -3249,9 +3249,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, 
> > > int order,
> > >  
> > >   prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
> > >  
> > > + /*
> > > +  * If kswapd has not been woken recently, then kswapd goes fully
> > > +  * to sleep. kcompactd may still need to wake if the original
> > > +  * request was high-order.
> > > +  */
> > > + if (classzone_idx == -1) {
> > > + wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
> > > + classzone_idx = MAX_NR_ZONES - 1;
> > > + goto full_sleep;
> > > + }
> > 
> > Passing -1 to kcompactd would cause the problem?
> > 
> 
> No, it ends up doing a wakeup and then going back to sleep which is not
> what is required. I'll fix it.
> 
> > > @@ -3390,12 +3386,24 @@ static int kswapd(void *p)
> > >* We can speed up thawing tasks if we don't call balance_pgdat
> > >* after returning from the refrigerator
> > >*/
> > > - if (!ret) {
> > > - trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
> > > + if (ret)
> > > + continue;
> > >  
> > > - /* return value ignored until next patch */
> > > - balance_pgdat(pgdat, order, classzone_idx);
> > > - }
> > > + /*
> > > +  * Reclaim begins at the requested order but if a high-order
> > > +  * reclaim fails then kswapd falls back to reclaiming for
> > > +  * order-0. If that happens, kswapd will consider sleeping
> > > +  * for the order it finished reclaiming at (reclaim_order)
> > > +  * but kcompactd is woken to compact for the original
> > > +  * request (alloc_order).
> > > +  */
> > > + trace_mm_vmscan_kswapd_wake(pgdat->node_id, alloc_order);
> > > + reclaim_order = balance_pgdat(pgdat, alloc_order, 
> > > classzone_idx);
> > > + if (reclaim_order < alloc_order)
> > > + goto kswapd_try_sleep;
> > 
> > This 'goto' would cause kswapd to sleep prematurely. We need to check
> > *new* pgdat->kswapd_order and classzone_idx even in this case.
> > 
> 
> It only matters if the next request coming is also high-order requests but
> one thing that needs to be avoided is kswapd staying awake periods of time
> constantly reclaiming for high-order pages. This is why the check means
> "If we reclaimed for high-order and failed, then consider sleeping now".
> If allocations still require it, they direct reclaim instead.

But assume that the next request is a zone-constrained allocation. We need
to balance memory for it, but kswapd would skip it.

> 
> "Fixing" this potentially causes reclaim storms from kswapd.
> 
> > > @@ -3418,10 +3426,10 @@ void wakeup_kswapd(struct zone *zone, int order, 
> > > enum zone_type classzone_idx)
> > >   if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
> > >   return;
> > >   pgdat = zone->zone_pgdat;
> > > - if (pgdat->kswapd_max_order < order) {
> > > - pgdat->kswapd_max_order = order;
> > > - pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
> > > - }
> > > + if (pgdat->kswapd_classzone_idx == -1)
> > > + pgdat->kswapd_classzone_idx = classzone_idx;
> > > + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, 
> > > classzone_idx);
> > > + pgdat->kswapd_order = max(pgdat->kswapd_order, order);
> > 
> > Now, updating pgdat->kswapd_max_order and classzone_idx happens
> > unconditionally. Before your patch, it is only updated toward hard
> > constraint (e.g. higher order).
> > 
> 
> So? It's updating the request to suit the requirements of all pending
> allocation requests that woke kswapd.
> 
> > And, I'd like to know why max() is used for classzone_idx rather than
> > min()? I think that kswapd should balance the lowest zone requested.
> > 
> 
> If there are two allocation requests -- one zone-constrained and the other
> zone-unconstrained, it does not make sense to have kswapd skip the pages
> usable for the zone-unconstrained and waste a load of CPU. You could

I agree that, in this case, it's not good to skip the pages usable
for the zone-unconstrained request. But what I am concerned about is that
kswapd stops reclaim prematurely from the point of view of the zone-constrained
requestor. Kswapd decides to stop reclaim if one of the eligible zones is
balanced, and this max() makes the eligible zone higher than the one the
zone-unconstrained requestor wants.

Thanks.

> argue that using min would satisfy the zone-constrained allocation faster
> but that's at the cost of delaying the zone-unconstrained allocation and
> wasting CPU. Bear in mind that using max may mean some lowmem pages get
> freed anyway due to LRU order.
> 
> -- 
> Mel Gorman
> SUSE Labs
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more 

Re: [PATCH 08/31] mm, vmscan: simplify the logic deciding whether kswapd sleeps

2016-07-07 Thread Joonsoo Kim
On Thu, Jul 07, 2016 at 11:17:01AM +0100, Mel Gorman wrote:
> On Thu, Jul 07, 2016 at 10:20:39AM +0900, Joonsoo Kim wrote:
> > > @@ -3249,9 +3249,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, 
> > > int order,
> > >  
> > >   prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
> > >  
> > > + /*
> > > +  * If kswapd has not been woken recently, then kswapd goes fully
> > > +  * to sleep. kcompactd may still need to wake if the original
> > > +  * request was high-order.
> > > +  */
> > > + if (classzone_idx == -1) {
> > > + wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
> > > + classzone_idx = MAX_NR_ZONES - 1;
> > > + goto full_sleep;
> > > + }
> > 
> > Passing -1 to kcompactd would cause the problem?
> > 
> 
> No, it ends up doing a wakeup and then going back to sleep which is not
> what is required. I'll fix it.
> 
> > > @@ -3390,12 +3386,24 @@ static int kswapd(void *p)
> > >* We can speed up thawing tasks if we don't call balance_pgdat
> > >* after returning from the refrigerator
> > >*/
> > > - if (!ret) {
> > > - trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
> > > + if (ret)
> > > + continue;
> > >  
> > > - /* return value ignored until next patch */
> > > - balance_pgdat(pgdat, order, classzone_idx);
> > > - }
> > > + /*
> > > +  * Reclaim begins at the requested order but if a high-order
> > > +  * reclaim fails then kswapd falls back to reclaiming for
> > > +  * order-0. If that happens, kswapd will consider sleeping
> > > +  * for the order it finished reclaiming at (reclaim_order)
> > > +  * but kcompactd is woken to compact for the original
> > > +  * request (alloc_order).
> > > +  */
> > > + trace_mm_vmscan_kswapd_wake(pgdat->node_id, alloc_order);
> > > + reclaim_order = balance_pgdat(pgdat, alloc_order, 
> > > classzone_idx);
> > > + if (reclaim_order < alloc_order)
> > > + goto kswapd_try_sleep;
> > 
> > This 'goto' would cause kswapd to sleep prematurely. We need to check
> > *new* pgdat->kswapd_order and classzone_idx even in this case.
> > 
> 
> It only matters if the next request coming is also high-order requests but
> one thing that needs to be avoided is kswapd staying awake periods of time
> constantly reclaiming for high-order pages. This is why the check means
> "If we reclaimed for high-order and failed, then consider sleeping now".
> If allocations still require it, they direct reclaim instead.

But assume that the next request is a zone-constrained allocation. We need
to balance memory for it, but kswapd would skip it.

> 
> "Fixing" this potentially causes reclaim storms from kswapd.
> 
> > > @@ -3418,10 +3426,10 @@ void wakeup_kswapd(struct zone *zone, int order, 
> > > enum zone_type classzone_idx)
> > >   if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
> > >   return;
> > >   pgdat = zone->zone_pgdat;
> > > - if (pgdat->kswapd_max_order < order) {
> > > - pgdat->kswapd_max_order = order;
> > > - pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
> > > - }
> > > + if (pgdat->kswapd_classzone_idx == -1)
> > > + pgdat->kswapd_classzone_idx = classzone_idx;
> > > + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, 
> > > classzone_idx);
> > > + pgdat->kswapd_order = max(pgdat->kswapd_order, order);
> > 
> > Now, updating pgdat->kswapd_max_order and classzone_idx happens
> > unconditionally. Before your patch, it is only updated toward hard
> > constraint (e.g. higher order).
> > 
> 
> So? It's updating the request to suit the requirements of all pending
> allocation requests that woke kswapd.
> 
> > And, I'd like to know why max() is used for classzone_idx rather than
> > min()? I think that kswapd should balance the lowest zone requested.
> > 
> 
> If there are two allocation requests -- one zone-constrained and the other
> zone-unconstrained, it does not make sense to have kswapd skip the pages
> usable for the zone-unconstrained and waste a load of CPU. You could

I agree that, in this case, it's not good to skip the pages usable
for the zone-unconstrained request. But what I am concerned about is that
kswapd stops reclaim prematurely from the point of view of the zone-constrained
requestor. Kswapd decides to stop reclaim if one of the eligible zones is
balanced, and this max() makes the eligible zone higher than the one the
zone-unconstrained requestor wants.

Thanks.

> argue that using min would satisfy the zone-constrained allocation faster
> but that's at the cost of delaying the zone-unconstrained allocation and
> wasting CPU. Bear in mind that using max may mean some lowmem pages get
> freed anyway due to LRU order.
> 
> -- 
> Mel Gorman
> SUSE Labs
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more 

Re: [PATCH v3 3/4] drm/bridge: analogix_dp: add the PSR function support

2016-07-07 Thread Yakir Yang



On 07/08/2016 10:26 AM, Yakir Yang wrote:

Sean,

Thanks for your review.

On 07/02/2016 03:46 AM, Sean Paul wrote:

On Fri, Jul 1, 2016 at 5:19 AM, Yakir Yang  wrote:

The full name of PSR is Panel Self Refresh, panel device could refresh
itself with the hardware framebuffer in panel, this would make lots of
sense to save the power consumption.

This patch have exported two symbols for platform driver to implement
the PSR function in hardware side:
- analogix_dp_active_psr()
- analogix_dp_inactive_psr()

Signed-off-by: Yakir Yang 
---
Changes in v3:
- split analogix_dp_enable_psr(), make it more clearly
 analogix_dp_detect_sink_psr()
 analogix_dp_enable_sink_psr()
- remove some noise register setting comments

Changes in v2:
- introduced in v2, split the common Analogix DP changes out

  drivers/gpu/drm/bridge/analogix/analogix_dp_core.c | 64 
++

  drivers/gpu/drm/bridge/analogix/analogix_dp_core.h |  4 ++
  drivers/gpu/drm/bridge/analogix/analogix_dp_reg.c  | 54 
++

  drivers/gpu/drm/bridge/analogix/analogix_dp_reg.h  | 28 ++
  include/drm/bridge/analogix_dp.h   |  3 +
  5 files changed, 153 insertions(+)

diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c 
b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c

index 32715da..b557097 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
@@ -97,6 +97,66 @@ static int analogix_dp_detect_hpd(struct 
analogix_dp_device *dp)

 return 0;
  }

+int analogix_dp_active_psr(struct device *dev)
+{
+   struct analogix_dp_device *dp = dev_get_drvdata(dev);
+
+   if (!dp->psr_support)
+   return -EINVAL;
+
+   analogix_dp_send_psr_spd(dp, EDP_VSC_PSR_STATE_ACTIVE |
+ EDP_VSC_PSR_CRC_VALUES_VALID);
+   return 0;
+}
+EXPORT_SYMBOL_GPL(analogix_dp_active_psr);
+
+int analogix_dp_inactive_psr(struct device *dev)
+{
+   struct analogix_dp_device *dp = dev_get_drvdata(dev);
+
+   if (!dp->psr_support)
+   return -EINVAL;
+
+   analogix_dp_send_psr_spd(dp, 0);
+   return 0;
+}
+EXPORT_SYMBOL_GPL(analogix_dp_inactive_psr);
+
+static bool analogix_dp_detect_sink_psr(struct analogix_dp_device *dp)
+{
+   unsigned char psr_version;
+
+   analogix_dp_read_byte_from_dpcd(dp, DP_PSR_SUPPORT, 
&psr_version);

+   dev_info(dp->dev, "Panel PSR version : %x\n", psr_version);
+

This info message is likely to be spammy since it's printed every time
the panel toggles on. Perhaps downgrade to debug level.


Okay, done.


+   return (psr_version & DP_PSR_IS_SUPPORTED) ? true : false;
+}
+
+static int analogix_dp_enable_sink_psr(struct analogix_dp_device *dp)

Return type is int, but the function never fails and you don't check
the return value when calling it. Seems like this should be void.


Done.


+{
+   unsigned char psr_en;
+
+   /* Disable psr function */
+   analogix_dp_read_byte_from_dpcd(dp, DP_PSR_EN_CFG, &psr_en);
+   psr_en &= ~DP_PSR_ENABLE;
+   analogix_dp_write_byte_to_dpcd(dp, DP_PSR_EN_CFG, psr_en);
+
+   /* Main-Link transmitter remains active during PSR active 
states */

+   analogix_dp_read_byte_from_dpcd(dp, DP_PSR_EN_CFG, &psr_en);
+   psr_en = DP_PSR_MAIN_LINK_ACTIVE | DP_PSR_CRC_VERIFICATION;
Why read psr_en if you're just going to overwrite it? Perhaps you 
meant |= here.




Yes, that's my mistake; there is no need to read DP_PSR_EN_CFG, 
configuring it directly is enough.



+ analogix_dp_write_byte_to_dpcd(dp, DP_PSR_EN_CFG, psr_en);
+
+   /* Enable psr function */
+   analogix_dp_read_byte_from_dpcd(dp, DP_PSR_EN_CFG, &psr_en);
+   psr_en = DP_PSR_ENABLE | DP_PSR_MAIN_LINK_ACTIVE |
+DP_PSR_CRC_VERIFICATION;

Again, no need to read if you're just overwriting.


Yes, ditto


+ analogix_dp_write_byte_to_dpcd(dp, DP_PSR_EN_CFG, psr_en);
+
+   analogix_dp_enable_psr_crc(dp);
+
+   return 0;
+}
+
  static unsigned char analogix_dp_calc_edid_check_sum(unsigned char 
*edid_data)

  {
 int i;
@@ -921,6 +981,10 @@ static void analogix_dp_commit(struct 
analogix_dp_device *dp)


 /* Enable video */
 analogix_dp_start_video(dp);
+
+   dp->psr_support = analogix_dp_detect_sink_psr(dp);
+   if (dp->psr_support)
+   analogix_dp_enable_sink_psr(dp);
  }

  int analogix_dp_get_modes(struct drm_connector *connector)
diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h 
b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h

index b456380..6ca5dde 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h
@@ -177,6 +177,7 @@ struct analogix_dp_device {
 int hpd_gpio;
 boolforce_hpd;
 unsigned char   edid[EDID_BLOCK_LENGTH * 2];
+   boolpsr_support;

  

RE: [PATCH v2] irqchip/qeic: move qeic driver from drivers/soc/fsl/qe

2016-07-07 Thread Qiang Zhao
On Thu, Jul 07, 2016 at 10:25PM , Jason Cooper  wrote:
> -Original Message-
> From: Jason Cooper [mailto:ja...@lakedaemon.net]
> Sent: Thursday, July 07, 2016 10:25 PM
> To: Qiang Zhao 
> Cc: o...@buserror.net; t...@linutronix.de; marc.zyng...@arm.com; linuxppc-
> d...@lists.ozlabs.org; linux-kernel@vger.kernel.org; Xiaobo Xie
> 
> Subject: Re: [PATCH v2] irqchip/qeic: move qeic driver from drivers/soc/fsl/qe
> 
> Hi Zhao Qiang,
> 
> On Thu, Jul 07, 2016 at 09:23:55AM +0800, Zhao Qiang wrote:
> > The driver stays the same.
> >
> > Signed-off-by: Zhao Qiang 
> > ---
> > Changes for v2:
> > - modify the subject and commit msg
> >
> >  drivers/irqchip/Makefile| 1 +
> >  drivers/{soc/fsl/qe => irqchip}/qe_ic.c | 0  drivers/{soc/fsl/qe =>
> > irqchip}/qe_ic.h | 0
> >  drivers/soc/fsl/qe/Makefile | 2 +-
> >  4 files changed, 2 insertions(+), 1 deletion(-)  rename
> > drivers/{soc/fsl/qe => irqchip}/qe_ic.c (100%)  rename
> > drivers/{soc/fsl/qe => irqchip}/qe_ic.h (100%)
> 
> Please merge the include file into the C file and rename to follow the naming
> convention in drivers/irqchip/.  e.g. irq-qeic.c or irq-qe_ic.c.
> 
> Once you have that, please resend the entire series with this as the first 
> patch.
> 

OK, I will modify the next version. 

-Zhao Qiang
BR


Re: [PATCH v3 3/4] drm/bridge: analogix_dp: add the PSR function support

2016-07-07 Thread Yakir Yang



On 07/08/2016 10:26 AM, Yakir Yang wrote:

Sean,

Thanks for your review.

On 07/02/2016 03:46 AM, Sean Paul wrote:

On Fri, Jul 1, 2016 at 5:19 AM, Yakir Yang  wrote:

The full name of PSR is Panel Self Refresh, panel device could refresh
itself with the hardware framebuffer in panel, this would make lots of
sense to save the power consumption.

This patch have exported two symbols for platform driver to implement
the PSR function in hardware side:
- analogix_dp_active_psr()
- analogix_dp_inactive_psr()

Signed-off-by: Yakir Yang 
---
Changes in v3:
- split analogix_dp_enable_psr(), make it more clearly
 analogix_dp_detect_sink_psr()
 analogix_dp_enable_sink_psr()
- remove some noise register setting comments

Changes in v2:
- introduced in v2, split the common Analogix DP changes out

  drivers/gpu/drm/bridge/analogix/analogix_dp_core.c | 64 
++

  drivers/gpu/drm/bridge/analogix/analogix_dp_core.h |  4 ++
  drivers/gpu/drm/bridge/analogix/analogix_dp_reg.c  | 54 
++

  drivers/gpu/drm/bridge/analogix/analogix_dp_reg.h  | 28 ++
  include/drm/bridge/analogix_dp.h   |  3 +
  5 files changed, 153 insertions(+)

diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c 
b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c

index 32715da..b557097 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
@@ -97,6 +97,66 @@ static int analogix_dp_detect_hpd(struct 
analogix_dp_device *dp)

 return 0;
  }

+int analogix_dp_active_psr(struct device *dev)
+{
+   struct analogix_dp_device *dp = dev_get_drvdata(dev);
+
+   if (!dp->psr_support)
+   return -EINVAL;
+
+   analogix_dp_send_psr_spd(dp, EDP_VSC_PSR_STATE_ACTIVE |
+ EDP_VSC_PSR_CRC_VALUES_VALID);
+   return 0;
+}
+EXPORT_SYMBOL_GPL(analogix_dp_active_psr);
+
+int analogix_dp_inactive_psr(struct device *dev)
+{
+   struct analogix_dp_device *dp = dev_get_drvdata(dev);
+
+   if (!dp->psr_support)
+   return -EINVAL;
+
+   analogix_dp_send_psr_spd(dp, 0);
+   return 0;
+}
+EXPORT_SYMBOL_GPL(analogix_dp_inactive_psr);
+
+static bool analogix_dp_detect_sink_psr(struct analogix_dp_device *dp)
+{
+   unsigned char psr_version;
+
+   analogix_dp_read_byte_from_dpcd(dp, DP_PSR_SUPPORT, 
&psr_version);

+   dev_info(dp->dev, "Panel PSR version : %x\n", psr_version);
+

This info message is likely to be spammy since it's printed every time
the panel toggles on. Perhaps downgrade to debug level.


Okay, done.


+   return (psr_version & DP_PSR_IS_SUPPORTED) ? true : false;
+}
+
+static int analogix_dp_enable_sink_psr(struct analogix_dp_device *dp)

Return type is int, but the function never fails and you don't check
the return value when calling it. Seems like this should be void.


Done.


+{
+   unsigned char psr_en;
+
+   /* Disable psr function */
+   analogix_dp_read_byte_from_dpcd(dp, DP_PSR_EN_CFG, &psr_en);
+   psr_en &= ~DP_PSR_ENABLE;
+   analogix_dp_write_byte_to_dpcd(dp, DP_PSR_EN_CFG, psr_en);
+
+   /* Main-Link transmitter remains active during PSR active 
states */

+   analogix_dp_read_byte_from_dpcd(dp, DP_PSR_EN_CFG, &psr_en);
+   psr_en = DP_PSR_MAIN_LINK_ACTIVE | DP_PSR_CRC_VERIFICATION;
Why read psr_en if you're just going to overwrite it? Perhaps you 
meant |= here.




Yes, that's my mistake; there is no need to read DP_PSR_EN_CFG, 
configuring it directly is enough.



+ analogix_dp_write_byte_to_dpcd(dp, DP_PSR_EN_CFG, psr_en);
+
+   /* Enable psr function */
+   analogix_dp_read_byte_from_dpcd(dp, DP_PSR_EN_CFG, &psr_en);
+   psr_en = DP_PSR_ENABLE | DP_PSR_MAIN_LINK_ACTIVE |
+DP_PSR_CRC_VERIFICATION;

Again, no need to read if you're just overwriting.


Yes, ditto


+ analogix_dp_write_byte_to_dpcd(dp, DP_PSR_EN_CFG, psr_en);
+
+   analogix_dp_enable_psr_crc(dp);
+
+   return 0;
+}
+
  static unsigned char analogix_dp_calc_edid_check_sum(unsigned char 
*edid_data)

  {
 int i;
@@ -921,6 +981,10 @@ static void analogix_dp_commit(struct 
analogix_dp_device *dp)


 /* Enable video */
 analogix_dp_start_video(dp);
+
+   dp->psr_support = analogix_dp_detect_sink_psr(dp);
+   if (dp->psr_support)
+   analogix_dp_enable_sink_psr(dp);
  }

  int analogix_dp_get_modes(struct drm_connector *connector)
diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h 
b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h

index b456380..6ca5dde 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.h
@@ -177,6 +177,7 @@ struct analogix_dp_device {
 int hpd_gpio;
 boolforce_hpd;
 unsigned char   edid[EDID_BLOCK_LENGTH * 2];
+   boolpsr_support;

 struct analogix_dp_plat_data 

RE: [PATCH v2] irqchip/qeic: move qeic driver from drivers/soc/fsl/qe

2016-07-07 Thread Qiang Zhao
On Thu, Jul 07, 2016 at 10:25PM , Jason Cooper  wrote:
> -Original Message-
> From: Jason Cooper [mailto:ja...@lakedaemon.net]
> Sent: Thursday, July 07, 2016 10:25 PM
> To: Qiang Zhao 
> Cc: o...@buserror.net; t...@linutronix.de; marc.zyng...@arm.com; linuxppc-
> d...@lists.ozlabs.org; linux-kernel@vger.kernel.org; Xiaobo Xie
> 
> Subject: Re: [PATCH v2] irqchip/qeic: move qeic driver from drivers/soc/fsl/qe
> 
> Hi Zhao Qiang,
> 
> On Thu, Jul 07, 2016 at 09:23:55AM +0800, Zhao Qiang wrote:
> > The driver stays the same.
> >
> > Signed-off-by: Zhao Qiang 
> > ---
> > Changes for v2:
> > - modify the subject and commit msg
> >
> >  drivers/irqchip/Makefile| 1 +
> >  drivers/{soc/fsl/qe => irqchip}/qe_ic.c | 0  drivers/{soc/fsl/qe =>
> > irqchip}/qe_ic.h | 0
> >  drivers/soc/fsl/qe/Makefile | 2 +-
> >  4 files changed, 2 insertions(+), 1 deletion(-)  rename
> > drivers/{soc/fsl/qe => irqchip}/qe_ic.c (100%)  rename
> > drivers/{soc/fsl/qe => irqchip}/qe_ic.h (100%)
> 
> Please merge the include file into the C file and rename to follow the naming
> convention in drivers/irqchip/.  e.g. irq-qeic.c or irq-qe_ic.c.
> 
> Once you have that, please resend the entire series with this as the first 
> patch.
> 

OK, I will modify the next version. 

-Zhao Qiang
BR


  1   2   3   4   5   6   7   8   9   10   >