[PATCH 1/4] w1: fix netlink refcnt leak on error path

2014-02-02 Thread David Fries
If the message type is W1_MASTER_CMD or W1_SLAVE_CMD, then a reference
is taken when searching for the slave or master device.  If there
isn't any following data, m->len (mlen is a copy) is 0 and packing up
the message for later execution is skipped, leaving nothing to
decrement the reference counts.

Way back when, m->len was checked before the search that increments the
reference count, but since W1_LIST_MASTERS has no additional data, the
check was moved in 9be62e0b2fadaf5ff, causing this bug.

This change reorders the code to put the check before the reference
count is incremented, avoiding the problem.

Signed-off-by: David Fries da...@fries.net
---
 drivers/w1/w1_netlink.c |   44 ++--
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/drivers/w1/w1_netlink.c b/drivers/w1/w1_netlink.c
index 5234964..a02704a 100644
--- a/drivers/w1/w1_netlink.c
+++ b/drivers/w1/w1_netlink.c
@@ -300,12 +300,6 @@ static int w1_process_command_root(struct cn_msg *msg,
struct w1_netlink_msg *w;
u32 *id;
 
-   if (mcmd-type != W1_LIST_MASTERS) {
-   printk(KERN_NOTICE %s: msg: %x.%x, wrong type: %u, len: %u.\n,
-   __func__, msg-id.idx, msg-id.val, mcmd-type, 
mcmd-len);
-   return -EPROTO;
-   }
-
cn = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!cn)
return -ENOMEM;
@@ -441,6 +435,9 @@ static void w1_process_cb(struct w1_master *dev, struct 
w1_async_cmd *async_cmd)
w1_netlink_send_error(node-block-msg, node-m, cmd,
node-block-portid, err);
 
+   /* ref taken in w1_search_slave or w1_search_master_id when building
+* the block
+*/
if (sl)
w1_unref_slave(sl);
else
@@ -503,30 +500,42 @@ static void w1_cn_callback(struct cn_msg *msg, struct 
netlink_skb_parms *nsp)
 
msg_len = msg-len;
while (msg_len  !err) {
-   struct w1_reg_num id;
-   u16 mlen = m-len;
 
dev = NULL;
sl = NULL;
 
-   memcpy(id, m-id.id, sizeof(id));
-#if 0
-   printk(%s: %02x.%012llx.%02x: type=%02x, len=%u.\n,
-   __func__, id.family, (unsigned long long)id.id, 
id.crc, m-type, m-len);
-#endif
if (m-len + sizeof(struct w1_netlink_msg)  msg_len) {
err = -E2BIG;
break;
}
 
+   /* execute on this thread, no need to process later */
+   if (m-type == W1_LIST_MASTERS) {
+   err = w1_process_command_root(msg, m, nsp-portid);
+   goto out_cont;
+   }
+
+   /* All following message types require additional data,
+* check here before references are taken.
+*/
+   if (!m-len) {
+   err = -EPROTO;
+   goto out_cont;
+   }
+
+   /* both search calls take reference counts */
if (m-type == W1_MASTER_CMD) {
dev = w1_search_master_id(m-id.mst.id);
} else if (m-type == W1_SLAVE_CMD) {
-   sl = w1_search_slave(id);
+   sl = w1_search_slave((struct w1_reg_num *)m-id.id);
if (sl)
dev = sl-master;
} else {
-   err = w1_process_command_root(msg, m, nsp-portid);
+   printk(KERN_NOTICE
+   %s: msg: %x.%x, wrong type: %u, len: %u.\n,
+   __func__, msg-id.idx, msg-id.val,
+   m-type, m-len);
+   err = -EPROTO;
goto out_cont;
}
 
@@ -536,8 +545,6 @@ static void w1_cn_callback(struct cn_msg *msg, struct 
netlink_skb_parms *nsp)
}
 
err = 0;
-   if (!mlen)
-   goto out_cont;
 
atomic_inc(block-refcnt);
node-async.cb = w1_process_cb;
@@ -557,7 +564,8 @@ out_cont:
if (err)
w1_netlink_send_error(msg, m, NULL, nsp-portid, err);
msg_len -= sizeof(struct w1_netlink_msg) + m-len;
-   m = (struct w1_netlink_msg *)(((u8 *)m) + sizeof(struct 
w1_netlink_msg) + m-len);
+   m = (struct w1_netlink_msg *)(((u8 *)m) +
+   sizeof(struct w1_netlink_msg) + m-len);
 
/*
 * Let's allow requests for nonexisting devices.
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] w1: document struct w1_netlink_msg and struct w1_netlink_cmd

2014-02-02 Thread David Fries
I wasn't sure on the length, so I looked it up and documented it.

Signed-off-by: David Fries da...@fries.net
---
 drivers/w1/w1_netlink.h |   25 +
 1 file changed, 25 insertions(+)

diff --git a/drivers/w1/w1_netlink.h b/drivers/w1/w1_netlink.h
index 1e9504e..c646a98 100644
--- a/drivers/w1/w1_netlink.h
+++ b/drivers/w1/w1_netlink.h
@@ -49,6 +49,19 @@ enum w1_netlink_message_types {
W1_LIST_MASTERS,
 };
 
+/**
+ * struct w1_netlink_msg - holds w1 message type, id, and result
+ *
+ * @type: one of enum w1_netlink_message_types
+ * @status: kernel feedback for success 0 or errno failure value
+ * @len: length of data following w1_netlink_msg
+ * @id: union holding master bus id (mst.id) and slave device id (id[8]).
+ * @data: start address of any following data
+ *
+ * The base message structure for w1 messages over netlink.
+ * The netlink connector data sequence is, struct nlmsghdr, struct cn_msg,
+ * then one or more struct w1_netlink_msg (each with optional data).
+ */
 struct w1_netlink_msg
 {
__u8type;
@@ -66,6 +79,7 @@ struct w1_netlink_msg
 
 /**
  * enum w1_commands - commands available for master or slave operations
+ *
  * @W1_CMD_READ: read len bytes
  * @W1_CMD_WRITE: write len bytes
  * @W1_CMD_SEARCH: initiate a standard search, returns only the slave
@@ -93,6 +107,17 @@ enum w1_commands {
W1_CMD_MAX
 };
 
+/**
+ * struct w1_netlink_cmd - holds the command and data
+ *
+ * @cmd: one of enum w1_commands
+ * @res: reserved
+ * @len: length of data following w1_netlink_cmd
+ * @data: start address of any following data
+ *
+ * One or more struct w1_netlink_cmd is placed starting at w1_netlink_msg.data
+ * each with optional data.
+ */
 struct w1_netlink_cmd
 {
__u8cmd;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH,RFC] random: collect cpu randomness

2014-02-02 Thread Jörn Engel
On Sun, 2 February 2014 22:25:31 +0100, Stephan Mueller wrote:
 Am Sonntag, 2. Februar 2014, 15:36:17 schrieb Jörn Engel:
 
  Collects entropy from random behaviour all modern cpus exhibit.  The
  scheduler and slab allocator are instrumented for this purpose.  How
  much randomness can be gathered is clearly hardware-dependent and hard
  to estimate.  Therefore the entropy estimate is zero, but random bits
  still get mixed into the pools.
 
 May I ask what the purpose of the patches is when no entropy is implied? I 
 see 
 that the pool is stirred more. But is that really a problem that needs 
 addressing?

For my part, I think the whole business of estimating entropy is
bordering on the esoteric.  If the hash on the output side is any
good, you have a completely unpredictable prng once the entropy pool
is unpredictable.  Additional random bits are nice, but not all that
useful.  Blocking /dev/random based on entropy estimates is likewise
not all that useful.

Key phrase is "once the entropy pool is unpredictable".  So early in
bootup it may make sense to estimate the entropy.  But here the
problem is that you cannot measure entropy, at least not within a
single system and a reasonable amount of time.  That leaves you with a
heuristic that, like all heuristics, is wrong.

I personally care more about generating high-quality randomness as
soon as possible and with low cost to the system.  Feel free to
disagree or set your priorities differently.

 Please, do not get me wrong with the presented criticism here -- the approach 
 in general looks interesting.
 
 However, the following patches makes me wonder big time.
 
   extern void get_random_bytes(void *buf, int nbytes);
  diff --git a/kernel/sched/core.c b/kernel/sched/core.c
  index a88f4a485c5e..7af6389f9b9e 100644
  --- a/kernel/sched/core.c
  +++ b/kernel/sched/core.c
  @@ -2511,6 +2511,7 @@ need_resched:
  rq = cpu_rq(cpu);
  rcu_note_context_switch(cpu);
  prev = rq-curr;
  +   __add_cpu_randomness(__builtin_return_address(1), prev);
  
  schedule_debug(prev);
  
  diff --git a/mm/slab.c b/mm/slab.c
  index eb043bf05f4c..ea5a30d44ad1 100644
  --- a/mm/slab.c
  +++ b/mm/slab.c
  @@ -3587,6 +3587,7 @@ static __always_inline void *__do_kmalloc(size_t size,
  gfp_t flags, trace_kmalloc(caller, ret,
size, cachep-size, flags);
  
  +   add_cpu_randomness(__builtin_return_address(2), ret);
  return ret;
   }
 
 First, the noise source you add is constantly triggered throughout the 
 execution of the kernel. Entropy is very important, we (who are interested in 
 crypto) know that. But how often is entropy needed? Other folks wonder about 
 the speed of the kernel. And with these two patches, every kmalloc and every 
 scheduling invocation now dives into the random.c code to do something. I 
 would think this is a bit expensive, especially to stir the pool without 
 increasing the entropy estimator. I think entropy collection should be 
 performed when it is needed and not throughout the lifetime of the system.

Please measure how expensive it really is.  My measurement gave me a
"doesn't matter" result, surprising as it may seem.

If the cost actually matters, we can either disable or rate-limit the
randomness collection at some point after boot.  But that would bring
us back into the estimation business.

 Second, when I offered my initial patch which independently collects some 
 entropy on the CPU execution timing, I got shot down with one concern raised 
 by Ted, and that was about whether a user can influence the entropy 
 collection 
 process. When I am trying to measure CPU execution timing in the RNG, the 
 concern was raised that the measured timing variations was due to CPU states 
 that were influenced by users. Your patch here clearly hooks into code paths 
 which are definitely affected by user actions. So, this patch therefore would 
 be subject to the same concerns. I personally think that this is not so much 
 an issue, yet it was raised previously.

The nice thing about the random pool is that mixing any amount of
deterministic data into it does not diminish the randomness already in
it.  Given that attribute, I don't understand the concern.

 It seems I have a bad timing, because just two days ago I released a new 
 attempt on the CPU jitter RNG [1] with a new noise source, and I was just 
 about to prepare a release email. With that attempt, both issues raised above 
 are addressed, including a theoretical foundation of the noise source.
 
 [1] http://www.chronox.de/

I am not married to my patch.  If the approach makes sense, let's
merge it.  If the approach does not make sense or there is a better
alternative, drop it on the floor.

The problem I see with your approach is this:
"The only prerequisite is the availability of a high-resolution timer
that is available in modern CPUs."

Given a modern CPU with a high-resolution timer, you will almost
certainly collect enough randomness for good random 

Re: [PATCH,RFC] random: collect cpu randomness

2014-02-02 Thread H. Peter Anvin
On 02/02/2014 05:24 PM, Jörn Engel wrote:
 
 For my part, I think the whole business of estimating entropy is
 bordering on the esoteric.  If the hash on the output side is any
 good, you have a completely unpredictable prng once the entropy pool
 is unpredictable.  Additional random bits are nice, but not all that
 useful.  Blocking /dev/random based on entropy estimates is likewise
 not all that useful.
 
 Key phrase is once the entropy pool is unpredictable.  So early in
 bootup it may make sense to estimate the entropy.  But here the
 problem is that you cannot measure entropy, at least not within a
 single system and a reasonable amount of time.  That leaves you with a
 heuristic that, like all heuristics, is wrong.
 

The entropy bound needs to be a conservative lower bound.  Its main use
is to provide backpressure (should we spend more CPU time producing
entropy) although the forward pressure on /dev/random is potentially
useful for high security applications.

This does NOT mean that zero-credit entropy generation is useless, far
from it.  It just means that we are doing it on an "it can't hurt"
basis, rather than "I know for sure that this is valuable".

-hpa

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ACPI / hotplug / PCI: Scan root bus under the PCI rescan-remove lock

2014-02-02 Thread Rafael J. Wysocki
From: Rafael J. Wysocki rafael.j.wyso...@intel.com

Since acpiphp_check_bridge() called by acpiphp_check_host_bridge()
does things that require PCI rescan-remove locking around it,
make acpiphp_check_host_bridge() use that locking.

Signed-off-by: Rafael J. Wysocki rafael.j.wyso...@intel.com
---

One more thing I overlooked in the PCI rescan-remove locking patchset.

I just found it sitting there in a dark dusty corner and staring at me in
horror when I approached it with a vacuum cleaner ...

Anyway, 3.14-rc2 material on top of patches [1-2/13] from this series:
https://lkml.org/lkml/2014/2/1/123

Thanks,
Rafael

---
 drivers/pci/hotplug/acpiphp_glue.c |4 
 1 file changed, 4 insertions(+)

Index: linux-pm/drivers/pci/hotplug/acpiphp_glue.c
===
--- linux-pm.orig/drivers/pci/hotplug/acpiphp_glue.c
+++ linux-pm/drivers/pci/hotplug/acpiphp_glue.c
@@ -829,7 +829,11 @@ void acpiphp_check_host_bridge(acpi_hand
 
bridge = acpiphp_handle_to_bridge(handle);
if (bridge) {
+   pci_lock_rescan_remove();
+
acpiphp_check_bridge(bridge);
+
+   pci_unlock_rescan_remove();
put_bridge(bridge);
}
 }

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH,RFC] random: collect cpu randomness

2014-02-02 Thread Theodore Ts'o
On Sun, Feb 02, 2014 at 10:25:31PM +0100, Stephan Mueller wrote:
 Second, when I offered my initial patch which independently collects some 
 entropy on the CPU execution timing, I got shot down with one concern raised 
 by Ted, and that was about whether a user can influence the entropy 
 collection 
 process.

Um, that wasn't my concern.  After all, when we sample keyboard timing
while trying to generate a GPG key, of course the user can and does
influence the entropy collection process.

The question is whether an attacker who has deep knowledge of how
the CPU works internally, perhaps made worse with quantization effects
(i.e., it doesn't matter if analog-generated settling time is measured
in microseconds if the output is being clocked out in milliseconds),
such that it is predictable.

I really like Jörn's tests doing repeated boot testing and observing
on a SMP system, the slab allocation pattern is quite deterministic.
So even though the numbers might *look* random, an attacker with deep
knowledge of how the kernel was compiled and what memory allocations
get done during the boot sequence would be able to quite successfully
measure it.

I'm guessing that indeed, on a 4-CPU KVM system, what you're measuring
is when the host OS happens to be scheduling the KVM threads, with
some variability caused by external networking interrupts, etc.  It
would definitely be a good idea to retry that experiment on a real
4-CPU system to see what sort of results you might get.  It might very
well be that the attacker who knows the relative ordering of the
slab/thread activations but for which it's not entirely clear whether
one cpu will be ahead of another, that there is *some* entropy, but
perhaps only a handful bits.  It's the fact that we can't be sure how
much uncertainty there might be with an attacker with very deep
knowledge the CPU which is why Jörn's conservatism of not crediting
the entropy counter is quite understandable.

Of course, this doesn't help someone who is trying to speed up the
time it takes GPG to generate a new key pair.  But in terms of
improving /dev/urandom as it is used by many crypto applications, it
certainly can't hurt.

The real question is how much overhead does it add, and is it worth
it.  Jörn, I take it that was the reason for creating an even faster,
but weaker mixing function?  Was the existing fast mix causing a
measurable overhead, or was this just you being really paranoid about
not adding anything to the various kernel fastpaths?

  - Ted
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:x86/x32] compat: Fix sparse address space warnings

2014-02-02 Thread tip-bot for H. Peter Anvin
Commit-ID:  dce44e03b0a3448ad11ac6c6e0cbe299e0400791
Gitweb: http://git.kernel.org/tip/dce44e03b0a3448ad11ac6c6e0cbe299e0400791
Author: H. Peter Anvin h...@linux.intel.com
AuthorDate: Sun, 2 Feb 2014 17:57:28 -0800
Committer:  H. Peter Anvin h...@linux.intel.com
CommitDate: Sun, 2 Feb 2014 18:00:29 -0800

compat: Fix sparse address space warnings

In compat_sys_old_getrlimit() we pass a kernel pointer to
sys_old_getrlimit() inside a set_fs() bracket.  This is okay, so we
can safely cast the affected pointer to __user.

In compat_clock_nanosleep_restart(), the variable rmtp holds a user
pointer.  Annotate it as such.

Both of these warnings are ancient, but were reported by Fengguang
Wu's test system due to other changes.

Signed-off-by: H. Peter Anvin h...@linux.intel.com
Cc: Toyo Abe to...@mvista.com
Link: http://lkml.kernel.org/n/tip-507h7cq5e45eg6ygtykon...@git.kernel.org
---
 kernel/compat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/compat.c b/kernel/compat.c
index 3afc524..7076b57 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -451,7 +451,7 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int 
resource,
mm_segment_t old_fs = get_fs();
 
set_fs(KERNEL_DS);
-   ret = sys_old_getrlimit(resource, r);
+   ret = sys_old_getrlimit(resource, (struct rlimit __user *)r);
set_fs(old_fs);
 
if (!ret) {
@@ -799,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct 
restart_block *restart)
long err;
mm_segment_t oldfs;
struct timespec tu;
-   struct compat_timespec *rmtp = restart-nanosleep.compat_rmtp;
+   struct compat_timespec __user *rmtp = restart-nanosleep.compat_rmtp;
 
restart-nanosleep.rmtp = (struct timespec __user *) tu;
oldfs = get_fs();
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH 0/3] epoll: read(),write(),ioctl() interface

2014-02-02 Thread Nathaniel Yazdani
Hi everyone,

This patch series adds support for read(), write(), and ioctl() operations
on eventpolls as well as an associated userspace structure to format the
eventpoll entries delivered via read()/write() buffers. The new structure,
struct epoll, differs from struct epoll_event mainly in that it also holds
the associated file descriptor. Using the normal I/O interface to manipulate
eventpolls is much neater than using epoll-specific syscalls while also
allowing for greater flexibility (theoretically, pipes could be used to
filter access). Specifically, write() creates, modifies, and/or removes event
entries stored in the supplied buffer, using the userspace identifier to
check whether an entry exists and removing it if no events are set to trigger
it, while read() simply waits for enough events to fill the provided buffer.
As timeout control is essential for polling to be practical, ioctl() is used
to configure an optional timeout, which is infinite by default.

 Documentation/ioctl/ioctl-number.txt |   1 +
 fs/eventpoll.c   | 534 ---
 include/uapi/linux/eventpoll.h   |  10 +
 3 files changed, 384 insertions(+), 161 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH 2/3] epoll: add struct epoll ioctl() commands

2014-02-02 Thread Nathaniel Yazdani
Add a new 'struct epoll' to the userspace eventpoll interface. Buffers
supplied to read() & write() calls on eventpolls are interpreted as
arrays of this structure. The new structure's only functional difference
from epoll_event is it also holds the associated file descriptor (needed
for write() to properly create events but useful information in general).
Also define the ioctl() command macros to set & get the timeout of an
eventpoll.

Signed-off-by: Nathaniel Yazdani n1ght.4nd@gmail.com
---
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index bc81fb2..73f817c 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -56,11 +56,21 @@
 #define EPOLL_PACKED
 #endif
 
+/* ioctl() requests */
+#define EPIOC_GETTIMEOUT   _IOR('$', 0x10, int)
+#define EPIOC_SETTIMEOUT   _IOW('$', 0x11, int)
+
 struct epoll_event {
__u32 events;
__u64 data;
 } EPOLL_PACKED;
 
+struct epoll {
+   int ep_fildes; /* file descriptor */
+   int ep_events; /* triggering events */
+   long long ep_ident; /* entry ID (cf. epoll_event-data) */
+} EPOLL_PACKED; /* A.K.A. epe for eventpoll entry */
+
 #ifdef CONFIG_PM_SLEEP
 static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
 {
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH 1/3] epoll: reserve small ioctl() space

2014-02-02 Thread Nathaniel Yazdani
Reserve a small ioctl() command space for eventpolls, of which only two
are currently utilized.

Signed-off-by: Nathaniel Yazdani n1ght.4nd@gmail.com
---
diff --git a/Documentation/ioctl/ioctl-number.txt 
b/Documentation/ioctl/ioctl-number.txt
index d7e43fa..3c6f8ac 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -81,6 +81,7 @@ Code  Seq#(hex)   Include FileComments
 0x22   all scsi/sg.h
 '#'00-3F   IEEE 1394 Subsystem Block for the entire subsystem
 '$'00-0F   linux/perf_counter.h, linux/perf_event.h
+'$'10-1F   include/uapi/linux/eventpoll.h
 ''00-07   drivers/firewire/nosy-user.h
 '1'00-1F   linux/timepps.h   PPS kit from Ulrich Windl

ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH 3/3] epoll: add read()/write()/ioctl() operations

2014-02-02 Thread Nathaniel Yazdani
The eventpoll implementation is largely interface-agnostic, aside from the
userspace structure format and epoll_ctl(). Particularly as each field of the
structure is handled independently, replacing usage of epoll_event internally
was straightforward and clarifies the code some. As for epoll_ctl(), its
functionality was moved into the new ep_eventpoll_write() function, and
epoll_ctl() just hands off its work to it. The ep_eventpoll_read() function is
very similar to epoll_wait(), which remains independent but shares the vast
majority of code for minimal redundancy. Finally, ep_eventpoll_ioctl() is a
simple interface to configure a default timeout for read() operations on the
given eventpoll.

Signed-off-by: Nathaniel Yazdani n1ght.4nd@gmail.com
---
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index af90312..7f0ce59 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -168,8 +168,11 @@ struct epitem {
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
 
-   /* The structure that describe the interested events and the source fd 
*/
-   struct epoll_event event;
+   /* Interested events */
+   int events;
+
+   /* The userspace identifier for this entry */
+   long long ident;
 };
 
 /*
@@ -216,6 +219,9 @@ struct eventpoll {
 
struct file *file;
 
+   /* Default timeout */
+   int timeout;
+
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
@@ -251,6 +257,13 @@ struct ep_send_events_data {
struct epoll_event __user *events;
 };
 
+/* ep_scan_ready_list() callback data for ep_send_epes() */
+struct ep_send_epes_data
+{
+   int max;
+   struct epoll __user *epes;
+};
+
 /*
  * Configuration options available inside /proc/sys/fs/epoll/
  */
@@ -795,9 +808,9 @@ static int ep_eventpoll_release(struct inode *inode, struct 
file *file)
 
 static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
 {
-   pt-_key = epi-event.events;
+   pt-_key = epi-events;
 
-   return epi-ffd.file-f_op-poll(epi-ffd.file, pt)  epi-event.events;
+   return epi-ffd.file-f_op-poll(epi-ffd.file, pt)  epi-events;
 }
 
 static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
@@ -881,8 +894,8 @@ static int ep_show_fdinfo(struct seq_file *m, struct file 
*f)
struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
 
ret = seq_printf(m, tfd: %8d events: %8x data: %16llx\n,
-epi-ffd.fd, epi-event.events,
-(long long)epi-event.data);
+epi-ffd.fd, epi-events,
+(long long)epi-ident);
if (ret)
break;
}
@@ -892,6 +905,15 @@ static int ep_show_fdinfo(struct seq_file *m, struct file 
*f)
 }
 #endif
 
+static ssize_t ep_eventpoll_write(struct file *file, const char __user *buf,
+ size_t bufsz, loff_t *pos);
+
+static ssize_t ep_eventpoll_read(struct file *file, char __user *buf,
+size_t bufsz, loff_t *pos);
+
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+  unsigned long arg);
+
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
 #ifdef CONFIG_PROC_FS
@@ -899,6 +921,9 @@ static const struct file_operations eventpoll_fops = {
 #endif
.release= ep_eventpoll_release,
.poll   = ep_eventpoll_poll,
+   .read   = ep_eventpoll_read,
+   .write  = ep_eventpoll_write,
+   .unlocked_ioctl = ep_eventpoll_ioctl,
.llseek = noop_llseek,
 };
 
@@ -1025,7 +1050,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned 
mode, int sync, void *k
 * EPOLLONESHOT bit that disables the descriptor when an event is 
received,
 * until the next EPOLL_CTL_MOD will be issued.
 */
-   if (!(epi-event.events  ~EP_PRIVATE_BITS))
+   if (!(epi-events  ~EP_PRIVATE_BITS))
goto out_unlock;
 
/*
@@ -1034,7 +1059,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned 
mode, int sync, void *k
 * callback. We need to be able to handle both cases here, hence the
 * test for key != NULL before the event match test.
 */
-   if (key  !((unsigned long) key  epi-event.events))
+   if (key  !((unsigned long) key  epi-events))
goto out_unlock;
 
/*
@@ -1264,7 +1289,7 @@ static noinline void ep_destroy_wakeup_source(struct 
epitem *epi)
 /*
  * Must be called with mtx held.
  */
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+static int ep_insert(struct eventpoll *ep, long long ident, int events,
 struct file *tfile, int fd, int full_check)
 {

Re: [livelock, 3.13.0] livelock when run out of swap space

2014-02-02 Thread Hugh Dickins
On Mon, 3 Feb 2014, Dave Chinner wrote:
 Hi folks,
 
 I just had a test machine livelock when running a concurrent rm -rf
 workload on an XFS filesystem with 64k directory block sizes. The
 buffer allocation code started reporting this 5 times a second:
 
 XFS: possible memory allocation deadlock in kmem_alloc (mode:0x8250)
 
 Which is in GFP_NOFS|GFP_ZERO context. It is likely to have been a
 high order allocation (up to 64k), but there was still plenty of free
 memory available (2.8GB of 16GB):
 
 $ free
  total   used   free sharedbuffers cached
 Mem:  16424296   135937322830564  0136   3184
 -/+ buffers/cache:   135904122833884
 Swap:   497976 497976  0
 $
 
 But clearly there was no page cache being used. All of the memory in
 use was in the inode/dentry caches:
 
   OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME
   9486678 9483271  99%1.19K 364874   26  11675968K xfs_inode
   4820508 4820508 100%0.21K 130284   37   1042272K dentry
   4820224 4820224 100%0.06K  75316   64301264K kmalloc-64
 
 The issue is that memory allocation was not making progress - the
 shrinkers were not doing anything because they were under GFP_NOFS
 allocation context, and kswapd was never woken to take over. The
 system was completely out of swap space, and all the CPU was being
 burnt in this function:
 
44.91%  [kernel]  [k] scan_swap_map
 
 The typical stack trace of a looping memory allocation is this:
 
 [211699.924006] CPU: 2 PID: 21939 Comm: rm Not tainted 3.13.0-dgc+ #172
 [211699.924006] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
 [211699.924006] task: 88041a7dde40 ti: 8803bbeec000 task.ti: 
 8803bbeec000
 [211699.924006] RIP: 0010:[81187dd8]  [81187dd8] 
 scan_swap_map+0x118/0x520
 [211699.924006] RSP: 0018:8803bbeed508  EFLAGS: 0297
 [211699.924006] RAX: a1ba RBX: 0032 RCX: 
 
 [211699.924006] RDX: 0001 RSI: 0001e64e RDI: 
 00019def
 [211699.924006] RBP: 8803bbeed558 R08: 002c6ba0 R09: 
 
 [211699.924006] R10: 57ffb90ace3d4f80 R11: 00019def R12: 
 88041a682900
 [211699.924006] R13: 01ff R14: 0040 R15: 
 88041a6829a0
 [211699.924006] FS:  7fae682c8700() GS:88031bc0() 
 knlGS:
 [211699.924006] CS:  0010 DS:  ES:  CR0: 8005003b
 [211699.924006] CR2: 004353b0 CR3: 0002ea498000 CR4: 
 06e0
 [211699.924006] Stack:
 [211699.924006]  8803bbeed538 88031a9dc720 a1ba 
 4893
 [211699.924006]   88041a682900  
 0001
 [211699.924006]   88041a6829a0 8803bbeed598 
 8118837f
 [211699.924006] Call Trace:
 [211699.924006]  [8118837f] get_swap_page+0xef/0x1e0
 [211699.924006]  [81184e34] add_to_swap+0x24/0x70
 [211699.924006]  [8115f110] shrink_page_list+0x300/0xa20
 [211699.924006]  [81169089] ? __mod_zone_page_state+0x49/0x50
 [211699.924006]  [8116a3b9] ? wait_iff_congested+0xa9/0x150
 [211699.924006]  [8115fe03] shrink_inactive_list+0x243/0x480
 [211699.924006]  [811606f1] shrink_lruvec+0x371/0x670
 [211699.924006]  [81cdb4ce] ? _raw_spin_unlock+0xe/0x10
 [211699.924006]  [81160dea] do_try_to_free_pages+0x11a/0x360
 [211699.924006]  [81161220] try_to_free_pages+0x110/0x190
 [211699.924006]  [81156422] __alloc_pages_nodemask+0x5a2/0x8a0
 [211699.924006]  [8118fac2] alloc_pages_current+0xb2/0x170
 [211699.924006]  [81151bde] __get_free_pages+0xe/0x50
 [211699.924006]  [8116d199] kmalloc_order_trace+0x39/0xb0
 [211699.924006]  [810cf4c3] ? finish_wait+0x63/0x80
 [211699.924006]  [81197156] __kmalloc+0x176/0x180
 [211699.924006]  [810cf520] ? __init_waitqueue_head+0x40/0x40
 [211699.924006]  [814a74f7] kmem_alloc+0x77/0xf0
 [211699.924006]  [814feb54] xfs_log_commit_cil+0x3c4/0x5a0
 [211699.924006]  [814a6be3] xfs_trans_commit+0xc3/0x2d0
 [211699.924006]  [814e913e] xfs_remove+0x3be/0x440
 [211699.924006]  [811b7d8d] ? __d_lookup+0x11d/0x170
 [211699.924006]  [8149b842] xfs_vn_unlink+0x52/0xa0
 [211699.924006]  [811acc22] vfs_unlink+0xf2/0x160
 [211699.924006]  [811acef0] do_unlinkat+0x260/0x2a0
 [211699.924006]  [811b003b] SyS_unlinkat+0x1b/0x40
 [211699.924006]  [81ce3ea9] system_call_fastpath+0x16/0x1b
 
 i.e. trying to do memory allocation during a transaction commit in
 XFS, and that is looping in kmem_alloc().
 
 The problem in this case is that kswapd was not being started to
 free slab cache memory (i.e. to handle the deferred GFP_NOFS slab
 reclaim).  It stayed in the livelock state for over an hour before I
 broke it by running echo 2  

Re: [PATCH 1/2] irq_work: allow certain work in hard irq context

2014-02-02 Thread Mike Galbraith
On Sun, 2014-02-02 at 21:10 +0100, Sebastian Andrzej Siewior wrote:

 So CPU5 & CPU52 were eating 100% CPU doing nothing instead of running
 cc1 & objdump right?

Yeah.

 According to the backtrace both of them are trying to access the
 per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail
 to get the timer lock here. They shouldn't spin there for minutes, I
 have no idea why they did so…

I dumped it for later-guy.. but he tends to get busy doing other crap,
and just whacks my carefully saved data ;-)

 I guess this problem does not occur without -RT and before that patch
 you saw only that one warning from can_stop_full_tick()?

I didn't try it without -RT, and yes, without, you just get the warning.

-Mike

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 00/16] drm/nouveau: initial support for GK20A (Tegra K1)

2014-02-02 Thread Alexandre Courbot

On 02/03/2014 04:10 AM, Ilia Mirkin wrote:

Hi Alexandre,

On Fri, Jan 31, 2014 at 10:16 PM, Alexandre Courbot acour...@nvidia.com wrote:

I guess my email address might surprise some of you, so let me anticipate some
questions you might have. :P Yes, this work is endorsed by NVIDIA. Several other
NVIDIAns (CC'd), including core GPU experts, have provided significant technical
guidance and will continue their involvement. Special thanks go to Terje
Bergstrom and Ken Adams for their invaluable GPU expertise, and Thierry Reding
(at FOSDEM this weekend) for help with debugging and user-space testing.

Let me also stress that although very exciting, this effort is still
experimental, so I would like to make sure that nobody makes excessive
expectations based on these few patches. The scope of this work is strictly
limited to Tegra (although given the similarities desktop GPU support will
certainly benefit from it indirectly), and we do not have any plan to work on
user-space support. So do not uninstall that proprietary driver just yet. ;)

With this being clarified, we are looking forward to getting your feedback and
working with you guys to bring and improve Tegra K1 support into Nouveau! :)


I've sent a couple of fairly trivial comments, as you saw, and I
suspect that others with a better understanding of the guts will have
more substantial architectural feedback, esp after the weekend/FOSDEM.
However, since no one's said it already -- welcome to Nouveau!


Thanks! ^_^v

One beginner question: is it appropriate to send kernel patches to the 
nouveau list in addition to dri-devel? The moderation messages I receive 
make me think that this list might rather be intended for general 
discussion.



 From the looks of it, you could bring up a full open-source stack with
your patches (i.e. Xorg + nouveau DDX + mesa) and use PRIME to render
stuff (assuming the actual display hw has an X ddx).  Although I
suspect that you're going to want to use your own drivers. Still a
little curious if you've tried the open-source stack and whether it
worked. [Not sure what the status of render-node support is in
mesa, but perhaps it's enough to try running piglit tests, if you
can't get X going with the display HW.]


We are still testing things at libdrm level, but are eventually 
interested in bringing up the existing open-source stack. Our guess (and 
hope) is that it will work nicely almost as-is, minus the fact that the 
display hardware is not handled by Nouveau and we only support render 
nodes (I have yet to look at what the state of render nodes in Mesa is).


For X, Thierry is IIUC working on the display driver, and at some point 
these efforts should join to connect tegradrm and Nouveau using PRIME. 
We are not quite there yet, and since we are working with limited 
resources it will likely require some time, but the fact we could bring 
up a (seemingly) working Nouveau kernel driver with so little code is 
encouraging.


Thanks,
Alex.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [git pull] Please pull powerpc.git next branch

2014-02-02 Thread Michael Ellerman
On Wed, 2014-01-29 at 13:29 +1100, Alistair Popple wrote:
 Looks like I missed the dart iommu code when changing the iommu table
 initialisation. The patch below should fix it, would you mind testing
 it Ben? Thanks.

Any reason not to add the following to save ourselves in future?

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index d773dd4..6ab7b53 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -657,6 +657,8 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid)
unsigned int i;
struct iommu_pool *p;
 
+   BUG_ON(!tbl-it_page_shift);
+
/* number of bytes needed for the bitmap */
sz = BITS_TO_LONGS(tbl-it_size) * sizeof(unsigned long);
 

cheers



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] pci: fix kernel-doc notation warning

2014-02-02 Thread Randy Dunlap
From: Randy Dunlap rdun...@infradead.org

Fix a blank kernel-doc line to have an asterisk instead of
being totally empty.  This fixes the kernel-doc warning:

Warning(drivers/pci/msi.c:962): bad line: 

Signed-off-by: Randy Dunlap rdun...@infradead.org
---
 drivers/pci/msi.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- lnx-314-rc1.orig/drivers/pci/msi.c
+++ lnx-314-rc1/drivers/pci/msi.c
@@ -959,7 +959,7 @@ EXPORT_SYMBOL(pci_disable_msi);
 /**
  * pci_msix_vec_count - return the number of device's MSI-X table entries
  * @dev: pointer to the pci_dev data structure of MSI-X device function
-
+ *
  * This function returns the number of device's MSI-X table entries and
  * therefore the number of MSI-X vectors device is capable of sending.
  * It returns a negative errno if the device is not capable of sending MSI-X
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 00/16] drm/nouveau: initial support for GK20A (Tegra K1)

2014-02-02 Thread Ilia Mirkin
On Sun, Feb 2, 2014 at 9:44 PM, Alexandre Courbot acour...@nvidia.com wrote:
 One beginner question: is it appropriate to send kernel patches to the
 nouveau list in addition to dri-devel? The moderation messages I receive
 make me think that this list might rather be intended for general
 discussion.

I usually do. The main thing is to make sure that they're To: Ben,
since he's the one who will ultimately be picking them up. I think
that if you're not subscribed, all the lists.freedesktop.org lists
moderate you, but dri-devel is configured not to tell you about it.
Also I've been getting bounce messages from nouveau@ complaining of
too many cc's and so it's getting auto-moderated -- not sure who, if
anyone, is an admin of the nouveau list. Hopefully someone :)

  -ilia
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: Tree for Feb 3

2014-02-02 Thread Stephen Rothwell
Hi all,

This tree fails (more than usual) the powerpc allyesconfig build.

Changes since 20140131:

Dropped tree: btrfs (needs cleaning up)

The powerpc tree still had its build failure.

The btrfs tree had lots of conflicts against Linus' tree so I dropped it
for today.

Non-merge commits (relative to Linus' tree): 983
 1521 files changed, 21062 insertions(+), 7748 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use git pull
to do so as that will try to merge the new linux-next release with the
old one.  You should use git fetch as mentioned in the FAQ on the wiki
(see below).

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log files
in the Next directory.  Between each merge, the tree was built with
a ppc64_defconfig for powerpc and an allmodconfig for x86_64 and a
multi_v7_defconfig for arm. After the final fixups (if any), it is also
built with powerpc allnoconfig (32 and 64 bit), ppc44x_defconfig and
allyesconfig (minus CONFIG_PROFILE_ALL_BRANCHES - this fails its final
link) and i386, sparc, sparc64 and arm defconfig. These builds also have
CONFIG_ENABLE_WARN_DEPRECATED, CONFIG_ENABLE_MUST_CHECK and
CONFIG_DEBUG_INFO disabled when necessary.

Below is a summary of the state of the merge.

I am currently merging 208 trees (counting Linus' and 28 trees of patches
pending for Linus' tree).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

There is a wiki covering stuff to do with linux-next at
http://linux.f-seidel.de/linux-next/pmwiki/ .  Thanks to Frank Seidel.

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au

$ git checkout master
$ git reset --hard stable
Merging origin/master (602456bf1699 Merge branch 'hwmon-for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jdelvare/staging)
Merging fixes/master (b0031f227e47 Merge tag 's2mps11-build' of 
git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator)
Merging kbuild-current/rc-fixes (19514fc665ff arm, kbuild: make make install 
not depend on vmlinux)
Merging arc-current/for-curr (7e22e91102c6 Linux 3.13-rc8)
Merging arm-current/fixes (d326b65c57d6 ARM: fix building with gcc 4.6.4)
Merging m68k-current/for-linus (56931d73697c m68k/mac: Make SCC reset work more 
reliably)
Merging metag-fixes/fixes (3b2f64d00c46 Linux 3.11-rc2)
Merging powerpc-merge/merge (b3084f4db3ae powerpc/thp: Fix crash on mremap)
Merging sparc/master (9b0cd304f26b Merge branch 'drm-next' of 
git://people.freedesktop.org/~airlied/linux)
Merging net/master (4fe46b9a4d0b vxlan: remove extra newline after function 
definition)
Merging ipsec/master (965cdea82569 dccp: catch failed request_module call in 
dccp_probe init)
Merging sound-current/for-linus (75fae117a5db ALSA: hda/hdmi - allow PIN_OUT to 
be dynamically enabled)
Merging pci-current/for-linus (f0b75693cbb2 MAINTAINERS: Add DesignWare, i.MX6, 
Armada, R-Car PCI host maintainers)
Merging wireless/master (7d0d46da750a Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net)
Merging driver-core.current/driver-core-linus (90804ed61f24 Merge branch 
'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs)
Merging tty.current/tty-linus (413541dd66d5 Linux 3.13-rc5)
Merging usb.current/usb-linus (90804ed61f24 Merge branch 'for_linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs)
Merging staging.current/staging-linus (77d143de7581 Merge branch 'for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/rw/uml)
Merging char-misc.current/char-misc-linus (90804ed61f24 Merge branch 
'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs)
Merging input-current/for-linus (55df811f2066 Merge branch 'next' into 
for-linus)
Merging md-current/for-linus (d47648fcf061 raid5: avoid finding discard 
stripe)
Merging crypto-current/master (ee97dc7db4cb crypto: s390 - fix des and des3_ede 
ctr concurrency issue)
Merging ide/master (9b0cd304f26b Merge branch 'drm-next' of 
git://people.freedesktop.org/~airlied/linux)
Merging dwmw2/master (5950f0803ca9 pcmcia: remove RPX board stuff)
Merging devicetree-current/devicetree/merge (6f041e99fc7b of: Fix NULL 
dereference in unflatten_and_copy())
Merging rr-fixes/fixes (7122c3e9154b scripts/link-vmlinux.sh: only filter 
kernel symbols for arm)
Merging mfd-fixes/master (73beb63d290f mfd: 

[PATCH v3 6/8] ARM: dts: sun7i: cubieboard2: Enable GMAC instead of EMAC

2014-02-02 Thread Chen-Yu Tsai
GMAC has better performance and fewer hardware issues.
Use the GMAC in MII mode for ethernet instead of the EMAC.

Signed-off-by: Chen-Yu Tsai w...@csie.org
---
 arch/arm/boot/dts/sun7i-a20-cubieboard2.dts | 27 ---
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/arm/boot/dts/sun7i-a20-cubieboard2.dts 
b/arch/arm/boot/dts/sun7i-a20-cubieboard2.dts
index 5c51cb8..7bf4935 100644
--- a/arch/arm/boot/dts/sun7i-a20-cubieboard2.dts
+++ b/arch/arm/boot/dts/sun7i-a20-cubieboard2.dts
@@ -19,21 +19,6 @@
compatible = cubietech,cubieboard2, allwinner,sun7i-a20;
 
soc@01c0 {
-   emac: ethernet@01c0b000 {
-   pinctrl-names = default;
-   pinctrl-0 = emac_pins_a;
-   phy = phy1;
-   status = okay;
-   };
-
-   mdio@01c0b080 {
-   status = okay;
-
-   phy1: ethernet-phy@1 {
-   reg = 1;
-   };
-   };
-
pinctrl@01c20800 {
led_pins_cubieboard2: led_pins@0 {
allwinner,pins = PH20, PH21;
@@ -60,6 +45,18 @@
pinctrl-0 = i2c1_pins_a;
status = okay;
};
+
+   gmac: ethernet@01c5 {
+   pinctrl-names = default;
+   pinctrl-0 = gmac_pins_mii_a;
+   phy = phy1;
+   phy-mode = mii;
+   status = okay;
+
+   phy1: ethernet-phy@1 {
+   reg = 1;
+   };
+   };
};
 
leds {
-- 
1.9.rc1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 5/8] ARM: dts: sun7i: cubietruck: Enable the GMAC

2014-02-02 Thread Chen-Yu Tsai
The CubieTruck uses the GMAC with an RGMII phy.

Signed-off-by: Chen-Yu Tsai w...@csie.org
---
 arch/arm/boot/dts/sun7i-a20-cubietruck.dts | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/arm/boot/dts/sun7i-a20-cubietruck.dts 
b/arch/arm/boot/dts/sun7i-a20-cubietruck.dts
index f9dcb61..025ce52 100644
--- a/arch/arm/boot/dts/sun7i-a20-cubietruck.dts
+++ b/arch/arm/boot/dts/sun7i-a20-cubietruck.dts
@@ -51,6 +51,18 @@
pinctrl-0 = i2c2_pins_a;
status = okay;
};
+
+   gmac: ethernet@01c5 {
+   pinctrl-names = default;
+   pinctrl-0 = gmac_pins_rgmii_a;
+   phy = phy1;
+   phy-mode = rgmii;
+   status = okay;
+
+   phy1: ethernet-phy@1 {
+   reg = 1;
+   };
+   };
};
 
leds {
-- 
1.9.rc1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 8/8] ARM: dts: sun7i: Add ethernet alias for GMAC

2014-02-02 Thread Chen-Yu Tsai
U-Boot will insert MAC address into the device tree image.
It looks up ethernet[0-5] aliases to find the ethernet nodes.
Alias GMAC as ethernet0, as it is the only ethernet controller used.

Signed-off-by: Chen-Yu Tsai w...@csie.org
---
 arch/arm/boot/dts/sun7i-a20.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi
index 65fb8d0..c48fb11 100644
--- a/arch/arm/boot/dts/sun7i-a20.dtsi
+++ b/arch/arm/boot/dts/sun7i-a20.dtsi
@@ -17,7 +17,7 @@
interrupt-parent = gic;
 
aliases {
-   ethernet0 = emac;
+   ethernet0 = gmac;
};
 
cpus {
-- 
1.9.rc1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 4/8] ARM: dts: sun7i: Add pin muxing options for the GMAC

2014-02-02 Thread Chen-Yu Tsai
The A20 has EMAC and GMAC muxed on the same pins.
Add pin sets with gmac function for MII and RGMII mode to the DTSI.

Signed-off-by: Chen-Yu Tsai w...@csie.org
---
 arch/arm/boot/dts/sun7i-a20.dtsi | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi
index 5fbac23..65fb8d0 100644
--- a/arch/arm/boot/dts/sun7i-a20.dtsi
+++ b/arch/arm/boot/dts/sun7i-a20.dtsi
@@ -469,6 +469,32 @@
allwinner,drive = 0;
allwinner,pull = 0;
};
+
+   gmac_pins_mii_a: gmac_mii@0 {
+   allwinner,pins = PA0, PA1, PA2,
+   PA3, PA4, PA5, PA6,
+   PA7, PA8, PA9, PA10,
+   PA11, PA12, PA13, PA14,
+   PA15, PA16;
+   allwinner,function = gmac;
+   allwinner,drive = 0;
+   allwinner,pull = 0;
+   };
+
+   gmac_pins_rgmii_a: gmac_rgmii@0 {
+   allwinner,pins = PA0, PA1, PA2,
+   PA3, PA4, PA5, PA6,
+   PA7, PA8, PA10,
+   PA11, PA12, PA13,
+   PA15, PA16;
+   allwinner,function = gmac;
+   /*
+* data lines in RGMII mode use DDR mode
+* and need a higher signal drive strength
+*/
+   allwinner,drive = 3;
+   allwinner,pull = 0;
+   };
};
 
timer@01c20c00 {
-- 
1.9.rc1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 7/8] ARM: dts: sun7i: olinuxino-micro: Enable GMAC instead of EMAC

2014-02-02 Thread Chen-Yu Tsai
GMAC has better performance and fewer hardware issues.
Use the GMAC in MII mode for ethernet instead of the EMAC.

Signed-off-by: Chen-Yu Tsai w...@csie.org
---
 arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts | 27 +++--
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts 
b/arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts
index ead3013..b02a796 100644
--- a/arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts
+++ b/arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts
@@ -19,21 +19,6 @@
compatible = olimex,a20-olinuxino-micro, allwinner,sun7i-a20;
 
soc@01c0 {
-   emac: ethernet@01c0b000 {
-   pinctrl-names = default;
-   pinctrl-0 = emac_pins_a;
-   phy = phy1;
-   status = okay;
-   };
-
-   mdio@01c0b080 {
-   status = okay;
-
-   phy1: ethernet-phy@1 {
-   reg = 1;
-   };
-   };
-
pinctrl@01c20800 {
led_pins_olinuxino: led_pins@0 {
allwinner,pins = PH2;
@@ -78,6 +63,18 @@
pinctrl-0 = i2c2_pins_a;
status = okay;
};
+
+   gmac: ethernet@01c5 {
+   pinctrl-names = default;
+   pinctrl-0 = gmac_pins_mii_a;
+   phy = phy1;
+   phy-mode = mii;
+   status = okay;
+
+   phy1: ethernet-phy@1 {
+   reg = 1;
+   };
+   };
};
 
leds {
-- 
1.9.rc1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH,RFC] random: collect cpu randomness

2014-02-02 Thread Jörn Engel
On Sun, 2 February 2014 20:39:22 -0500, Theodore Ts'o wrote:
 
 The real question is how much overhead does it add, and is it worth
 it.  Jörn, I take it that was the reason for creating an even faster,
 but weaker mixing function?  Was the existing fast mix causing a
 measurable overhead, or was this your just being really paranoid about
 not adding anything to the various kernel fastpaths?

It was paranoia.  And I am still somewhat paranoid and don't trust my
benchmark results yet.  Maybe on an 1024-CPU Altix with a 100k-thread
workload the overhead is too much.  Just because I couldn't measure a
difference on my wimpy notebook does not mean much.

Jörn

--
One of the painful things about our time is that those who feel certainty
are stupid, and those with any imagination and understanding are filled
with doubt and indecision.
-- Bertrand Russell
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 0/8] Add Allwinner A20 GMAC ethernet support

2014-02-02 Thread Chen-Yu Tsai
Hi,

This is the remaining part of v3 of the Allwinner A20 GMAC glue layer for
stmmac. The stmmac driver changes have been merged through net-next. The
remaining bits are clock and DT patches. The patches should be applied
over my clock renaming patches.

The Allwinner A20 SoC integrates an early version of dwmac
IP from Synopsys. On top of that is a hardware glue layer.
This layer needs to be configured before the dwmac can be
used.

Part of the glue layer is a clock mux, which controls the
source and direction of the TX clock used by GMAC.

Changes since v2:

  * Added more comments on GMAC clock driver
  * Drop CLK_SET_PARENT_GATE in GMAC clock driver
  * Use macro for max clock parents
  * Line wrapping

Changes since v1:

  * Added optional reset control to stmmac driver core
  * Added non CONFIG_RESET_CONTROLLER routines for the above change
  * Extended callback API, as discussed with Srinivas
  * Used new stmmac_of_data to pass features and callbacks,
instead of platform data, as discussed
  * Separated clock module glue layer into clock driver

Cheers,
ChenYu


Chen-Yu Tsai (8):
  clk: sunxi: Add Allwinner A20/A31 GMAC clock unit
  ARM: dts: sun7i: Add GMAC clock node to sun7i DTSI
  ARM: dts: sun7i: Add GMAC controller node to sun7i DTSI
  ARM: dts: sun7i: Add pin muxing options for the GMAC
  ARM: dts: sun7i: cubietruck: Enable the GMAC
  ARM: dts: sun7i: cubieboard2: Enable GMAC instead of EMAC
  ARM: dts: sun7i: olinuxino-micro: Enable GMAC instead of EMAC
  ARM: dts: sun7i: Add ethernet alias for GMAC

 Documentation/devicetree/bindings/clock/sunxi.txt | 26 +++
 arch/arm/boot/dts/sun7i-a20-cubieboard2.dts   | 27 
 arch/arm/boot/dts/sun7i-a20-cubietruck.dts| 12 
 arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts   | 27 
 arch/arm/boot/dts/sun7i-a20.dtsi  | 71 ++-
 drivers/clk/sunxi/clk-sunxi.c | 83 +++
 6 files changed, 215 insertions(+), 31 deletions(-)

-- 
1.9.rc1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/8] clk: sunxi: Add Allwinner A20/A31 GMAC clock unit

2014-02-02 Thread Chen-Yu Tsai
The Allwinner A20/A31 clock module controls the transmit clock source
and interface type of the GMAC ethernet controller. Model this as
a single clock for GMAC drivers to use.

Signed-off-by: Chen-Yu Tsai w...@csie.org
---
 Documentation/devicetree/bindings/clock/sunxi.txt | 26 +++
 drivers/clk/sunxi/clk-sunxi.c | 83 +++
 2 files changed, 109 insertions(+)

diff --git a/Documentation/devicetree/bindings/clock/sunxi.txt 
b/Documentation/devicetree/bindings/clock/sunxi.txt
index 0cf679b..f43b4c0 100644
--- a/Documentation/devicetree/bindings/clock/sunxi.txt
+++ b/Documentation/devicetree/bindings/clock/sunxi.txt
@@ -37,6 +37,7 @@ Required properties:
allwinner,sun6i-a31-apb2-gates-clk - for the APB2 gates on A31
allwinner,sun4i-mod0-clk - for the module 0 family of clocks
allwinner,sun7i-a20-out-clk - for the external output clocks
+   allwinner,sun7i-a20-gmac-clk - for the GMAC clock module on A20/A31
 
 Required properties for all clocks:
 - reg : shall be the control register address for the clock.
@@ -50,6 +51,9 @@ Required properties for all clocks:
If the clock module only has one output, the name shall be the
module name.
 
+For allwinner,sun7i-a20-gmac-clk, the parent clocks shall be fixed rate
+dummy clocks at 25 MHz and 125 MHz, respectively. See example.
+
 Clock consumers should specify the desired clocks they use with a
 clocks phandle cell. Consumers that are using a gated clock should
 provide an additional ID in their clock property. This ID is the
@@ -96,3 +100,25 @@ mmc0_clk: clk@01c20088 {
clocks = osc24M, pll6 1, pll5 1;
clock-output-names = mmc0;
 };
+
+mii_phy_tx_clk: clk@2 {
+   #clock-cells = 0;
+   compatible = fixed-clock;
+   clock-frequency = 2500;
+   clock-output-names = mii_phy_tx;
+};
+
+gmac_int_tx_clk: clk@3 {
+   #clock-cells = 0;
+   compatible = fixed-clock;
+   clock-frequency = 12500;
+   clock-output-names = gmac_int_tx;
+};
+
+gmac_clk: clk@01c20164 {
+   #clock-cells = 0;
+   compatible = allwinner,sun7i-a20-gmac-clk;
+   reg = 0x01c20164 0x4;
+   clocks = mii_phy_tx_clk, gmac_int_tx_clk;
+   clock-output-names = gmac;
+};
diff --git a/drivers/clk/sunxi/clk-sunxi.c b/drivers/clk/sunxi/clk-sunxi.c
index 736fb60..0b361d2 100644
--- a/drivers/clk/sunxi/clk-sunxi.c
+++ b/drivers/clk/sunxi/clk-sunxi.c
@@ -379,6 +379,89 @@ static void sun7i_a20_get_out_factors(u32 *freq, u32 
parent_rate,
 
 
 /**
+ * sun7i_a20_gmac_clk_setup - Setup function for A20/A31 GMAC clock module
+ *
+ * This clock looks something like this
+ *   
+ *  MII TX clock from PHY -|____| to GMAC core
+ *  GMAC Int. RGMII TX clk |___\__/__gate---| to PHY
+ *  Ext. 125MHz RGMII TX clk --|__divider__/|
+ *  ||
+ *
+ * The external 125 MHz reference is optional, i.e. GMAC can use its
+ * internal TX clock just fine. The A31 GMAC clock module does not have
+ * the divider controls for the external reference.
+ *
+ * To keep it simple, let the GMAC use either the MII TX clock for MII mode,
+ * and its internal TX clock for GMII and RGMII modes. The GMAC driver should
+ * select the appropriate source and gate/ungate the output to the PHY.
+ *
+ * Only the GMAC should use this clock. Altering the clock so that it doesn't
+ * match the GMAC's operation parameters will result in the GMAC not being
+ * able to send traffic out. The GMAC driver should set the clock rate and
+ * enable/disable this clock to configure the required state. The clock
+ * driver then responds by auto-reparenting the clock.
+ */
+
+#define SUN7I_A20_GMAC_GPIT2
+#define SUN7I_A20_GMAC_MASK0x3
+#define SUN7I_A20_GMAC_MAX_PARENTS 2
+
+static void __init sun7i_a20_gmac_clk_setup(struct device_node *node)
+{
+   struct clk *clk;
+   struct clk_mux *mux;
+   struct clk_gate *gate;
+   const char *clk_name = node-name;
+   const char *parents[SUN7I_A20_GMAC_MAX_PARENTS];
+   void *reg;
+   int i = 0;
+
+   /* allocate mux and gate clock structs */
+   mux = kzalloc(sizeof(struct clk_mux), GFP_KERNEL);
+   if (!mux)
+   return;
+   gate = kzalloc(sizeof(struct clk_gate), GFP_KERNEL);
+   if (!gate) {
+   kfree(mux);
+   return;
+   }
+
+   reg = of_iomap(node, 0);
+
+   of_property_read_string(node, clock-output-names, clk_name);
+
+   while (i  SUN7I_A20_GMAC_MAX_PARENTS 
+   (parents[i] = of_clk_get_parent_name(node, i)) != NULL)
+   i++;
+
+   /* set up gate and fixed rate properties */
+   gate-reg = reg;
+   gate-bit_idx = SUN7I_A20_GMAC_GPIT;
+   gate-lock = clk_lock;
+   mux-reg = reg;
+   mux-mask = SUN7I_A20_GMAC_MASK;
+   mux-flags = 

[PATCH v3 2/8] ARM: dts: sun7i: Add GMAC clock node to sun7i DTSI

2014-02-02 Thread Chen-Yu Tsai
The GMAC uses 1 of 2 sources for its transmit clock, depending on the
PHY interface mode. Add both sources as dummy clocks, and as parents
to the GMAC clock node.

Signed-off-by: Chen-Yu Tsai w...@csie.org
---
 arch/arm/boot/dts/sun7i-a20.dtsi | 28 
 1 file changed, 28 insertions(+)

diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi
index 1595e9a..fc7f470 100644
--- a/arch/arm/boot/dts/sun7i-a20.dtsi
+++ b/arch/arm/boot/dts/sun7i-a20.dtsi
@@ -314,6 +314,34 @@
};
 
/*
+* The following two are dummy clocks, placeholders used
+* on gmac_tx clock. The actual frequency and availability
+* depends on the external PHY, operation mode and link
+* speed.
+*/
+   mii_phy_tx_clk: clk@2 {
+   #clock-cells = 0;
+   compatible = fixed-clock;
+   clock-frequency = 2500;
+   clock-output-names = mii_phy_tx;
+   };
+
+   gmac_int_tx_clk: clk@3 {
+   #clock-cells = 0;
+   compatible = fixed-clock;
+   clock-frequency = 12500;
+   clock-output-names = gmac_int_tx;
+   };
+
+   gmac_tx_clk: clk@01c20164 {
+   #clock-cells = 0;
+   compatible = allwinner,sun7i-a20-gmac-clk;
+   reg = 0x01c20164 0x4;
+   clocks = mii_phy_tx_clk, gmac_int_tx_clk;
+   clock-output-names = gmac_tx;
+   };
+
+   /*
 * Dummy clock used by output clocks
 */
osc24M_32k: clk@1 {
-- 
1.9.rc1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 3/8] ARM: dts: sun7i: Add GMAC controller node to sun7i DTSI

2014-02-02 Thread Chen-Yu Tsai
Signed-off-by: Chen-Yu Tsai w...@csie.org
---
 arch/arm/boot/dts/sun7i-a20.dtsi | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi
index fc7f470..5fbac23 100644
--- a/arch/arm/boot/dts/sun7i-a20.dtsi
+++ b/arch/arm/boot/dts/sun7i-a20.dtsi
@@ -630,6 +630,21 @@
status = disabled;
};
 
+   gmac: ethernet@01c5 {
+   compatible = allwinner,sun7i-a20-gmac;
+   reg = 0x01c5 0x1;
+   interrupts = 0 85 4;
+   interrupt-names = macirq;
+   clocks = ahb_gates 49, gmac_tx_clk;
+   clock-names = stmmaceth, allwinner_gmac_tx;
+   snps,pbl = 2;
+   snps,fixed-burst;
+   snps,force_sf_dma_mode;
+   status = disabled;
+   #address-cells = 1;
+   #size-cells = 0;
+   };
+
hstimer@01c6 {
compatible = allwinner,sun7i-a20-hstimer;
reg = 0x01c6 0x1000;
-- 
1.9.rc1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 00/16] drm/nouveau: initial support for GK20A (Tegra K1)

2014-02-02 Thread Ben Skeggs
On Mon, Feb 3, 2014 at 1:14 PM, Ilia Mirkin imir...@alum.mit.edu wrote:
 On Sun, Feb 2, 2014 at 9:44 PM, Alexandre Courbot acour...@nvidia.com wrote:
 One beginner question: is it appropriate to send kernel patches to the
 nouveau list in addition to dri-devel? The moderation messages I receive
 make me think that this list might rather be intended for general
 discussion.

 I usually do. The main thing is to make sure that they're To: Ben,
 since he's the one who will ultimately be picking them up. I think
 that if you're not subscribed, all the lists.freedesktop.org lists
 moderate you, but dri-devel is configured not to tell you about it.
 Also I've been getting bounce messages from nouveau@ complaining of
 too many cc's and so it's getting auto-moderated -- not sure who, if
 anyone, is an admin of the nouveau list. Hopefully someone :)
The Nouveau list seems the most appropriate.  There's not really any
need to explicitly CC me either, I do watch the list :)


   -ilia
 ___
 dri-devel mailing list
 dri-de...@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/dri-devel
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] irq_work: allow certain work in hard irq context

2014-02-02 Thread Mike Galbraith
On Sun, 2014-02-02 at 21:10 +0100, Sebastian Andrzej Siewior wrote:

 According to the backtrace both of them are trying to access the
 per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail
 to get the timer lock here. They shouldn't spin there for minutes, I
 have no idea why they did so…

Hm. per-cpu...

I've been chasing an rt hotplug heisenbug that is pointing to per-cpu
oddness.  During sched domain re-construction while running Steven's
stress script on 64 core box, we hit a freshly constructed domain with
_no span_, build_sched_groups()->get_group() explodes when we meet
it.  But if you try to watch the thing appear... it just doesn't.

static int build_sched_domains(const struct cpumask *cpu_map,
   struct sched_domain_attr *attr)
{
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
int i, ret = -ENOMEM;

alloc_state = __visit_domain_allocation_hell(d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;

/* Set up domains for cpus specified by the cpu_map. */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;

sd = NULL;
for_each_sd_topology(tl) {
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
BUG_ON(sd == spanless-alien) here..
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
if (tl-flags  SDTL_OVERLAP || 
sched_feat(FORCE_SD_OVERLAP))
sd-flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
}

/* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd-parent) {
sd-span_weight = cpumask_weight(sched_domain_span(sd));
if (sd-flags  SD_OVERLAP) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
if (build_sched_groups(sd, i))
..prevents meeting that alien here.. while hotplug locked.

static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
struct sched_domain *sd = *per_cpu_ptr(sdd-sd, cpu);
struct sched_domain *child = sd-child;
if (child)
cpu = cpumask_first(sched_domain_span(child));
^^^nr_cpus
if (sg) {
*sg = *per_cpu_ptr(sdd-sg, cpu); BOOM


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 1/6] fat: add i_disksize to represent uninitialized size

2014-02-02 Thread OGAWA Hirofumi
Namjae Jeon linkinj...@gmail.com writes:

 From: Namjae Jeon namjae.j...@samsung.com

 Add i_disksize to represent uninitialized allocated size.
 And mmu_private represent initialized allocated size.

Don't we need to update ->i_disksize after cont_write_begin()?
-- 
OGAWA Hirofumi hirof...@mail.parknet.co.jp
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 5/6] fat: permit to return phy block number by fibmap in fallocated region

2014-02-02 Thread OGAWA Hirofumi
Namjae Jeon linkinj...@gmail.com writes:

 From: Namjae Jeon namjae.j...@samsung.com

 Make the fibmap call return the proper physical block number for any
 offset request in the fallocated range.

 Signed-off-by: Namjae Jeon namjae.j...@samsung.com
 Signed-off-by: Amit Sahrawat a.sahra...@samsung.com
 ---
  fs/fat/cache.c |   13 ++---
  fs/fat/fat.h   |3 +++
  fs/fat/inode.c |3 +++
  3 files changed, 16 insertions(+), 3 deletions(-)

 diff --git a/fs/fat/cache.c b/fs/fat/cache.c
 index a132666..d22c1a2 100644
 --- a/fs/fat/cache.c
 +++ b/fs/fat/cache.c
 @@ -325,19 +325,26 @@ int fat_bmap(struct inode *inode, sector_t sector, 
 sector_t *phys,
  
   last_block = (i_size_read(inode) + (blocksize - 1))  blocksize_bits;
   if (sector = last_block) {
 - if (!create)
 - return 0;
 -
   /*
* Both -mmu_private and -i_disksize can access
* on only allocation path. (caller must hold -i_mutex)
*/
   last_block = (MSDOS_I(inode)-i_disksize + (blocksize - 1))
blocksize_bits;
 + if (!create) {
 + /* Map a block in fallocated region */
 + if (atomic_read(MSDOS_I(inode)-beyond_isize))
 + if (sector  last_block)
 + goto out_map_cluster;
 +
 + return 0;
 + }
 +
   if (sector = last_block)
   return 0;
   }
  
 +out_map_cluster:
   cluster = sector  (sbi-cluster_bits - sb-s_blocksize_bits);
   offset  = sector  (sbi-sec_per_clus - 1);
   cluster = fat_bmap_cluster(inode, cluster);
 diff --git a/fs/fat/fat.h b/fs/fat/fat.h
 index 7b5851f..b884276 100644
 --- a/fs/fat/fat.h
 +++ b/fs/fat/fat.h
 @@ -129,6 +129,9 @@ struct msdos_inode_info {
   struct hlist_node i_dir_hash;   /* hash by i_logstart */
   struct rw_semaphore truncate_lock; /* protect bmap against truncate */
   struct inode vfs_inode;
 +
 + /* for getting block number beyond file size in case of fallocate */
 + atomic_t beyond_isize;
  };
  
  struct fat_slot_info {
 diff --git a/fs/fat/inode.c b/fs/fat/inode.c
 index 3636617..1c3192b 100644
 --- a/fs/fat/inode.c
 +++ b/fs/fat/inode.c
 @@ -256,7 +256,10 @@ static sector_t _fat_bmap(struct address_space *mapping, 
 sector_t block)
  
   /* fat_get_cluster() assumes the requested blocknr isn't truncated. */
   down_read(MSDOS_I(mapping-host)-truncate_lock);
 + /* To get block number beyond file size in fallocated region */
 + atomic_set(MSDOS_I(mapping-host)-beyond_isize, 1);
   blocknr = generic_block_bmap(mapping, block, fat_get_block);
 + atomic_set(MSDOS_I(mapping-host)-beyond_isize, 0);
   up_read(MSDOS_I(mapping-host)-truncate_lock);

This is racy. While user is using bmap, kernel can allocate new blocks.
We should use another function for this.

For example, something like

fat_get_block_bmap()
{
[...]
fat_get_block2(inode, iblock, max_blocks, bh_result, create, bmap);
[...]
}

blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
-- 
OGAWA Hirofumi hirof...@mail.parknet.co.jp
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 2/6] fat: add fat_fallocate operation

2014-02-02 Thread OGAWA Hirofumi

Sorry for long delay.

Namjae Jeon linkinj...@gmail.com writes:

 + if (mode  FALLOC_FL_KEEP_SIZE) {
 + /* First compute the number of clusters to be allocated */
 + mm_bytes = offset + len - round_up(MSDOS_I(inode)-mmu_private,
 + sbi-cluster_size);

This should use ->i_disksize?

[...]

 + /* Release unwritten fallocated blocks on inode eviction. */
 + if (MSDOS_I(inode)-mmu_private  MSDOS_I(inode)-i_disksize) {
 + int err;
 + fat_truncate_blocks(inode, MSDOS_I(inode)-mmu_private);
 + /* Fallocate results in updating the i_start/iogstart
 +  * for the zero byte file. So, make it return to
 +  * original state during evict and commit it
 +  * synchrnously to avoid any corruption on the next
 +  * access to the cluster chain for the file.
 +  */
 + err = fat_sync_inode(inode);

Ah, good catch. We have to update i_size. I was forgetting about this.
Well, syncing the inode unconditionally would not be good. Maybe we had
better use __fat_write_inode() with inode_needs_sync() or such.
-- 
OGAWA Hirofumi hirof...@mail.parknet.co.jp
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] arm: document mach-virt platform.

2014-02-02 Thread Christoffer Dall
On Thu, Jan 30, 2014 at 04:11:02PM +, Ian Campbell wrote:
 mach-virt has existed for a while but it is not written down what it actually
 consists of. Although it seems a bit unusual to document a binding for an
 entire platform since mach-virt is entirely virtual it is helpful to have
 something to refer to in the absence of a single concrete implementation.
 
 I've done my best to capture the requirements based on the git log and my
 memory/understanding.

[...]

 
 +
 +The platform may also provide hypervisor specific functionality
 +(e.g. PV I/O), if it does so then this functionality must be
 +discoverable (directly or indirectly) via device tree.

While this is obviously true, I'm not sure I see the value of this text.

Isn't it more essential to just say that *any* functionality provided to
the platform must be discoverable via device tree?

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] arm: document mach-virt platform.

2014-02-02 Thread Christoffer Dall
On Thu, Jan 30, 2014 at 11:54:46AM -0500, Christopher Covington wrote:
 Hi Ian,
 
 On 01/30/2014 11:11 AM, Ian Campbell wrote:
  mach-virt has existed for a while but it is not written down what it 
  actually
  consists of. Although it seems a bit unusual to document a binding for an
  entire platform since mach-virt is entirely virtual it is helpful to have
  something to refer to in the absence of a single concrete implementation.
  
  I've done my best to capture the requirements based on the git log and my
  memory/understanding.
  
  While here remove the xenvm dts example, the Xen tools will now build a
  suitable mach-virt compatible dts when launching the guest.
 

[...]

  +The platform may also provide hypervisor specific functionality
  +(e.g. PV I/O), if it does so then this functionality must be
  +discoverable (directly or indirectly) via device tree.
 
 I think it would be informative to provide pointers here to commonly used
 paravirtualized devices, especially VirtIO PCI/MMIO.
 

I disagree: that would only encourage limited testing or assumptions
about these specific devices when really this platform is just a
bare-bones platform driven by device tree which should make no
preference, whatsoever, about which devices are used with the platform.

-Christoffer
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHSET 0/5] tracing/uprobes: Support multi buffer and event trigger

2014-02-02 Thread Namhyung Kim
Ping!

On Fri, 17 Jan 2014 17:08:35 +0900, Namhyung Kim wrote:
 Hello,
 (Resending with LKML CC'ed)

 This patchset tries to add support for recent multi buffer and event
 trigger changes to uprobes.  The multi buffer support patch is an
 updated version of Zovi's previous patch v6 [1].

 Zovi, please tell me if you have any update and/or issues with this.

 Masami and Oleg, I kept your Reviewed-by's in the patch since I think
 it's just a rebase.  Please take a look again to see whether I added
 some mistakes.


 You can also get it from 'uprobe/trigger-v1' branch in my tree

   git://git.kernel.org/pub/scm/linux/kernel/git/namhyung/linux-perf.git

 Any comments are welcome, thanks
 Namhyung


 [1] https://lkml.org/lkml/2013/7/4/165

 Cc: Masami Hiramatsu masami.hiramatsu...@hitachi.com
 Cc: Oleg Nesterov o...@redhat.com
 Cc: Srikar Dronamraju sri...@linux.vnet.ibm.com
 Cc: zhangwei(Jovi) jovi.zhang...@huawei.com
 Cc: Tom Zanussi tom.zanu...@linux.intel.com  


 Namhyung Kim (4):
   tracing/uprobes: Rename uprobe_{trace,perf}_print() functions
   tracing/uprobes: Move argument fetching to uprobe_dispatcher()
   tracing/uprobes: Support event triggering
   tracing/uprobes: Support mix of ftrace and perf

 zhangwei(Jovi) (1):
   tracing/uprobes: Support ftrace_event_file base multibuffer

  kernel/trace/trace_kprobe.c |  17 
  kernel/trace/trace_probe.h  |  17 
  kernel/trace/trace_uprobe.c | 191 
 +++-
  3 files changed, 151 insertions(+), 74 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/6] audit: Enable arm64 support

2014-02-02 Thread AKASHI Takahiro

Richard,

On 01/30/2014 07:36 AM, Richard Guy Briggs wrote:

On 14/01/29, Richard Guy Briggs wrote:

On 14/01/27, AKASHI Takahiro wrote:

[To audit maintainers]

On 01/23/2014 11:18 PM, Catalin Marinas wrote:

On Fri, Jan 17, 2014 at 08:13:14AM +, AKASHI Takahiro wrote:

--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -327,6 +327,8 @@ enum {
  /* distinguish syscall tables */
  #define __AUDIT_ARCH_64BIT 0x8000
  #define __AUDIT_ARCH_LE  0x4000
+#define AUDIT_ARCH_AARCH64 (EM_AARCH64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define AUDIT_ARCH_AARCH64EB   (EM_AARCH64|__AUDIT_ARCH_64BIT)
  #define AUDIT_ARCH_ALPHA  (EM_ALPHA|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
  #define AUDIT_ARCH_ARM(EM_ARM|__AUDIT_ARCH_LE)
  #define AUDIT_ARCH_ARMEB  (EM_ARM)
diff --git a/init/Kconfig b/init/Kconfig
index 79383d3..3aae602 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -284,7 +284,7 @@ config AUDIT

  config AUDITSYSCALL
bool Enable system-call auditing support
-   depends on AUDIT  (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH 
|| (ARM  AEABI  !OABI_COMPAT))
+   depends on AUDIT  (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH 
|| (ARM  AEABI  !OABI_COMPAT) || ARM64)


The usual comment for such changes: could you please clean this up and
just use something like depends on HAVE_ARCH_AUDITSYSCALL?


Do you agree to this change?

If so, I can create a patch, but have some concerns:
1) I can't verify it on other architectures than (arm &) arm64.
2) Some architectures (microblaze, mips, openrisc) are not listed here, but
their ptrace.c have a call to audit_syscall_entry/exit().
(audit_syscall_entry/exit are null if !AUDITSYSCALL, though)


I can try: ppc s390 x86_64 ppc64 i686 s390x


These arches above all pass compile and basic tests with the following patches 
applied:

audit: correct a type mismatch in audit_syscall_exit() pending (already 
upstream)

audit: Modify a set of system calls in audit class definitions (already 
upstream)

[PATCH v3] audit: Add generic compat syscall support

[PATCH v2] audit: Enable arm64 support
[PATCH v2] arm64: Add regs_return_value() in syscall.h
[PATCH v2] arm64: Add audit support
[PATCH v2] arm64: audit: Add 32-bit (compat) syscall support
[PATCH v2] arm64: audit: Add makefile rule to create unistd_32.h for 
compat syscalls
[PATCH v2] arm64: audit: Add audit hook in ptrace/syscall_trace


I think that you missed Catalin's suggestion.
Please use the patch I will post after this message and try it again, please?

Thanks,
-Takahiro AKASHI




So I'm afraid that the change might break someone's assumption.

Thanks,
-Takahiro AKASHI


- RGB


- RGB

--
Richard Guy Briggs rbri...@redhat.com
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red 
Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] audit: Add CONFIG_HAVE_ARCH_AUDITSYSCALL

2014-02-02 Thread AKASHI Takahiro
Currently AUDITSYSCALL has a long list of architecture dependencies:
   depends on AUDIT && (X86 || PARISC || PPC || S390 || IA64 || UML ||
SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT))
The purpose of this patch is to replace it with HAVE_ARCH_AUDITSYSCALL
for simplicity.

Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org
---
 arch/arm/Kconfig   |1 +
 arch/ia64/Kconfig  |1 +
 arch/parisc/Kconfig|1 +
 arch/powerpc/Kconfig   |1 +
 arch/s390/Kconfig  |1 +
 arch/sh/Kconfig|1 +
 arch/sparc/Kconfig |1 +
 arch/um/Kconfig.common |1 +
 arch/x86/Kconfig   |1 +
 init/Kconfig   |5 -
 10 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index c1f1a7e..cf69f89 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -23,6 +23,7 @@ config ARM
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HARDIRQS_SW_RESEND
+   select HAVE_ARCH_AUDITSYSCALL if (AEABI  !OABI_COMPAT)
select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
select HAVE_ARCH_KGDB
select HAVE_ARCH_SECCOMP_FILTER if (AEABI  !OABI_COMPAT)
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 4e4119b..9143d91 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -43,6 +43,7 @@ config IA64
select HAVE_MOD_ARCH_SPECIFIC
select MODULES_USE_ELF_RELA
select ARCH_USE_CMPXCHG_LOCKREF
+   select HAVE_ARCH_AUDITSYSCALL
default y
help
  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index b5f1858..0821e83 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -28,6 +28,7 @@ config PARISC
select CLONE_BACKWARDS
select TTY # Needed for pdc_cons.c
select HAVE_DEBUG_STACKOVERFLOW
+   select HAVE_ARCH_AUDITSYSCALL
 
help
  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b44b52c..96627d6 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -139,6 +139,7 @@ config PPC
select OLD_SIGACTION if PPC32
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_IRQ_EXIT_ON_IRQ_STACK
+   select HAVE_ARCH_AUDITSYSCALL
 
 config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 1e1a03d..b3b9853 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -103,6 +103,7 @@ config S390
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+   select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL if !MARCH_G5
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 9b0979f..675fb7c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -42,6 +42,7 @@ config SUPERH
select MODULES_USE_ELF_RELA
select OLD_SIGSUSPEND
select OLD_SIGACTION
+   select HAVE_ARCH_AUDITSYSCALL
help
  The SuperH is a RISC processor targeted for use in embedded systems
  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index d4f7a6a..7f7ad7e 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -76,6 +76,7 @@ config SPARC64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select HAVE_C_RECORDMCOUNT
select NO_BOOTMEM
+   select HAVE_ARCH_AUDITSYSCALL
 
 config ARCH_DEFCONFIG
string
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common
index 21ca44c..6915d28 100644
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -1,6 +1,7 @@
 config UML
bool
default y
+   select HAVE_ARCH_AUDITSYSCALL
select HAVE_UID16
select GENERIC_IRQ_SHOW
select GENERIC_CPU_DEVICES
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e903c71..6ef682f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -124,6 +124,7 @@ config X86
select RTC_LIB
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
+   select HAVE_ARCH_AUDITSYSCALL
 
 config INSTRUCTION_DECODER
def_bool y
diff --git a/init/Kconfig b/init/Kconfig
index 79383d3..9fe22d2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -282,9 +282,12 @@ config AUDIT
  logging of avc messages output).  Does not do system-call
  auditing without CONFIG_AUDITSYSCALL.
 
+config HAVE_ARCH_AUDITSYSCALL
+   bool
+
 config AUDITSYSCALL
bool Enable system-call auditing support
-   depends on AUDIT  (X86 || PARISC || PPC || S390 || IA64 || UML || 
SPARC64 || SUPERH || (ARM  AEABI  !OABI_COMPAT))
+   depends on AUDIT  HAVE_ARCH_AUDITSYSCALL
default y if SECURITY_SELINUX
help
  Enable 

[PATCH] f2fs: remove the ugly pointer conversion

2014-02-02 Thread Jaegeuk Kim
This patch modifies the use of bi_private to remove pointer chasing for sbi.
Previously, we had a bi_private structure, but it needs memory allocation.
So this patch uses bi_private by the sbi pointer and adds a completion pointer
into the sbi.
This can achieve no memory allocation and nice use of the bi_private.

Signed-off-by: Jaegeuk Kim jaegeuk@samsung.com
---
 fs/f2fs/data.c | 11 +++
 fs/f2fs/f2fs.h |  1 +
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 20c3c64..d175ae3 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
 
 static void f2fs_write_end_io(struct bio *bio, int err)
 {
-   struct f2fs_sb_info *sbi = 
F2FS_SB(bio-bi_io_vec-bv_page-mapping-host-i_sb);
+   struct f2fs_sb_info *sbi = bio-bi_private;
struct bio_vec *bvec;
int i;
 
@@ -61,8 +61,10 @@ static void f2fs_write_end_io(struct bio *bio, int err)
dec_page_count(sbi, F2FS_WRITEBACK);
}
 
-   if (bio-bi_private)
-   complete(bio-bi_private);
+   if (sbi-wait_io) {
+   complete(sbi-wait_io);
+   sbi-wait_io = NULL;
+   }
 
if (!get_pages(sbi, F2FS_WRITEBACK) 
!list_empty(sbi-cp_wait.task_list))
@@ -85,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, 
block_t blk_addr,
bio-bi_bdev = sbi-sb-s_bdev;
bio-bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
bio-bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
+   bio-bi_private = sbi;
 
return bio;
 }
@@ -112,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
 */
if (fio-type == META_FLUSH) {
DECLARE_COMPLETION_ONSTACK(wait);
-   io-bio-bi_private = wait;
+   io-sbi-wait_io = wait;
submit_bio(rw, io-bio);
wait_for_completion(wait);
} else {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 55288d2..aeff132 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -398,6 +398,7 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info read_io;   /* for read bios */
struct f2fs_bio_info write_io[NR_PAGE_TYPE];/* for write bios */
+   struct completion *wait_io; /* for completion bios */
 
/* for checkpoint */
struct f2fs_checkpoint *ckpt;   /* raw checkpoint pointer */
-- 
1.8.4.474.g128a96c

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpufreq: cpu0: make THERMAL_CPU support optional

2014-02-02 Thread Viresh Kumar
Cc'ing the guy who introduced this bug..

On 2 February 2014 04:20, Rob Herring robherri...@gmail.com wrote:
 From: Rob Herring r...@kernel.org

 The addition of THERMAL and THERMAL_CPU selections causes a kconfig
 warning on highbank platforms:

 warning: (ARM_HIGHBANK_CPUFREQ) selects GENERIC_CPUFREQ_CPU0 which has
 unmet direct dependencies (ARCH_HAS_CPUFREQ && CPU_FREQ && HAVE_CLK &&
  REGULATOR && OF && THERMAL && CPU_THERMAL)

 The cpufreq-cpu0 driver does not require thermal zone support, and it
 should be selectable independently. Add a new kconfig option to enable
 this feature.

 Reported-by: Olof Johansson o...@lixom.net
 Cc: Rafael J. Wysocki r...@rjwysocki.net
 Cc: Viresh Kumar viresh.ku...@linaro.org
 Cc: cpuf...@vger.kernel.org
 Signed-off-by: Rob Herring r...@kernel.org
 ---
  drivers/cpufreq/Kconfig | 10 +-
  1 file changed, 9 insertions(+), 1 deletion(-)

 diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
 index 4b029c0..a197a04a 100644
 --- a/drivers/cpufreq/Kconfig
 +++ b/drivers/cpufreq/Kconfig
 @@ -185,7 +185,7 @@ config CPU_FREQ_GOV_CONSERVATIVE

  config GENERIC_CPUFREQ_CPU0
 tristate Generic CPU0 cpufreq driver
 -   depends on HAVE_CLK  REGULATOR  OF  THERMAL  CPU_THERMAL
 +   depends on HAVE_CLK  REGULATOR  OF

That's fine as code would still compile due to dummy routines..

 select PM_OPP
 help
   This adds a generic cpufreq driver for CPU0 frequency management.
 @@ -194,6 +194,14 @@ config GENERIC_CPUFREQ_CPU0

   If in doubt, say N.

 +config GENERIC_CPUFREQ_CPU0_THERMAL
 +   bool Thermal zone support for Generic CPU0 cpufreq
 +   depends on GENERIC_CPUFREQ_CPU0
 +   select THERMAL
 +   select THERMAL_CPU
 +   help
 + This adds thermal support to the generic cpufreq driver for CPU0.

But do we really need this? Let the platform enable THERMAL
and THERMAL_CPU themselves, as this was the case currently.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH TRIVIAL] mm: vmscan: shrink_slab: rename max_pass -> freeable

2014-02-02 Thread David Rientjes
On Sun, 2 Feb 2014, Vladimir Davydov wrote:

 The name `max_pass' is misleading, because this variable actually keeps
 the estimated number of freeable objects, not the maximal number of
 objects we can scan in this pass, which can be twice that. Rename it to
 reflect its actual meaning.
 
 Signed-off-by: Vladimir Davydov vdavy...@parallels.com

Acked-by: David Rientjes rient...@google.com
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86, perf, p4: Counter corruption when using lots of perf groups

2014-02-02 Thread Cyrill Gorcunov
On Wed, Jan 29, 2014 at 03:17:17PM -0500, Don Zickus wrote:
   I am not entirely sure on the corruption path, but what happens is:
   
   o perf schedules a group with p4_pmu_schedule_events()
   o inside p4_pmu_schedule_events(), it notices an hwc pointer is being 
   reused
 but for a different cpu, so it 'swaps' the config bits and returns the
 updated 'assign' array with a _new_ index.
   o perf schedules another group with p4_pmu_schedule_events()
   o inside p4_pmu_schedule_events(), it notices an hwc pointer is being 
   reused
 (the same one as above) but for the _same_ cpu [BUG!!], so it updates 
   the
 'assign' array to use the _old_ (wrong cpu) index because the _new_ 
   index is in
 an earlier part of the 'assign' array (and hasn't been committed yet).
   o perf commits the transaction using the wrong index and corrupts the 
   other cpu
  
  Thanks for the fix Don! I fear I won't be able to look precisely tonight, so
  could it wait until tomorrow? (If it's critical sure such fix should do the
  trick).
 
 There is no rush.  Early next week is fine too. :-)

Hi Don, sorry for delay. I thought maybe extending match_prev_assignment()
would be better (ie to figure out if previous event can run without
reprogramming the counter) but this makes code only harder (and what
is worse -- having no physical access to p4 machine leaves no chance
to test changes). So eventually I think your patch does the same thing
as I had in mind but in different way. Thus

Acked-by: Cyrill Gorcunov gorcu...@openvz.org

thanks a lot!
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


RE: [PATCH 11/34] benet: Use pci_enable_msix_range()

2014-02-02 Thread Sathya Perla
 -Original Message-
 From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org] On 
 Behalf
 Of Alexander Gordeev
 
 As result of deprecation of MSI-X/MSI enablement functions
 pci_enable_msix() and pci_enable_msi_block() all drivers
 using these two interfaces need to be updated to use the
 new pci_enable_msi_range() and pci_enable_msix_range()
 interfaces.
 
 Signed-off-by: Alexander Gordeev agord...@redhat.com

Acked-by: Sathya Perla  sathya.pe...@emulex.com

 ---
  drivers/net/ethernet/emulex/benet/be_main.c |   31 +++---
  1 files changed, 13 insertions(+), 18 deletions(-)
 
 diff --git a/drivers/net/ethernet/emulex/benet/be_main.c
 b/drivers/net/ethernet/emulex/benet/be_main.c
 index 04ac9c6..f55c09b 100644
 --- a/drivers/net/ethernet/emulex/benet/be_main.c
 +++ b/drivers/net/ethernet/emulex/benet/be_main.c
 @@ -2505,7 +2505,7 @@ static void be_msix_disable(struct be_adapter *adapter)
 
  static int be_msix_enable(struct be_adapter *adapter)
  {
 - int i, status, num_vec;
 + int i, num_vec;
   struct device *dev = adapter-pdev-dev;
 
   /* If RoCE is supported, program the max number of NIC vectors that
 @@ -2521,24 +2521,11 @@ static int be_msix_enable(struct be_adapter *adapter)
   for (i = 0; i  num_vec; i++)
   adapter-msix_entries[i].entry = i;
 
 - status = pci_enable_msix(adapter-pdev, adapter-msix_entries, num_vec);
 - if (status == 0) {
 - goto done;
 - } else if (status = MIN_MSIX_VECTORS) {
 - num_vec = status;
 - status = pci_enable_msix(adapter-pdev, adapter-msix_entries,
 -  num_vec);
 - if (!status)
 - goto done;
 - }
 + num_vec = pci_enable_msix_range(adapter-pdev, adapter-msix_entries,
 + MIN_MSIX_VECTORS, num_vec);
 + if (num_vec  0)
 + goto fail;
 
 - dev_warn(dev, MSIx enable failed\n);
 -
 - /* INTx is not supported in VFs, so fail probe if enable_msix fails */
 - if (!be_physfn(adapter))
 - return status;
 - return 0;
 -done:
   if (be_roce_supported(adapter)  num_vec  MIN_MSIX_VECTORS) {
   adapter-num_msix_roce_vec = num_vec / 2;
   dev_info(dev, enabled %d MSI-x vector(s) for RoCE\n,
 @@ -2550,6 +2537,14 @@ done:
   dev_info(dev, enabled %d MSI-x vector(s) for NIC\n,
adapter-num_msix_vec);
   return 0;
 +
 +fail:
 + dev_warn(dev, MSIx enable failed\n);
 +
 + /* INTx is not supported in VFs, so fail probe if enable_msix fails */
 + if (!be_physfn(adapter))
 + return num_vec;
 + return 0;
  }
 
  static inline int be_msix_vec_get(struct be_adapter *adapter,
 --
 1.7.7.6
 
 --
 To unsubscribe from this list: send the line unsubscribe netdev in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] memcg: export kmemcg cache id via cgroup fs

2014-02-02 Thread David Rientjes
On Sun, 2 Feb 2014, Vladimir Davydov wrote:

 Per-memcg kmem caches are named as follows:
 
   global-cache-name(cgroup-kmem-id:cgroup-name)
 
 where cgroup-kmem-id is the unique id of the memcg the cache belongs
 to, cgroup-name is the relative name of the memcg on the cgroup fs.
 Cache names are exposed to userspace for debugging purposes (e.g. via
 sysfs in case of slub or via dmesg).
 
 Using relative names makes it impossible in general (in case the cgroup
 hierarchy is not flat) to find out which memcg a particular cache
 belongs to, because cgroup-kmem-id is not known to the user. Since
 using absolute cgroup names would be an overkill, let's fix this by
 exporting the id of kmem-active memcg via cgroup fs file
 memory.kmem.id.
 

Hmm, I'm not sure exporting additional information is the best way to do 
it only for this purpose.  I do understand the problem in naming 
collisions if the hierarchy isn't flat and we typically work around that 
by ensuring child memcgs still have a unique memcg.  This isn't only a 
problem in slab cache naming, we also avoid printing the entire absolute 
names for things like the oom killer.  So it would be nice to have 
consensus on how people are supposed to identify memcgs with a hierarchy: 
either by exporting information like the id like you do here (but leave 
the oom killer still problematic) or by insisting people name their memcgs 
with unique names if they care to differentiate them.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] acpi-cpufreq: De-register cpu notifier and free struct msr on error.

2014-02-02 Thread Viresh Kumar
On 28 January 2014 09:28, Konrad Rzeszutek Wilk kon...@kernel.org wrote:
 If cpufreq_register_driver() fails we would free the acpi driver
 related structures but not free the ones allocated
 by acpi_cpufreq_boost_init() function. This meant that as
 the driver error-ed out and a CPU online/offline event came
 we would crash and burn as one of the CPU notifiers would point
 to garbage.

 This fixes a regression that commit cfc9c8ed03e4d908f2388af8815f44c87b503aaf
 acpi-cpufreq: Adjust the code to use the common boost attribute
 introduced.

 CC: Lukasz Majewski l.majew...@samsung.com
 CC: Myungjoo Ham myungjoo@samsung.com
 CC: Viresh Kumar viresh.ku...@linaro.org
 CC: Rafael J. Wysocki rafael.j.wyso...@intel.com
 CC: Boris Ostrovsky boris.ostrov...@oracle.com
 Signed-off-by: Konrad Rzeszutek Wilk konrad.w...@oracle.com
 ---
  drivers/cpufreq/acpi-cpufreq.c |5 +++--
  1 files changed, 3 insertions(+), 2 deletions(-)

Acked-by: Viresh Kumar viresh.ku...@linaro.org
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 3/3] idle: store the idle state index in the struct rq

2014-02-02 Thread Preeti U Murthy
Hi Daniel,

On 01/31/2014 03:45 PM, Daniel Lezcano wrote:
 On 01/31/2014 09:45 AM, Preeti Murthy wrote:
 Hi,

 On Thu, Jan 30, 2014 at 10:55 PM, Daniel Lezcano
 daniel.lezc...@linaro.org wrote:
 On 01/30/2014 05:35 PM, Peter Zijlstra wrote:

 On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:

 struct cpuidle_state *state = drv-states[rq-index];

 And from the state, we have the following information:

 struct cpuidle_state {

  [ ... ]

   unsigned intexit_latency; /* in US */
   int power_usage; /* in mW */
   unsigned inttarget_residency; /* in US */
   booldisabled; /* disabled on all CPUs */

  [ ... ]
 };


 Right, but can we say that a higher index will save more power and have
 a higher exit latency? Or is a driver free to have a random mapping
 from
 idle_index to state?


 If the driver does its own random mapping that will break the governor
 logic. So yes, the states are ordered, the higher the index is, the
 more you
 save power and the higher the exit latency is.

 The above point holds true for only the ladder governor which sees the
 idle
 states indexed in the increasing order of target_residency/exit_latency.
 
 The cpuidle framework has been modified for both governor, see commit
 8aef33a7.
 
 The power field was initially used to do the selection, but no power
 value was ever used to fill this field by any hardware. So the field
 was arbitrarily filled with a decreasing value (-1, -2, -3 ...), and
 used by the governor's select function. The patch above just removed
 this field and the condition on power for 'select' assuming the idle
 state are power ordered in the array.

Ok. Looking at commit id 71abbbf856a0, it looks like the primary
motivation for it was the power_usage numbers of each idle state. But if
that went unused, then it perhaps makes sense to revert that patch.

Commit 8aef33a7 pretty much did that. However I think it overlooked the
menu_select() function where the search iterates through all the
idle states introduced by the above mentioned commit again. Since its
purpose is outdated as per what you say, it's best if we correct this now
as per the below post that you have pointed to.

[RFC PATCH] cpuidle: reduce unnecessary loop in c-state selection
 
 However this is not true as far as I can see in the menu governor. It
 acknowledges the dynamic ordering of idle states as can be seen in the
 menu_select() function in the menu governor, where the idle state for the
 CPU gets chosen.  You will notice that, even if it is found that the
 predicted
 idle time of the CPU is smaller than the target residency of an idle
 state,
 the governor continues to search for suitable idle states in the
 higher indexed
 states although it should have halted if the idle states' were ordered
 according
 to their target residency.. The same holds for exit_latency.
 
 I am not sure I get the point. Actually, this loop should just be
 optimized to backward search the idle state like cpuidle_play_dead does
 
 There is also a patch proposed by Alex Shi about this loop.
 
 [RFC PATCH] cpuidle: reduce unnecessary loop in c-state selection
 
 http://comments.gmane.org/gmane.linux.power-management.general/42124

But again if we are copying the exit_latency and target_residency
numbers of the idle state entered, into the rq as soon as the idle state
for the CPU is chosen, as per the discussion on this thread, then I
guess the ordering of the idle states in the cpuidle state table does
not matter.

Thanks

Regards
Preeti U Murthy

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 6/6] mm, hugetlb: improve page-fault scalability

2014-02-02 Thread Joonsoo Kim
On Fri, Jan 31, 2014 at 09:36:46AM -0800, Davidlohr Bueso wrote:
 From: Davidlohr Bueso davidl...@hp.com
 
 The kernel can currently only handle a single hugetlb page fault at a time.
 This is due to a single mutex that serializes the entire path. This lock
 protects from spurious OOM errors under conditions of low availability
 of free hugepages. This problem is specific to hugepages, because it is
 normal to want to use every single hugepage in the system - with normal pages
 we simply assume there will always be a few spare pages which can be used
 temporarily until the race is resolved.
 
 Address this problem by using a table of mutexes, allowing a better chance of
 parallelization, where each hugepage is individually serialized. The hash key
 is selected depending on the mapping type. For shared ones it consists of the
 address space and file offset being faulted; while for private ones the mm and
 virtual address are used. The size of the table is selected based on a 
 compromise
 of collisions and memory footprint of a series of database workloads.

Hello,

Thanks for doing this patchset. :)
Just one question!
Why do we need a separate hash key depending on the mapping type?

Thanks.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mmc:sdhci: handle busy-end interrupt during command

2014-02-02 Thread Chanho Min
It is fully legal for a controller to start handling busy-end interrupt
before it has signaled that the command has completed. So make sure
we do things in the proper order; otherwise the command interrupt
is ignored, which can cause unexpected operations. This was found on some
Toshiba eMMC devices with the warning below.

mmc0: Got command interrupt 0x0001 even though
no command operation was in progress.

Signed-off-by: Hankyung Yu hankyung...@lge.com
Signed-off-by: Chanho Min chanho@lge.com
---
 drivers/mmc/host/sdhci.c  |   17 +++--
 include/linux/mmc/sdhci.h |1 +
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index bd8a098..21f98e7 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -1016,6 +1016,7 @@ void sdhci_send_command(struct sdhci_host *host, struct 
mmc_command *cmd)
mod_timer(host-timer, jiffies + 10 * HZ);
 
host-cmd = cmd;
+   host-busy_handle = 0;
 
sdhci_prepare_data(host, cmd);
 
@@ -2271,8 +2272,12 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 
intmask)
if (host-cmd-data)
DBG(Cannot wait for busy signal when also 
doing a data transfer);
-   else if (!(host-quirks  SDHCI_QUIRK_NO_BUSY_IRQ))
+   else if (!(host-quirks  SDHCI_QUIRK_NO_BUSY_IRQ)
+!host-busy_handle) {
+   /* Mark that command complete before busy is ended */
+   host-busy_handle = 1;
return;
+   }
 
/* The controller does not support the end-of-busy IRQ,
 * fall through and take the SDHCI_INT_RESPONSE */
@@ -2335,7 +2340,15 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 
intmask)
 */
if (host-cmd  (host-cmd-flags  MMC_RSP_BUSY)) {
if (intmask  SDHCI_INT_DATA_END) {
-   sdhci_finish_command(host);
+   /*
+* Some cards handle busy-end interrupt
+* before the command completed, so make
+* sure we do things in the proper order.
+*/
+   if (host-busy_handle)
+   sdhci_finish_command(host);
+   else
+   host-busy_handle = 1;
return;
}
}
diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h
index 3e781b8..0118020 100644
--- a/include/linux/mmc/sdhci.h
+++ b/include/linux/mmc/sdhci.h
@@ -148,6 +148,7 @@ struct sdhci_host {
struct mmc_command *cmd;/* Current command */
struct mmc_data *data;  /* Current data request */
unsigned int data_early:1;  /* Data finished before cmd */
+   unsigned int busy_handle:1; /* Handling the order of Busy-end */
 
struct sg_mapping_iter sg_miter;/* SG state for PIO */
unsigned int blocks;/* remaining PIO blocks */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Is it ok for deferrable timer wakeup the idle cpu?

2014-02-02 Thread Viresh Kumar
Sorry, I was away on a short vacation.

On 28 January 2014 19:20, Frederic Weisbecker fweis...@gmail.com wrote:
 On Thu, Jan 23, 2014 at 07:50:40PM +0530, Viresh Kumar wrote:
 Wait, I got the wrong code here. That wasn't my initial intention.
 I actually wanted to write something like this:

  -   wake_up_nohz_cpu(cpu);
  +   if (!tbase_get_deferrable(timer-base) || idle_cpu(cpu))
  +   wake_up_nohz_cpu(cpu);

 Will that work?

Something is seriously wrong with me, again wrote rubbish code.
Let me phrase what I wanted to write :)

don't send an IPI to an idle CPU for a deferrable timer.

Hopefully I have coded it correctly this time, at least.

-   wake_up_nohz_cpu(cpu);
+   if (!(tbase_get_deferrable(timer-base)  idle_cpu(cpu)))
+   wake_up_nohz_cpu(cpu);

 Well, this is going to wake up the target from its idle state, which is
 what we want to avoid if the timer is deferrable, right?

Yeah, sorry for doing it for second time :(

 The simplest thing we want is:

if (!tbase_get_deferrable(timer-base) || tick_nohz_full_cpu(cpu))
wake_up_nohz_cpu(cpu);

 This spares the IPI for the common case where the timer is deferrable and we 
 run
 in periodic or dynticks-idle mode (which should be 99.99% of the existing 
 workloads).

I wasn't looking at this problem with NO_HZ_FULL in mind, as I thought it was
only about whether the CPU is idle or not. And so the solution I was
talking about was:

don't send an IPI to an idle CPU for a deferrable timer.

But I see that still failing with the code you wrote. For normal cases where we
don't enable NO_HZ_FULL, we will still end up waking up idle CPUs which
is what Lei Wen reported initially.

Also if a CPU is marked for NO_HZ_FULL and is not idle currently then we
wouldn't send an IPI for a deferrable timer. But we actually need that, so that
we can reevaluate the timers order again?

 Then we can later optimize that and spare the IPI on full dynticks CPUs when 
 they run
 idle, but that require some special care about subtle races which can't be 
 dealt
 with a simple test on idle_cpu(target). And power consumption in full 
 dynticks
 is already very suboptimized anyway.

 So I suggest we start simple with the above test, and a big fat comment which 
 explains
 what we are doing and what needs to be done in the future.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 0/3] Deferrable timers support for timerfd API

2014-02-02 Thread Alexey Perevalov

Dear John, hello

Could we figure this out without Thomas's advice?
Maybe it is worth proposing a timerfd and posix timer flag unification patch?

On 01/21/2014 11:12 PM, John Stultz wrote:

On 01/13/2014 02:43 AM, Alexey Perevalov wrote:

Hello dear community.

This is a reworked patch set of Anton Vorontsov's original
proposal regarding unified deferrable timers in user space.
http://lwn.net/Articles/514707/


I decided to resubmit it because we found it useful for us too.

timerfd was modified since Anton's commit, Alarm support was added.
This isn't only a rebase. Anton's previous version used a deferrable timer
coupled with an hrtimer. This version uses only a deferrable timer. This
means the behaviour of the overrun number is different.
E.g. if you don't poll a one-second timer for 10 seconds, you'll get
10 overruns with hrtimer, but for a deferrable timer it could be another value.


Sorry, last week was a little crazy and I didn't get a chance to closely
review this. But looking at this my major conceptual objection with the
previous patchset (introducing the new clockid) is gone.

My remaining conceptual concern here is that the TIMER_DEFERRABLE flag
is a timerfd only construct here, and I worry we should make sure we
think this through well enough that the same functionality can be
supported via other timer interfaces (like clock_nanosleep, etc), which
may mean the functionality should be pushed more deeply into the hrtimer
subsystem.

So main suggestion here is to make sure you cc Thomas Gleixner on future
iterations, so he can provide some thoughts on what the best approach
might be here. I know he also has some plans that might collide with the
jiffies_to_ktime work.

Thomas: Any thought here? Should we be trying to unify the timerfd flags
and the posix timer flags (specifically things like TIMER_CANCEL_ON_SET,
which is currently timerfd-only)?  Should a deferrable flag be added to
the hrtimer core or left to the timer wheel?

thanks
-john
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/





--
Best regards,
Alexey Perevalov,
Leading software engineer,
phone: +7 (495) 797 25 00 ext 3969
e-mail: a.pereva...@samsung.com mailto:a.pereva...@samsumng.com

Mobile group, Samsung RD Institute Rus
12 Dvintsev street, building 1
127018, Moscow, Russian Federation
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v4 0/1] audit: generic compat system call support

2014-02-02 Thread AKASHI Takahiro
Arm64 supports 32-bit mode(AArch32) and 64-bit mode(AArch64).
To enable audit on arm64, we want to use lib/audit.c and re-work it
to support compat system calls as well without copying it under
arch sub-directory.

Since this patch is implemented in much the same way as on existing
bi-architectures (ie. ppc, s390, sparc and x86), it's not difficult
for them to utilize this generic code instead of their own implementation.

The code was tested on armv8 fast model with 64-bit and 32-bit userland
by using modified audit-test-code. As this patch is mandatory for my
system call audit support for arm64 patch, please review it as well
for better understanding.

Changes v2 -> v3:
* Specify AUDIT_CLASS_XYZ_32 instead of AUDIT_CLASS_XYZ when registering
  compat syscalls (bug fix)

Changes v3 -> v4:
* Add CONFIG_AUDIT_COMPAT_GENERIC to compile in compat_audit.c
* Re-define audit_is_compat() in generic way in order to eliminate
  necessity of asm/audit.h.

AKASHI Takahiro (1):
  audit: Add generic compat syscall support

 include/linux/audit.h  |8 +++
 include/uapi/linux/audit.h |6 ++
 lib/Kconfig|5 +
 lib/Makefile   |1 +
 lib/audit.c|   15 -
 lib/compat_audit.c |   50 
 6 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 lib/compat_audit.c

-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v4 1/1] audit: Add generic compat syscall support

2014-02-02 Thread AKASHI Takahiro
lib/audit.c provides a generic definition for auditing system calls.
This patch extends it for compat syscall support on bi-architectures
(32/64-bit) by adding lib/compat_audit.c.
What is required to support this feature are:
 * add asm/unistd32.h for compat system call names
 * enable CONFIG_AUDIT_COMPAT_GENERIC

Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org
---
 include/linux/audit.h  |8 +++
 include/uapi/linux/audit.h |6 ++
 lib/Kconfig|5 +
 lib/Makefile   |1 +
 lib/audit.c|   15 -
 lib/compat_audit.c |   50 
 6 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 lib/compat_audit.c

diff --git a/include/linux/audit.h b/include/linux/audit.h
index bf1ef22..b5d5cca 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -78,6 +78,14 @@ extern int is_audit_feature_set(int which);
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
+/* only for compat system calls */
+extern unsigned compat_write_class[];
+extern unsigned compat_read_class[];
+extern unsigned compat_dir_class[];
+extern unsigned compat_chattr_class[];
+extern unsigned compat_signal_class[];
+
+extern int __weak audit_classify_compat_syscall(int abi, unsigned syscall);
 
 /* audit_names-type values */
 #defineAUDIT_TYPE_UNKNOWN  0   /* we don't know yet */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 44b05a0..0a73cf3 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -355,6 +355,12 @@ enum {
 #define AUDIT_ARCH_SPARC64 (EM_SPARCV9|__AUDIT_ARCH_64BIT)
 #define AUDIT_ARCH_X86_64  (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 
+#ifdef CONFIG_COMPAT
+#define audit_is_compat(arch)  (!((arch)  __AUDIT_ARCH_64BIT))
+#else
+#define audit_is_compat(arch)  false
+#endif
+
 #define AUDIT_PERM_EXEC1
 #define AUDIT_PERM_WRITE   2
 #define AUDIT_PERM_READ4
diff --git a/lib/Kconfig b/lib/Kconfig
index 991c98b..48896db 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -182,6 +182,11 @@ config AUDIT_GENERIC
depends on AUDIT  !AUDIT_ARCH
default y
 
+config AUDIT_COMPAT_GENERIC
+   bool
+   depends on AUDIT_GENERIC  COMPAT
+   default y
+
 config RANDOM32_SELFTEST
bool PRNG perform self test on init
default n
diff --git a/lib/Makefile b/lib/Makefile
index a459c31..972552b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 obj-$(CONFIG_SMP) += percpu_counter.o
 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
+obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
 
 obj-$(CONFIG_SWIOTLB) += swiotlb.o
 obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
diff --git a/lib/audit.c b/lib/audit.c
index 76bbed4..1d726a2 100644
--- a/lib/audit.c
+++ b/lib/audit.c
@@ -30,11 +30,17 @@ static unsigned signal_class[] = {
 
 int audit_classify_arch(int arch)
 {
-   return 0;
+   if (audit_is_compat(arch))
+   return 1;
+   else
+   return 0;
 }
 
 int audit_classify_syscall(int abi, unsigned syscall)
 {
+   if (audit_is_compat(abi))
+   return audit_classify_compat_syscall(abi, syscall);
+
switch(syscall) {
 #ifdef __NR_open
case __NR_open:
@@ -57,6 +63,13 @@ int audit_classify_syscall(int abi, unsigned syscall)
 
 static int __init audit_classes_init(void)
 {
+#ifdef CONFIG_AUDIT_COMPAT_GENERIC
+   audit_register_class(AUDIT_CLASS_WRITE_32, compat_write_class);
+   audit_register_class(AUDIT_CLASS_READ_32, compat_read_class);
+   audit_register_class(AUDIT_CLASS_DIR_WRITE_32, compat_dir_class);
+   audit_register_class(AUDIT_CLASS_CHATTR_32, compat_chattr_class);
+   audit_register_class(AUDIT_CLASS_SIGNAL_32, compat_signal_class);
+#endif
audit_register_class(AUDIT_CLASS_WRITE, write_class);
audit_register_class(AUDIT_CLASS_READ, read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
diff --git a/lib/compat_audit.c b/lib/compat_audit.c
new file mode 100644
index 000..873f75b
--- /dev/null
+++ b/lib/compat_audit.c
@@ -0,0 +1,50 @@
+#include linux/init.h
+#include linux/types.h
+#include asm/unistd32.h
+
+unsigned compat_dir_class[] = {
+#include asm-generic/audit_dir_write.h
+~0U
+};
+
+unsigned compat_read_class[] = {
+#include asm-generic/audit_read.h
+~0U
+};
+
+unsigned compat_write_class[] = {
+#include asm-generic/audit_write.h
+~0U
+};
+
+unsigned compat_chattr_class[] = {
+#include asm-generic/audit_change_attr.h
+~0U
+};
+
+unsigned compat_signal_class[] = {
+#include asm-generic/audit_signal.h
+~0U
+};
+
+int audit_classify_compat_syscall(int abi, unsigned syscall)
+{
+   switch (syscall) {
+#ifdef __NR_open

[PATCH v3 0/3] arm64: Add audit support

2014-02-02 Thread AKASHI Takahiro
This patchset adds system call audit support on arm64.
Both 32-bit (AUDIT_ARCH_ARM) and 64-bit tasks (AUDIT_ARCH_AARCH64)
are supported. Since arm64 has the exact same set of system calls
on LE and BE, we don't care about endianness (or more specifically
__AUDIT_ARCH_64BIT bit in AUDIT_ARCH_*).

There are some prerequisites for this patch to work correctly:
* generic compat system call audit support patch
* correct a type mismatch in audit_syscall_exit() patch
   (already accepted and queued in 3.14)
* Modify a set of system calls in audit class patch
   (already accepted and queued in 3.14)
* __NR_* definitions for compat syscalls patch from Catalin
* userspace audit tool (v2.3.2 + my patch for arm64)

Please review them as well for better understanding.

This code was tested on both 32-bit and 64-bit LE userland 
in the following two ways:
1) basic operations with auditctl/autrace
  # auditctl -a exit,always -S openat -F path=/etc/inittab
  # auditctl -a exit,always -F dir=/tmp -F perm=rw
  # auditctl -a task,always
  # autrace /bin/ls
by comparing output from autrace with one from strace

2) audit-test-code (+ my workarounds for arm/arm64)
  by running audit-tool, filter and syscalls test categories.

Changes v1 -> v2:
* Modified to utilize generic compat system call audit [3/6, 4/6, 5/6]
  Please note that a required header, unistd_32.h, is automatically
  generated from unistd32.h.
* Refer to regs->orig_x0 instead of regs->x0 as the first argument of
  system call in audit_syscall_entry() [6/6]
* Include Add regs_return_value() in syscall.h patch [2/6],
  which was not intentionally included in v1 because it could be added
  by kprobes support.

Changes v2 -> v3:
* Remove asm/audit.h.
  See generic compat syscall audit support patch v4
* Remove endianness dependency, ie. AUDIT_ARCH_ARMEB/AARCH64EB.
* Remove kernel/syscalls/Makefile which was used to create unistd32.h.
  See Catalin's Add __NR_* definitions for compat syscalls patch

AKASHI Takahiro (3):
  arm64: Add regs_return_value() in syscall.h
  arm64: Add audit support
  arm64: audit: Add audit hook in ptrace/syscall_trace

 arch/arm64/Kconfig   |1 +
 arch/arm64/include/asm/ptrace.h  |5 +
 arch/arm64/include/asm/syscall.h |   15 +++
 arch/arm64/include/asm/thread_info.h |1 +
 arch/arm64/kernel/entry.S|3 +++
 arch/arm64/kernel/ptrace.c   |   10 ++
 include/uapi/linux/audit.h   |1 +
 7 files changed, 36 insertions(+)

-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/3] arm64: Add regs_return_value() in syscall.h

2014-02-02 Thread AKASHI Takahiro
This macro, regs_return_value, is used mainly for audit to record system
call's results, but may also be used in test_kprobes.c.

Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org
---
 arch/arm64/include/asm/ptrace.h |5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 0e7fa49..5800ec1 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -134,6 +134,11 @@ struct pt_regs {
 #define user_stack_pointer(regs) \
((regs)-sp)
 
+static inline unsigned long regs_return_value(struct pt_regs *regs)
+{
+   return regs-regs[0];
+}
+
 /*
  * Are the current registers suitable for user mode? (used to maintain
  * security in signal handlers)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] Make math_state_restore() save and restore the interrupt flag

2014-02-02 Thread Suresh Siddha
On Sun, 2014-02-02 at 11:15 -0800, Linus Torvalds wrote:
 On Sat, Feb 1, 2014 at 11:19 PM, Suresh Siddha sbsid...@gmail.com wrote:
 
  The real fix for Nate's problem will be coming from Linus, with a
  slightly modified option-b that Linus proposed. Linus, please let me
  know if you want me to spin it. I can do it sunday night.
 
 Please do it, since clearly I wasn't aware enough about the whole
 non-TS-checking FPU state details.
 
 Also, since this issue doesn't seem to be a recent regression, I'm not
 going to take this patch directly (even though I'm planning on doing
 -rc1 in a few hours), and expect that I'll get it through the normal
 channels (presumably together with the __kernel_fpu_end cleanups). Ok
 with everybody?

Here is the second patch, which should fix the issue reported in this
thread. Maarten, Nate, George, please give this patch a try as is and
see if it helps address the issue you ran into. And please ack/review
with your test results.

The other patch, which cleans up the irq_enable/disable logic in
math_state_restore(), was sent yesterday. You can run your
experiments with both these patches if you want. But your issue should
get fixed with just the appended patch here.

Peter, Please push both these patches through normal channels depending
on the results.

thanks,
suresh
---
From: Suresh Siddha sbsid...@gmail.com
Subject: x86, fpu: check tsk_used_math() in kernel_fpu_end() for eager fpu

For non-eager fpu mode, thread's fpu state is allocated during the first
fpu usage (in the context of device not available exception). This
(math_state_restore()) can be a blocking call and hence we enable
interrupts (which were originally disabled when the exception happened),
allocate memory and disable interrupts etc.

But the eager-fpu mode calls the same math_state_restore() from
kernel_fpu_end(), the assumption being that tsk_used_math() is always
set for the eager-fpu mode, thus avoiding the code path of enabling
interrupts, allocating fpu state using a blocking call, disabling
interrupts etc.

But the below issue was noticed by Maarten Baert, Nate Eldredge and
few others:

If a user process dumps core on an ecrypt fs while aesni-intel is loaded,
we get a BUG() in __find_get_block() complaining that it was called with
interrupts disabled; then all further accesses to our ecrypt fs hang
and we have to reboot.

The aesni-intel code (encrypting the core file that we are writing) needs
the FPU and quite properly wraps its code in kernel_fpu_{begin,end}(),
the latter of which calls math_state_restore(). So after kernel_fpu_end(),
interrupts may be disabled, which nobody seems to expect, and they stay
that way until we eventually get to __find_get_block() which barfs.

For eager fpu, most of the time, tsk_used_math() is true. In a few instances,
such as during thread exit, signal return handling etc., tsk_used_math() might
be false.

In kernel_fpu_end(), for eager-fpu, call math_state_restore()
only if tsk_used_math() is set. Otherwise, don't bother. Kernel code
path which cleared tsk_used_math() knows what needs to be done
with the fpu state.

Reported-by: Maarten Baert maarten-ba...@hotmail.com
Reported-by: Nate Eldredge n...@thatsmathematics.com
Suggested-by: Linus Torvalds torva...@linux-foundation.org
Signed-off-by: Suresh Siddha sbsid...@gmail.com
Cc: George Spelvin li...@horizon.com
---
 arch/x86/kernel/i387.c | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 4e5f770..670bba1 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -87,10 +87,19 @@ EXPORT_SYMBOL(__kernel_fpu_begin);
 
 void __kernel_fpu_end(void)
 {
-   if (use_eager_fpu())
-   math_state_restore();
-   else
+   if (use_eager_fpu()) {
+   /*
+* For eager fpu, most the time, tsk_used_math() is true.
+* Restore the user math as we are done with the kernel usage.
+* At few instances during thread exit, signal handling etc,
+* tsk_used_math() is false. Those few places will take proper
+* actions, so we don't need to restore the math here.
+*/
+   if (likely(tsk_used_math(current)))
+   math_state_restore();
+   } else {
stts();
+   }
 }
 EXPORT_SYMBOL(__kernel_fpu_end);
 


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 2/3] arm64: Add audit support

2014-02-02 Thread AKASHI Takahiro
On AArch64, audit is supported through generic lib/audit.c and
compat_audit.c, and so this patch adds arch specific definitions required.

Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org
---
 arch/arm64/Kconfig   |1 +
 arch/arm64/include/asm/syscall.h |   15 +++
 include/uapi/linux/audit.h   |1 +
 3 files changed, 17 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6d4dd22..3c21405 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -19,6 +19,7 @@ config ARM64
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select HARDIRQS_SW_RESEND
+   select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_TRACEHOOK
select HAVE_DEBUG_BUGVERBOSE
select HAVE_DEBUG_KMEMLEAK
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
index 70ba9d4..6900183 100644
--- a/arch/arm64/include/asm/syscall.h
+++ b/arch/arm64/include/asm/syscall.h
@@ -16,7 +16,9 @@
 #ifndef __ASM_SYSCALL_H
 #define __ASM_SYSCALL_H
 
+#include linux/audit.h
 #include linux/err.h
+#include asm/compat.h
 
 
 static inline int syscall_get_nr(struct task_struct *task,
@@ -104,4 +106,17 @@ static inline void syscall_set_arguments(struct 
task_struct *task,
memcpy(regs-regs[i], args, n * sizeof(args[0]));
 }
 
+/*
+ * We don't care about endianness (__AUDIT_ARCH_LE bit) here because
+ * AArch64 has the same system calls both on little- and big- endian.
+ */
+static inline int syscall_get_arch(struct task_struct *task,
+  struct pt_regs *regs)
+{
+   if (is_compat_thread(task_thread_info(task)))
+   return AUDIT_ARCH_ARM;
+
+   return AUDIT_ARCH_AARCH64;
+}
+
 #endif /* __ASM_SYSCALL_H */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 0a73cf3..cf27cae 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -327,6 +327,7 @@ enum {
 /* distinguish syscall tables */
 #define __AUDIT_ARCH_64BIT 0x8000
 #define __AUDIT_ARCH_LE   0x4000
+#define AUDIT_ARCH_AARCH64 (EM_AARCH64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_ALPHA   (EM_ALPHA|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_ARM (EM_ARM|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_ARMEB   (EM_ARM)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 3/3] arm64: audit: Add audit hook in ptrace/syscall_trace

2014-02-02 Thread AKASHI Takahiro
This patch adds auditing functions on entry to or exit from
every system call invocation.

Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org
---
 arch/arm64/include/asm/thread_info.h |1 +
 arch/arm64/kernel/entry.S|3 +++
 arch/arm64/kernel/ptrace.c   |   10 ++
 3 files changed, 14 insertions(+)

diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index 720e70b..7468388 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -101,6 +101,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NEED_RESCHED   1
 #define TIF_NOTIFY_RESUME  2   /* callback before returning to user */
 #define TIF_SYSCALL_TRACE  8
+#define TIF_SYSCALL_AUDIT  9
 #define TIF_POLLING_NRFLAG 16
 #define TIF_MEMDIE 18  /* is terminating due to OOM killer */
 #define TIF_FREEZE 19
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 827cbad..83c4b29 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -630,6 +630,9 @@ el0_svc_naked:  // 
compat entry point
get_thread_info tsk
ldr x16, [tsk, #TI_FLAGS]   // check for syscall tracing
tbnzx16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls?
+#ifdef CONFIG_AUDITSYSCALL
+   tbnzx16, #TIF_SYSCALL_AUDIT, __sys_trace // auditing syscalls?
+#endif
adr lr, ret_fast_syscall// return address
cmp scno, sc_nr // check upper syscall limit
b.hsni_sys
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 6777a21..75a3f23 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -19,6 +19,7 @@
  * along with this program.  If not, see http://www.gnu.org/licenses/.
  */
 
+#include linux/audit.h
 #include linux/kernel.h
 #include linux/sched.h
 #include linux/mm.h
@@ -38,6 +39,7 @@
 #include asm/compat.h
 #include asm/debug-monitors.h
 #include asm/pgtable.h
+#include asm/syscall.h
 #include asm/traps.h
 #include asm/system_misc.h
 
@@ -1064,6 +1066,14 @@ asmlinkage int syscall_trace(int dir, struct pt_regs 
*regs)
 {
unsigned long saved_reg;
 
+   if (dir)
+   audit_syscall_exit(regs);
+   else
+   audit_syscall_entry(syscall_get_arch(current, regs),
+   (int)regs-syscallno,
+   regs-orig_x0, regs-regs[1],
+   regs-regs[2], regs-regs[3]);
+
if (!test_thread_flag(TIF_SYSCALL_TRACE))
return regs-syscallno;
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] memcg: export kmemcg cache id via cgroup fs

2014-02-02 Thread Vladimir Davydov
On 02/03/2014 10:21 AM, David Rientjes wrote:
 On Sun, 2 Feb 2014, Vladimir Davydov wrote:

 Per-memcg kmem caches are named as follows:

   global-cache-name(cgroup-kmem-id:cgroup-name)

 where cgroup-kmem-id is the unique id of the memcg the cache belongs
 to, cgroup-name is the relative name of the memcg on the cgroup fs.
 Cache names are exposed to userspace for debugging purposes (e.g. via
 sysfs in case of slub or via dmesg).

 Using relative names makes it impossible in general (in case the cgroup
 hierarchy is not flat) to find out which memcg a particular cache
 belongs to, because cgroup-kmem-id is not known to the user. Since
 using absolute cgroup names would be an overkill, let's fix this by
 exporting the id of kmem-active memcg via cgroup fs file
 memory.kmem.id.

 Hmm, I'm not sure exporting additional information is the best way to do 
 it only for this purpose.  I do understand the problem in naming 
 collisions if the hierarchy isn't flat and we typically work around that 
 by ensuring child memcgs still have a unique memcg.  This isn't only a 
 problem in slab cache naming, we also avoid printing the entire absolute 
 names for things like the oom killer.

AFAIU, cgroup identifiers dumped on oom (cgroup paths, currently) and
memcg slab cache names serve for different purposes. The point is oom is
a perfectly normal situation for the kernel, and info dumped to dmesg is
for admin to find out the cause of the problem (a greedy user or
cgroup). On the other hand, slab cache names are dumped to dmesg only on
extraordinary situations - like bugs in slab implementation, or double
free, or detected memory leaks - where we usually do not need the name
of the memcg that triggered the problem, because the bug is likely to be
in the kernel subsys using the cache. Plus, the names are exported to
sysfs in case of slub, again for debugging purposes, AFAIK. So IMO the
use cases for oom vs slab names are completely different - information
vs debugging - and I want to export kmem.id only for the ability of
debugging kmemcg and slab subsystems.

 So it would be nice to have 
 consensus on how people are supposed to identify memcgs with a hierarchy: 
 either by exporting information like the id like you do here (but leave 
 the oom killer still problematic) or by insisting people name their memcgs 
 with unique names if they care to differentiate them.

Anyway, I agree with you that this needs a consensus, because this is a
functional change.

Thanks.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 10/10] watchdog: xilinx: Enable this driver for Zynq

2014-02-02 Thread Michal Simek
On 01/31/2014 03:52 PM, Guenter Roeck wrote:
 On 01/31/2014 06:18 AM, Michal Simek wrote:
 Enable this driver for Zynq.
 Move it to architecture independent Kconfig part.

 Signed-off-by: Michal Simek michal.si...@xilinx.com
 ---

 Build tested by zero day testing system.
 ---
   drivers/watchdog/Kconfig | 22 +-
   1 file changed, 9 insertions(+), 13 deletions(-)

 diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
 index 9db5d3c..6120403 100644
 --- a/drivers/watchdog/Kconfig
 +++ b/drivers/watchdog/Kconfig
 @@ -111,6 +111,15 @@ config WM8350_WATCHDOG
 Support for the watchdog in the WM8350 AudioPlus PMIC.  When
 the watchdog triggers the system will be reset.

 +config XILINX_WATCHDOG
 +tristate Xilinx Watchdog timer
 +select WATCHDOG_CORE
 
 This needs to depend on HAS_IOMEM.

Are you sure?
I have no problem to do this change.
Zero day testing system doesn't report any problem with it.

I have checked the dependencies, and only score, tile and um have the NO_IOMEM
option enabled. The log below includes tile with allyesconfig, which is why I
believe this driver has also been tested without any issue.

Thanks,
Michal


git://git.monstr.eu/linux-2.6-microblaze  xnext/watchdog
f7bdfada576e93eaab8f6dc2ecd881da8f43911c  watchdog: xilinx: Enable this driver 
for Zynq

elapsed time: 81m

configs tested: 122

alpha   defconfig
pariscallnoconfig
parisc b180_defconfig
pariscc3000_defconfig
parisc  defconfig
arm   allnoconfig
arm   almodconfig
arm at91_dt_defconfig
arm   imx_v6_v7_defconfig
arm  marzen_defconfig
arm   omap2plus_defconfig
arm  prima2_defconfig
arm s3c2410_defconfig
arm   spear13xx_defconfig
arm   tegra_defconfig
m32r   m32104ut_defconfig
m32r mappi3.smp_defconfig
m32r opsput_defconfig
m32r   usrv_defconfig
xtensa   common_defconfig
xtensa  iss_defconfig
x86_64allnoconfig
shallnoconfig
sh  rsk7269_defconfig
sh  sh7785lcr_32bit_defconfig
shtitan_defconfig
x86_64 randconfig-c0-0131
x86_64 randconfig-c1-0131
x86_64 randconfig-c2-0131
x86_64 randconfig-c3-0131
x86_64 randconfig-c4-0131
x86_64 randconfig-c5-0131
x86_64 randconfig-c6-0131
x86_64 randconfig-c7-0131
x86_64 randconfig-c8-0131
x86_64 randconfig-c9-0131
x86_64   allyesconfig
alphaallyesconfig
avr32allyesconfig
blackfin allyesconfig
cris allyesconfig
ia64 allyesconfig
m68k allyesconfig
mips allyesconfig
parisc   allyesconfig
powerpc  allyesconfig
s390 allyesconfig
sh   allyesconfig
sparcallyesconfig
sparc64  allyesconfig
tile allyesconfig
xtensa   allyesconfig
ia64 alldefconfig
ia64 allmodconfig
ia64  allnoconfig
ia64defconfig
x86_64lkp
powerpc  chroma_defconfig
powerpc   corenet64_smp_defconfig
powerpcgamecube_defconfig
powerpc linkstation_defconfig
powerpc wii_defconfig
x86_64 randconfig-j0-0131
x86_64 randconfig-j1-0131
x86_64 randconfig-j2-0131
x86_64 randconfig-j3-0131
x86_64 randconfig-j4-0131
x86_64 randconfig-j5-0131
m68k allmodconfig
m68k  amiga_defconfig
m68k   m5475evb_defconfig
m68k  multi_defconfig
blackfinBF526-EZBRD_defconfig
blackfinBF533-EZKIT_defconfig
blackfinBF561-EZKIT-SMP_defconfig
blackfin  TCM-BF537_defconfig
cris etrax-100lx_v2_defconfig
i386   randconfig-r0-0131
i386   

Re: [PATCH] ipv6: default route for link local address is not added while assigning a address

2014-02-02 Thread Sohny Thomas

On Wednesday 29 January 2014 04:08 PM, Nicolas Dichtel wrote:

Le 29/01/2014 07:41, Sohny Thomas a écrit :

Resending this on netdev mailing list:
Default route for link local address is configured automatically if
NETWORKING_IPV6=yes is in ifcfg-eth*.
When the route table for the interface is flushed and a new address is
added to
the same device without removing the link-local address, the default route
for the link-local
address has to be added by default.

I have found the issue to be caused by this checkin

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/net/ipv6?id=62b54dd91567686a1cb118f76a72d5f4764a86dd



According to this change :
He removes adding a link local route if any other address is added ,
applicable
across all interfaces though there's mentioned only lo interface
So below patch fixes for other devices

Signed-off-by: Sohny THomas sohth...@linux.vnet.ibm.com

Your email client has corrupted the patch, it cannot be applied.
Please read Documentation/email-clients.txt

Sorry about that. Will resend again


About the patch, I still think that the flush is too aggressive. Link local
routes are marked as 'proto kernel', removing them without the link local
address is wrong.

With this patch, you will add a link local route even if you don't have
a link local address.
I think it wouldn't hurt to have a link-local route for NDP in case
the routes become unreachable


-Regards,
Sohny






--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Linux 3.14-rc1 is out

2014-02-02 Thread Rafał Miłecki
Is that OK/wanted to note a possibly wide regression as Reply-To in the 
announce thread?


If your USB 3.0 stopped working with 3.14-rc1, please note it's already 
tracked regression reported in:
xhci regression since xhci: replace xhci_write_64() with writeq() - 
devices not detected


http://www.spinics.net/lists/linux-usb/msg101628.html
http://comments.gmane.org/gmane.linux.usb.general/102295
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] edac/85xx: Remove deprecated IRQF_DISABLED

2014-02-02 Thread Johannes Thumshirn
On Tue, Jan 21, 2014 at 09:42:27AM +0100, Johannes Thumshirn wrote:
 Remove IRQF_DISABLED as it is a NOOP.

 Signed-off-by: Johannes Thumshirn johannes.thumsh...@men.de
 ---
  drivers/edac/mpc85xx_edac.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

 diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
 index 8f918217..f4aec2e 100644
 --- a/drivers/edac/mpc85xx_edac.c
 +++ b/drivers/edac/mpc85xx_edac.c
 @@ -357,7 +357,7 @@ int mpc85xx_pci_err_probe(struct platform_device *op)
   pdata-irq = irq_of_parse_and_map(op-dev.of_node, 0);
   res = devm_request_irq(op-dev, pdata-irq,
  mpc85xx_pci_isr,
 -IRQF_DISABLED | IRQF_SHARED,
 +IRQF_SHARED,
  [EDAC] PCI err, pci);
   if (res  0) {
   printk(KERN_ERR
 @@ -633,7 +633,7 @@ static int mpc85xx_l2_err_probe(struct platform_device 
 *op)
   if (edac_op_state == EDAC_OPSTATE_INT) {
   pdata-irq = irq_of_parse_and_map(op-dev.of_node, 0);
   res = devm_request_irq(op-dev, pdata-irq,
 -mpc85xx_l2_isr, IRQF_DISABLED,
 +mpc85xx_l2_isr, 0,
  [EDAC] L2 err, edac_dev);
   if (res  0) {
   printk(KERN_ERR
 @@ -1133,7 +1133,7 @@ static int mpc85xx_mc_err_probe(struct platform_device 
 *op)
   pdata-irq = irq_of_parse_and_map(op-dev.of_node, 0);
   res = devm_request_irq(op-dev, pdata-irq,
  mpc85xx_mc_isr,
 - IRQF_DISABLED | IRQF_SHARED,
 +IRQF_SHARED,
  [EDAC] MC err, mci);
   if (res  0) {
   printk(KERN_ERR %s: Unable to request irq %d for 
 --
 1.8.5.2


Boris, Mauro:

Ping?

Johannes
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ipv6: default route for link local address is not added while assigning a address

2014-02-02 Thread Sohny Thomas



Actually I am not so sure, there is no defined semantic of flush. I would
be ok with all three solutions: leave it as is, always add link-local
address (it does not matter if we don't have a link-local address on
that interface, as a global scoped one is just fine enough) or make flush not
remove the link-local address (but this seems a bit too special cased for me).


1) In case if we leave it as it is, there is rfc 6724 rule 2 to be 
considered ( previously rfc 3484)


Rule 2: Prefer appropriate scope.
   If Scope(SA) < Scope(SB): If Scope(SA) < Scope(D), then prefer SB and
   otherwise prefer SA.  Similarly, if Scope(SB) < Scope(SA): If
   Scope(SB) < Scope(D), then prefer SA and otherwise prefer SB.

Test:

   Destination: fe80::2(LS)
Candidate Source Addresses: 3ffe::1(GS) or fec0::1(SS) or LLA(LS)
Result: LLA(LS)
Scope(LLA) < Scope(fec0::1): If Scope(LLA) < Scope(fe80::2),  no, 
prefer LLA
Scope(LLA) < Scope(3ffe::1): If Scope(LLA) < Scope(fe80::2),  no, 
prefer LLA



Now the above test fails since the route itself is not present, and the 
test assumes that the route gets added since the LLA is not removed 
during the test


2) having a LLA always helps in NDP i think

3) making flush not remove link-local address will be changing 
functionality of ip flush command


Regards,
Sohny



Greetings,

   Hannes





--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] memcg: export kmemcg cache id via cgroup fs

2014-02-02 Thread Vladimir Davydov
[adding Johannes Weiner and Hugh Dickins to cc in case they have
something to object against this]

On 02/03/2014 10:57 AM, Vladimir Davydov wrote:
 On 02/03/2014 10:21 AM, David Rientjes wrote:
 On Sun, 2 Feb 2014, Vladimir Davydov wrote:

 Per-memcg kmem caches are named as follows:

   global-cache-name(cgroup-kmem-id:cgroup-name)

 where cgroup-kmem-id is the unique id of the memcg the cache belongs
 to, cgroup-name is the relative name of the memcg on the cgroup fs.
 Cache names are exposed to userspace for debugging purposes (e.g. via
 sysfs in case of slub or via dmesg).

 Using relative names makes it impossible in general (in case the cgroup
 hierarchy is not flat) to find out which memcg a particular cache
 belongs to, because cgroup-kmem-id is not known to the user. Since
 using absolute cgroup names would be an overkill, let's fix this by
 exporting the id of kmem-active memcg via cgroup fs file
 memory.kmem.id.

 Hmm, I'm not sure exporting additional information is the best way to do 
 it only for this purpose.  I do understand the problem in naming 
 collisions if the hierarchy isn't flat and we typically work around that 
 by ensuring child memcgs still have a unique memcg.  This isn't only a 
 problem in slab cache naming, we also avoid printing the entire absolute 
 names for things like the oom killer.
 AFAIU, cgroup identifiers dumped on oom (cgroup paths, currently) and
 memcg slab cache names serve for different purposes. The point is oom is
 a perfectly normal situation for the kernel, and info dumped to dmesg is
 for admin to find out the cause of the problem (a greedy user or
 cgroup). On the other hand, slab cache names are dumped to dmesg only on
 extraordinary situations - like bugs in slab implementation, or double
 free, or detected memory leaks - where we usually do not need the name
 of the memcg that triggered the problem, because the bug is likely to be
 in the kernel subsys using the cache. Plus, the names are exported to
 sysfs in case of slub, again for debugging purposes, AFAIK. So IMO the
 use cases for oom vs slab names are completely different - information
 vs debugging - and I want to export kmem.id only for the ability of
 debugging kmemcg and slab subsystems.

 So it would be nice to have 
 consensus on how people are supposed to identify memcgs with a hierarchy: 
 either by exporting information like the id like you do here (but leave 
 the oom killer still problematic) or by insisting people name their memcgs 
 with unique names if they care to differentiate them.
 Anyway, I agree with you that this needs a consensus, because this is a
 functional change.

 Thanks.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH] powerpc: add ioremap_wt

2014-02-02 Thread Michael Moese
Allow for IO memory to be mapped cacheable for performing
PCI read bursts.

Signed-off-by: Michael Moese michael.mo...@men.de
---
 arch/powerpc/include/asm/io.h | 3 +++
 arch/powerpc/mm/pgtable_32.c  | 8 
 2 files changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 45698d5..9591fff 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -631,6 +631,8 @@ static inline void iosync(void)
  *
  * * ioremap_wc enables write combining
  *
+ * * ioremap_wt enables write thru
+ *
  * * iounmap undoes such a mapping and can be hooked
  *
  * * __ioremap_at (and the pending __iounmap_at) are low level functions to
@@ -652,6 +654,7 @@ extern void __iomem *ioremap(phys_addr_t address, unsigned 
long size);
 extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
  unsigned long flags);
 extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
+extern void __iomem *ioremap_wt(phys_addr_t address, unsigned long size);
 #define ioremap_nocache(addr, size)ioremap((addr), (size))
 
 extern void iounmap(volatile void __iomem *addr);
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 51f8795..9ab0a54 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -141,6 +141,14 @@ ioremap_wc(phys_addr_t addr, unsigned long size)
 EXPORT_SYMBOL(ioremap_wc);
 
 void __iomem *
+ioremap_wt(phys_addr_t addr, unsigned long size)
+{
+   return __ioremap_caller(addr, size, _PAGE_WRITETHRU,
+   __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
+void __iomem *
 ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
 {
/* writeable implies dirty for kernel addresses */
-- 
1.8.5.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH -tip v2 1/8] [BUGFIX] perf-probe: Fix to do exit call for symbol maps

2014-02-02 Thread Namhyung Kim
Hi Masami,

On Wed, 29 Jan 2014 09:14:52 +, Masami Hiramatsu wrote:
 Some perf-probe commands do symbol_init() but doesn't
 do exit call. This fixes that to call symbol_exit()
 and release machine if needed.
 This also merges init_vmlinux() and init_user_exec()
 because both of them are doing similar things.
 (init_user_exec() just skips init vmlinux related
  symbol maps)

 Signed-off-by: Masami Hiramatsu masami.hiramatsu...@hitachi.com
 ---
  tools/perf/util/probe-event.c |  110 
 +++--
  1 file changed, 61 insertions(+), 49 deletions(-)

 diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
 index a8a9b6c..14c649df 100644
 --- a/tools/perf/util/probe-event.c
 +++ b/tools/perf/util/probe-event.c
 @@ -73,31 +73,35 @@ static char *synthesize_perf_probe_point(struct 
 perf_probe_point *pp);
  static int convert_name_to_addr(struct perf_probe_event *pev,
   const char *exec);
  static void clear_probe_trace_event(struct probe_trace_event *tev);
 -static struct machine machine;
 +static struct machine *host_machine;
  
  /* Initialize symbol maps and path of vmlinux/modules */
 -static int init_vmlinux(void)
 +static int init_symbol_maps(bool user_only)
  {
   int ret;
  
   symbol_conf.sort_by_name = true;
 - if (symbol_conf.vmlinux_name == NULL)
 - symbol_conf.try_vmlinux_path = true;
 - else
 - pr_debug(Use vmlinux: %s\n, symbol_conf.vmlinux_name);
 + if (user_only)
 + symbol_conf.try_vmlinux_path = false;
 + else {
 + if (symbol_conf.vmlinux_name == NULL)
 + symbol_conf.try_vmlinux_path = true;

This looks unnecessary and duplicate since we already have following
code in __cmd_probe().

/*
 * Only consider the user's kernel image path if given.
 */
symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);

Thanks,
Namhyung


 + else
 + pr_debug(Use vmlinux: %s\n, symbol_conf.vmlinux_name);
 + }
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 7/8] Add 32 bit VDSO time support for 32 bit kernel

2014-02-02 Thread Stefani Seibold
Am Sonntag, den 02.02.2014, 16:12 -0800 schrieb Andy Lutomirski:
 On Sun, Feb 2, 2014 at 1:39 PM, Stefani Seibold stef...@seibold.net wrote:
  Am Sonntag, den 02.02.2014, 08:46 -0800 schrieb Andy Lutomirski:
  On Sun, Feb 2, 2014 at 3:27 AM,  stef...@seibold.net wrote:
   From: Stefani Seibold stef...@seibold.net
  
   This patch add the time support for 32 bit a VDSO to a 32 bit kernel.
 
  [...]
 
  Can you address the review comments from last time around?  For
  example, this still seems to have redundant vvar and hpet mappings, it
  doesn't use the VVAR macro, it moves the 32-bit compat vDSO, etc.
 
 
  I will address the compat VDSO issue.
 
  But the VVAR macro will be not a part of this patch set. If you depend
  on this, feel free to create one. From my point of view this is not
  feasible without a macro hacking, because the address accessing the vvar
  area differs in kernel and VDSO user mode.
 
 Sorry, but "I will make the code messier for no apparent reason and I
 will not offer to fix it in the same series" gets my NAK.
 
 Hint: I'm talking about two or three lines of code in vvar.h.
 

A hint back: if you threaten me with a NAK over a requested code sequence
which currently has no user, this is far away from professional. I am not
your trainee.

BTW: If it is so easy, send me the two or three lines and i will merge
it ;-)

 
  I also see no redundant mapping. There are two modes, one is the map of
  the kernel area the other maps the VDSO into the user space area. This
  is exactly the behaviour of the origin VDSO implementation.
 
 No.
 
 In your series there are *three* mappings.  There are:
 
  - The linear mapping that the kernel loader sets up (the writable
 mapping used in the kernel).  This is implicit and, of course, fine.
  - There's the fixmap page, which aliases the normal kernel mapping at
 a fixed address with the user, ro, and nx attributes.  The 64-bit vDSO
 uses that mapping.  See vdso.h -- it's all arranged pretty clearly.
 Your code, for no discernible reason, sets up a fixmap entry on
 *32-bit* kernels.
  - The vma that you're setting up adjacent to the actual vdso text.
 This is what you are using.
 
 Please choose *one* user-readable mapping for the 32-bit vdso and
 stick with it.  If the 64-bit vdso can use it too and userspace doesn't
 break, even better.  But a pointless set of extra fixmap entries is
 not okay.
 

Again: I wrote that there are two modes for a 32 bit kernel and
therefore there are two mappings at the same time. Since there are both
ways available in a 32 bit kernel via the vdso32= kernel parameter, both
must be supported.

Due to the lack of a real fixmap for a 32 bit kernel (FIXADDR_TOP is a
variable), the HPET and VVAR pages can only be addressed relatively. So these
pages must be located before or after the VDSO.

This is why i need to setup this pages into the fixmap area, this is the
compat mode vdso32=2.

For vdso32=1 i need to map the VDSO Page together with the HPET and
VVAR into the user space.

For compatibility reasons both mappings are required.

There is only one binary for the VDSO page, regardless of the vdso=
kernel parameter and this code can only do a relative addressing.

A 64 bit kernel can do it in an other way, because there is a real
fixmap area, so this special handling is not needed.

- Stefani


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/7] improve robustness on handling migratetype

2014-02-02 Thread Joonsoo Kim
On Wed, Jan 29, 2014 at 05:52:41PM +0100, Vlastimil Babka wrote:
 On 01/10/2014 09:48 AM, Joonsoo Kim wrote:
 On Thu, Jan 09, 2014 at 09:27:20AM +, Mel Gorman wrote:
 On Thu, Jan 09, 2014 at 04:04:40PM +0900, Joonsoo Kim wrote:
 Hello,
 
 I found some weaknesses on handling migratetype during code review and
 testing CMA.
 
 First, we don't have any synchronization method on get/set pageblock
 migratetype. When we change migratetype, we hold the zone lock. So
 writer-writer race doesn't exist. But while someone changes migratetype,
 others can get migratetype. This may introduce totally unintended value
 as migratetype. Although I haven't heard of any problem report about
 that, it is better to protect properly.
 
 
 This is deliberate. The migratetypes for the majority of users are advisory
 and aimed for fragmentation avoidance. It was important that the cost of
 that be kept as low as possible and the general case is that migration types
 change very rarely. In many cases, the zone lock is held. In other cases,
 such as splitting free pages, the cost is simply not justified.
 
 I doubt there is any amount of data you could add in support that would
 justify hammering the free fast paths (which call get_pageblock_type).
 
 Hello, Mel.
 
 There is a possibility that we can get unintended value such as 6 as 
 migratetype
 if reader-writer (get/set pageblock_migratetype) race happens. It can be
 possible, because we read the value without any synchronization method. And
 this migratetype, 6, has no place in buddy freelist, so array index overrun 
 can
 be possible and the system can break, although I haven't heard that it 
 occurs.
 
 Hello,
 
 it seems this can indeed happen. I'm working on memory compaction
 improvements and in a prototype patch, I'm basically adding calls of
 start_isolate_page_range() undo_isolate_page_range() some functions
 under compact_zone(). With this I've seen occurrences of NULL
 pointers in move_freepages(), free_one_page() in places where
 free_list[migratetype] is manipulated by e.g. list_move(). That lead
 me to question the value of migratetype and I found this thread.
 Adding some debugging in get_pageblock_migratetype() and voila, I
 get a value of 6 being read.
 
 So is it just my patch adding a dangerous situation, or does it exist in
 mainline as well? By looking at free_one_page(), it uses zone-lock, but
 get_pageblock_migratetype() is called by its callers
 (free_hot_cold_page() or __free_pages_ok()) outside of the lock.
 This determined migratetype is then used under free_one_page() to
 access a free_list.
 
 It seems that this could race with set_pageblock_migratetype()
 called from try_to_steal_freepages() (despite the latter being
 properly locked). There are also other callers but those seem to be
 either limited to initialization and isolation, which should be rare
 (?).
 However, try_to_steal_freepages can occur repeatedly.
 So I assume that the race happens but never manifests as a fatal
 error as long as MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE and
 MIGRATE_MOVABLE
 values are used. Only MIGRATE_CMA and MIGRATE_ISOLATE have values
 with bit 4 enabled and can thus result in invalid values due to
 non-atomic access.
 
 Does that make sense to you and should we thus proceed with patching
 this race?
 

Hello,

This race is possible without your prototype patch, however, on very low
probability. Some codes related to memory failure use set_migratetype_isolate()
which could result in this race.

Although it may be very rare case and not critical, it is better to fix
this race. I prefer that we don't depend on luck. :)

Mel's suggestion looks good to me. Do you have another idea?

Thanks.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v10 6/8] cleanup __vdso_gettimeofday

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

This patch does a little cleanup of the __vdso_gettimeofday() function.

It removes an unneeded ret local variable and makes the code faster if
only the timezone is needed.

Signed-off-by: Stefani Seibold stef...@seibold.net
---
 arch/x86/vdso/vclock_gettime.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 743f277..bf969a0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -259,13 +259,12 @@ int clock_gettime(clockid_t, struct timespec *)
 
 notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
-   long ret = VCLOCK_NONE;
-
if (likely(tv != NULL)) {
BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
 offsetof(struct timespec, tv_nsec) ||
 sizeof(*tv) != sizeof(struct timespec));
-   ret = do_realtime((struct timespec *)tv);
+   if (do_realtime((struct timespec *)tv) == VCLOCK_NONE)
+   return vdso_fallback_gtod(tv, tz);
tv-tv_usec /= 1000;
}
if (unlikely(tz != NULL)) {
@@ -274,8 +273,6 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct 
timezone *tz)
tz-tz_dsttime = gtod-sys_tz.tz_dsttime;
}
 
-   if (ret == VCLOCK_NONE)
-   return vdso_fallback_gtod(tv, tz);
return 0;
 }
 int gettimeofday(struct timeval *, struct timezone *)
-- 
1.8.5.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v10 1/8] Make vsyscall_gtod_data handling x86 generic

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

This patch moves the vsyscall_gtod_data handling out of vsyscall_64.c
into an additional file vsyscall_gtod.c to make the functionality
available for x86 32 bit kernels.

It also adds a new vsyscall_32.c which sets up the VVAR page.

Signed-off-by: Stefani Seibold stef...@seibold.net
---
 arch/x86/Kconfig   |  4 +--
 arch/x86/include/asm/clocksource.h |  4 ---
 arch/x86/include/asm/fixmap.h  |  2 ++
 arch/x86/include/asm/vvar.h|  4 +++
 arch/x86/kernel/Makefile   |  3 +-
 arch/x86/kernel/hpet.c |  4 ---
 arch/x86/kernel/setup.c|  2 --
 arch/x86/kernel/tsc.c  |  2 --
 arch/x86/kernel/vmlinux.lds.S  |  3 --
 arch/x86/kernel/vsyscall_32.c  | 24 +++
 arch/x86/kernel/vsyscall_64.c  | 44 
 arch/x86/kernel/vsyscall_gtod.c| 60 ++
 12 files changed, 94 insertions(+), 62 deletions(-)
 create mode 100644 arch/x86/kernel/vsyscall_32.c
 create mode 100644 arch/x86/kernel/vsyscall_gtod.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 940e50e..b556f00 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,9 +107,9 @@ config X86
select HAVE_ARCH_SOFT_DIRTY
select CLOCKSOURCE_WATCHDOG
select GENERIC_CLOCKEVENTS
-   select ARCH_CLOCKSOURCE_DATA if X86_64
+   select ARCH_CLOCKSOURCE_DATA
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32  
X86_LOCAL_APIC)
-   select GENERIC_TIME_VSYSCALL if X86_64
+   select GENERIC_TIME_VSYSCALL
select KTIME_SCALAR if X86_32
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
diff --git a/arch/x86/include/asm/clocksource.h 
b/arch/x86/include/asm/clocksource.h
index 16a57f4..eda81dc 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -3,8 +3,6 @@
 #ifndef _ASM_X86_CLOCKSOURCE_H
 #define _ASM_X86_CLOCKSOURCE_H
 
-#ifdef CONFIG_X86_64
-
 #define VCLOCK_NONE 0  /* No vDSO clock available. */
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.   */
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.  */
@@ -14,6 +12,4 @@ struct arch_clocksource_data {
int vclock_mode;
 };
 
-#endif /* CONFIG_X86_64 */
-
 #endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 7252cd3..094d0cc 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -75,6 +75,8 @@ enum fixed_addresses {
 #ifdef CONFIG_X86_32
FIX_HOLE,
FIX_VDSO,
+   VVAR_PAGE,
+   VSYSCALL_HPET,
 #else
VSYSCALL_LAST_PAGE,
VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index d76ac40..c442782 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -17,7 +17,11 @@
  */
 
 /* Base address of vvars.  This is not ABI. */
+#ifdef CONFIG_X86_64
 #define VVAR_ADDRESS (-10*1024*1024 - 4096)
+#else
+#define VVAR_ADDRESS 0xd000
+#endif
 
 #if defined(__VVAR_KERNEL_LDS)
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cb648c8..3282eda 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,7 +26,8 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y  += probe_roms.o
 obj-$(CONFIG_X86_32)   += i386_ksyms_32.o
 obj-$(CONFIG_X86_64)   += sys_x86_64.o x8664_ksyms_64.o
-obj-y  += syscall_$(BITS).o
+obj-y  += syscall_$(BITS).o vsyscall_gtod.o
+obj-$(CONFIG_X86_32)   += vsyscall_32.o
 obj-$(CONFIG_X86_64)   += vsyscall_64.o
 obj-$(CONFIG_X86_64)   += vsyscall_emu_64.o
 obj-$(CONFIG_SYSFS)+= ksysfs.o
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index da85a8e..54263f0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -74,9 +74,7 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
 static inline void hpet_set_mapping(void)
 {
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
-#endif
 }
 
 static inline void hpet_clear_mapping(void)
@@ -752,9 +750,7 @@ static struct clocksource clocksource_hpet = {
.mask   = HPET_MASK,
.flags  = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
-#ifdef CONFIG_X86_64
.archdata   = { .vclock_mode = VCLOCK_HPET },
-#endif
 };
 
 static int hpet_clocksource_register(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 06853e6..56ff330 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1182,9 +1182,7 @@ void __init setup_arch(char **cmdline_p)
 
tboot_probe();
 
-#ifdef CONFIG_X86_64
map_vsyscall();
-#endif
 
generic_apic_probe();
 
diff --git a/arch/x86/kernel/tsc.c 

[PATCH v10 5/8] replace VVAR(vsyscall_gtod_data) by gtod macro

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

There are currently more than 30 users of the gtod macro, so replace the
last uses of VVAR(vsyscall_gtod_data) with the gtod macro.

Signed-off-by: Stefani Seibold stef...@seibold.net
---
 arch/x86/vdso/vclock_gettime.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index fd074dd..743f277 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -109,7 +109,7 @@ static notrace cycle_t vread_pvclock(int *mode)
*mode = VCLOCK_NONE;
 
/* refer to tsc.c read_tsc() comment for rationale */
-   last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+   last = gtod-clock.cycle_last;
 
if (likely(ret = last))
return ret;
@@ -133,7 +133,7 @@ notrace static cycle_t vread_tsc(void)
rdtsc_barrier();
ret = (cycle_t)vget_cycles();
 
-   last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+   last = gtod-clock.cycle_last;
 
if (likely(ret = last))
return ret;
@@ -288,7 +288,7 @@ int gettimeofday(struct timeval *, struct timezone *)
 notrace time_t __vdso_time(time_t *t)
 {
/* This is atomic on x86_64 so we don't need any locks. */
-   time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
+   time_t result = ACCESS_ONCE(gtod-wall_time_sec);
 
if (t)
*t = result;
-- 
1.8.5.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v10 4/8] vclock_gettime.c __vdso_clock_gettime cleanup

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

This patch is a small code cleanup for the __vdso_clock_gettime() function.

It removes the unneeded return values from do_monotonic_coarse() and
do_realtime_coarse() and adds a fallback label for doing the kernel
gettimeofday() system call.

Signed-off-by: Stefani Seibold stef...@seibold.net
---
 arch/x86/vdso/vclock_gettime.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index bbc8065..fd074dd 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -209,7 +209,7 @@ notrace static int do_monotonic(struct timespec *ts)
return mode;
 }
 
-notrace static int do_realtime_coarse(struct timespec *ts)
+notrace static void do_realtime_coarse(struct timespec *ts)
 {
unsigned long seq;
do {
@@ -217,10 +217,9 @@ notrace static int do_realtime_coarse(struct timespec *ts)
ts-tv_sec = gtod-wall_time_coarse.tv_sec;
ts-tv_nsec = gtod-wall_time_coarse.tv_nsec;
} while (unlikely(read_seqcount_retry(gtod-seq, seq)));
-   return 0;
 }
 
-notrace static int do_monotonic_coarse(struct timespec *ts)
+notrace static void do_monotonic_coarse(struct timespec *ts)
 {
unsigned long seq;
do {
@@ -228,30 +227,32 @@ notrace static int do_monotonic_coarse(struct timespec 
*ts)
ts-tv_sec = gtod-monotonic_time_coarse.tv_sec;
ts-tv_nsec = gtod-monotonic_time_coarse.tv_nsec;
} while (unlikely(read_seqcount_retry(gtod-seq, seq)));
-
-   return 0;
 }
 
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
-   int ret = VCLOCK_NONE;
-
switch (clock) {
case CLOCK_REALTIME:
-   ret = do_realtime(ts);
+   if (do_realtime(ts) == VCLOCK_NONE)
+   goto fallback;
break;
case CLOCK_MONOTONIC:
-   ret = do_monotonic(ts);
+   if (do_monotonic(ts) == VCLOCK_NONE)
+   goto fallback;
break;
case CLOCK_REALTIME_COARSE:
-   return do_realtime_coarse(ts);
+   do_realtime_coarse(ts);
+   break;
case CLOCK_MONOTONIC_COARSE:
-   return do_monotonic_coarse(ts);
+   do_monotonic_coarse(ts);
+   break;
+   default:
+   goto fallback;
}
 
-   if (ret == VCLOCK_NONE)
-   return vdso_fallback_gettime(clock, ts);
return 0;
+fallback:
+   return vdso_fallback_gettime(clock, ts);
 }
 int clock_gettime(clockid_t, struct timespec *)
__attribute__((weak, alias(__vdso_clock_gettime)));
-- 
1.8.5.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v10 7/8] Add 32 bit VDSO time support for 32 bit kernel

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

This patch adds time support for a 32 bit VDSO to a 32 bit kernel.

For 32 bit programs running on a 32 bit kernel, the same mechanism is
used as for 64 bit programs running on a 64 bit kernel.

Signed-off-by: Stefani Seibold stef...@seibold.net
---
 arch/x86/include/asm/vdso.h   |  3 ++
 arch/x86/include/asm/vdso32.h | 11 +++
 arch/x86/vdso/Makefile|  7 +
 arch/x86/vdso/vclock_gettime.c| 59 +--
 arch/x86/vdso/vdso-layout.lds.S   | 22 +
 arch/x86/vdso/vdso32-setup.c  | 55 
 arch/x86/vdso/vdso32/vclock_gettime.c | 16 ++
 arch/x86/vdso/vdso32/vdso32.lds.S |  9 ++
 8 files changed, 174 insertions(+), 8 deletions(-)
 create mode 100644 arch/x86/include/asm/vdso32.h
 create mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c

diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index fddb53d..fe3cef9 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -2,6 +2,9 @@
 #define _ASM_X86_VDSO_H
 
 #if defined CONFIG_X86_32 || defined CONFIG_COMPAT
+
+#include asm/vdso32.h
+
 extern const char VDSO32_PRELINK[];
 
 /*
diff --git a/arch/x86/include/asm/vdso32.h b/arch/x86/include/asm/vdso32.h
new file mode 100644
index 000..37e1a75
--- /dev/null
+++ b/arch/x86/include/asm/vdso32.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_VDSO32_H
+#define _ASM_X86_VDSO32_H
+
+#define VDSO_BASE_PAGE 0
+#define VDSO_VVAR_PAGE 1
+#define VDSO_HPET_PAGE 2
+#defineVDSO_PAGES  3
+#define VDSO_PREV_PAGES2
+#defineVDSO_OFFSET(x)  ((x) * PAGE_SIZE)
+
+#endif
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index fd14be1..1ff5b0a 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -145,8 +145,15 @@ KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
 $(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
 $(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
 
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=3 -freg-struct-return -fpic
+$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+
 $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
 $(obj)/vdso32/vdso32.lds \
+$(obj)/vdso32/vclock_gettime.o \
 $(obj)/vdso32/note.o \
 $(obj)/vdso32/%.o
$(call if_changed,vdso)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index bf969a0..42f641c 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -4,6 +4,9 @@
  *
  * Fast user context implementation of clock_gettime, gettimeofday, and time.
  *
+ * 32 Bit compat layer by Stefani Seibold stef...@seibold.net
+ *  sponsored by Rohde  Schwarz GmbH  Co. KG Munich/Germany
+ *
  * The code should have no internal unresolved relocations.
  * Check with readelf after changing.
  */
@@ -24,6 +27,8 @@
 #include asm/io.h
 #include asm/pvclock.h
 
+#ifndef BUILD_VDSO32
+
 #define gtod (VVAR(vsyscall_gtod_data))
 
 static notrace cycle_t vread_hpet(void)
@@ -47,6 +52,54 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, 
struct timezone *tz)
0 (__NR_gettimeofday), D (tv), S (tz) : memory);
return ret;
 }
+#else
+
+struct vsyscall_gtod_data vvar_vsyscall_gtod_data
+   __attribute__((visibility(hidden)));
+
+u8 hpet_page
+   __attribute__((visibility(hidden)));
+
+#define gtod (vvar_vsyscall_gtod_data)
+
+#ifdef CONFIG_HPET_TIMER
+static notrace cycle_t vread_hpet(void)
+{
+   return readl(hpet_page + HPET_COUNTER);
+}
+#endif
+
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+{
+   long ret;
+
+   asm(
+   push %%ebx \n
+   mov %2,%%ebx \n
+   call VDSO32_vsyscall \n
+   pop %%ebx \n
+   : =a (ret)
+   : 0 (__NR_clock_gettime), d (clock), c (ts)
+   : memory);
+   return ret;
+}
+
+notrace static long vdso_fallback_gtod(struct timeval *tv,
+   struct timezone *tz)
+{
+   long ret;
+
+   asm(
+   push %%ebx \n
+   mov %2,%%ebx \n
+   call VDSO32_vsyscall \n
+   pop %%ebx \n
+   : =a (ret)
+   : 0 (__NR_gettimeofday), d (tv), c (tz)
+   : memory);
+   return ret;
+}
+#endif
 
 #ifdef CONFIG_PARAVIRT_CLOCK
 
@@ -152,12 +205,14 @@ notrace static cycle_t vread_tsc(void)
 
 notrace static inline u64 vgetsns(int *mode)
 {
-   long v;
+   u64 v;
cycles_t cycles;
if (gtod-clock.vclock_mode == VCLOCK_TSC)
 

[PATCH v9 0/8] Add 32 bit VDSO time function support

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

This patch adds the functions vdso_gettimeofday(), vdso_clock_gettime()
and vdso_time() to the 32 bit VDSO.

The reason to do this was to get a fast, reliable time stamp. Many developers
use the TSC to get a fast time stamp without knowing the pitfalls. The VDSO
time functions are a fast and reliable way, because the kernel knows the
best time source and the P- and C-state of the CPU.

The helper library to use the VDSO functions can be downloaded at
http://seibold.net/vdso.c
The library is very small, only 228 lines of code. Compile it with
gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so
and use it with LD_PRELOAD=path/libvdso.so

This kind of helper must be integrated into glibc, for x86 64 bit and
PowerPC it is already there.

Some linux 32 bit kernel benchmark results (all measurements are in nano
seconds):

Intel(R) Celeron(TM) CPU 400MHz

Average time kernel call:
 gettimeofday(): 1039
 clock_gettime(): 1578
 time(): 526
Average time VDSO call:
 gettimeofday(): 378
 clock_gettime(): 303
 time(): 60

Celeron(R) Dual-Core CPU T3100 1.90GHz

Average time kernel call:
 gettimeofday(): 209
 clock_gettime(): 406
 time(): 135
Average time VDSO call:
 gettimeofday(): 51
 clock_gettime(): 43
 time(): 10

So you can see a performance increase by a factor of between 4 and 13,
depending on the CPU and the function.

The address layout of the VDSO has changed, because there is no fixed
address space available on an x86 32 bit kernel, despite the name: someone
decided to add an offset to __FIXADDR_TOP for virtualization.

Also the IA32 Emulation uses the whole 4 GB address space, so there is no
fixed address available.

This was the reason not to depend on this kind of address and to change the
layout of the VDSO. The VDSO for a 32 bit application now has three pages:

^ Higher Address
|
++
+ VDSO page (includes code) ro+x +
++
+ VVAR page (export kernel variables) ro +
++
+ HPET page (mapped registers) ro 
++
|
^ Lower Address

The VDSO page for a 32 bit application still resides at 0xe000; the VVAR and
HPET pages are mapped before it.

In the non compat mode the VMA of the VDSO is now 3 pages for a 32 bit kernel.
So this decreases the available logical address space by 2 pages.

The patch is against kernel 3.14 (e7651b819e90da924991d727d3c007200a18670d)

Changelog:
25.11.2012 - first release and proof of concept for linux 3.4
11.12.2012 - Port to linux 3.7 and code cleanup
12.12.2012 - fixes suggested by Andy Lutomirski
   - fixes suggested by John Stultz
   - use call VDSO32_vsyscall instead of int 80
   - code cleanup
17.12.2012 - support for IA32_EMULATION, this includes
 - code cleanup
 - include cleanup to fix compile warnings and errors
 - move out seqcount from seqlock, enable use in VDSO
 - map FIXMAP and HPET into the 32 bit address space
18.12.2012 - split into separate patches
30.01.2014 - revamp the code
 - code clean up
 - VDSO layout changed
 - no fixed addresses
 - port to 3.14
01.02.2014 - code cleanup
02.02.2014 - code cleanup
 - split into more patches
 - use HPET_COUNTER instead of hard coded value
 - fix changelog to the right year ;-)
02.02.2014 - reverse the mapping, this make the new VDSO 32 bit support
 full compatible.
03.02.2014 - code cleanup
 - fix comment
 - fix ABI break in vdso32.lds.S
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v10 2/8] Add new func _install_special_mapping() to mmap.c

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

The _install_special_mapping() is the new base function for
install_special_mapping(). This function will return a pointer to the
created VMA or an error code in an ERR_PTR().

This new function will be needed for the vdso 32 bit support to map the
additional vvar and hpet pages into the 32 bit address space. This will be done
with io_remap_pfn_range() and remap_pfn_range(), which require a vm_area_struct.

Signed-off-by: Stefani Seibold stef...@seibold.net
---
 include/linux/mm.h |  3 +++
 mm/mmap.c  | 20 
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28f46e..55342aa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1740,6 +1740,9 @@ extern void set_mm_exe_file(struct mm_struct *mm, struct 
file *new_exe_file);
 extern struct file *get_mm_exe_file(struct mm_struct *mm);
 
 extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
+extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
+  unsigned long addr, unsigned long len,
+  unsigned long flags, struct page **pages);
 extern int install_special_mapping(struct mm_struct *mm,
   unsigned long addr, unsigned long len,
   unsigned long flags, struct page **pages);
diff --git a/mm/mmap.c b/mm/mmap.c
index 20ff0c3..81ba54f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2918,7 +2918,7 @@ static const struct vm_operations_struct 
special_mapping_vmops = {
  * The array pointer and the pages it points to are assumed to stay alive
  * for as long as this mapping might exist.
  */
-int install_special_mapping(struct mm_struct *mm,
+struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, struct page **pages)
 {
@@ -2927,7 +2927,7 @@ int install_special_mapping(struct mm_struct *mm,
 
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (unlikely(vma == NULL))
-   return -ENOMEM;
+   return ERR_PTR(-ENOMEM);
 
INIT_LIST_HEAD(vma-anon_vma_chain);
vma-vm_mm = mm;
@@ -2948,11 +2948,23 @@ int install_special_mapping(struct mm_struct *mm,
 
perf_event_mmap(vma);
 
-   return 0;
+   return vma;
 
 out:
kmem_cache_free(vm_area_cachep, vma);
-   return ret;
+   return ERR_PTR(ret);
+}
+
+int install_special_mapping(struct mm_struct *mm,
+   unsigned long addr, unsigned long len,
+   unsigned long vm_flags, struct page **pages)
+{
+   struct vm_area_struct *vma = _install_special_mapping(mm,
+   addr, len, vm_flags, pages);
+
+   if (IS_ERR(vma))
+   return PTR_ERR(vma);
+   return 0;
 }
 
 static DEFINE_MUTEX(mm_all_locks_mutex);
-- 
1.8.5.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH -tip v2 4/8] perf-probe: Use _stext based address instead of the symbol name

2014-02-02 Thread Namhyung Kim
On Wed, 29 Jan 2014 09:14:59 +, Masami Hiramatsu wrote:
 diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
 index 4a9f43b..120954b 100644
 --- a/tools/perf/util/probe-event.c
 +++ b/tools/perf/util/probe-event.c
 @@ -387,6 +387,44 @@ static int add_module_to_probe_trace_events(struct 
 probe_trace_event *tevs,
   return ret;
  }
  
 +/* Post processing the probe events */
 +static int post_process_probe_trace_events(struct probe_trace_event *tevs,
 +int ntevs, const char *module,
 +bool uprobe)
 +{
 + struct symbol *sym;
 + struct map *map;
 + unsigned long stext = 0;
 + char *tmp;
 + int i;
 +
 + if (uprobe)
 + return add_exec_to_probe_trace_events(tevs, ntevs, module);
 +
 + /* Note that currently _stext based probe is not for drivers */
 + if (module)
 + return add_module_to_probe_trace_events(tevs, ntevs, module);
 +
 + sym = __find_kernel_function_by_name(_stext, map);

Couldn't we just use kmap->ref_reloc_sym instead of the hard-coded
"_stext"?  You might want to check Adrian's recent kaslr fixes (now
in tip/perf/urgent).

Thanks,
Namhyung


 + if (!sym) {
 + pr_debug(Failed to find _stext. Use original symbol name.\n);
 + return 0;
 + }
 + stext = map-unmap_ip(map, sym-start);
 +
 + for (i = 0; i  ntevs; i++) {
 + if (tevs[i].point.address) {
 + tmp = strdup(_stext);
 + if (!tmp)
 + return -ENOMEM;
 + free(tevs[i].point.symbol);
 + tevs[i].point.symbol = tmp;
 + tevs[i].point.offset = tevs[i].point.address - stext;
 + }
 + }
 + return 0;
 +}
 +
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v10 8/8] Add 32 bit VDSO time support for 64 bit kernel

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

This patch adds the VDSO time support for the IA32 Emulation Layer.

Due to the nature of the kernel headers and the LP64 compiler, where the
size of a long and a pointer differs from that of a 32 bit compiler, there
is a lot of type hacking necessary.

Signed-off-by: Stefani Seibold stef...@seibold.net
---
 arch/x86/vdso/vclock_gettime.c| 109 +++---
 arch/x86/vdso/vdso32/vclock_gettime.c |   7 +++
 2 files changed, 95 insertions(+), 21 deletions(-)

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 42f641c..8ba8db8 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -31,12 +31,24 @@
 
 #define gtod (VVAR(vsyscall_gtod_data))
 
+struct api_timeval {
+   longtv_sec; /* seconds */
+   longtv_usec;/* microseconds */
+};
+
+struct api_timespec {
+   longtv_sec; /* seconds */
+   longtv_nsec;/* nanoseconds */
+};
+
+typedef long api_time_t;
+
 static notrace cycle_t vread_hpet(void)
 {
return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 
HPET_COUNTER);
 }
 
-notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+notrace static long vdso_fallback_gettime(long clock, struct api_timespec *ts)
 {
long ret;
asm(syscall : =a (ret) :
@@ -44,7 +56,8 @@ notrace static long vdso_fallback_gettime(long clock, struct 
timespec *ts)
return ret;
 }
 
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
+notrace static long vdso_fallback_gtod(struct api_timeval *tv,
+   struct timezone *tz)
 {
long ret;
 
@@ -54,14 +67,62 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, 
struct timezone *tz)
 }
 #else
 
+#ifdef CONFIG_IA32_EMULATION
+typedef s64arch_time_t;
+
+struct arch_timespec {
+   s64 tv_sec;
+   s64 tv_nsec;
+};
+
+#define ALIGN8 __attribute__ ((aligned (8)))
+
+struct arch_vsyscall_gtod_data {
+   seqcount_t  seq ALIGN8;
+
+   struct { /* extract of a clocksource struct */
+   int vclock_mode ALIGN8;
+   cycle_t cycle_last ALIGN8;
+   cycle_t mask ALIGN8;
+   u32 mult;
+   u32 shift;
+   } clock;
+
+   /* open coded 'struct timespec' */
+   arch_time_t wall_time_sec;
+   u64 wall_time_snsec;
+   u64 monotonic_time_snsec;
+   arch_time_t monotonic_time_sec;
+
+   struct timezone sys_tz;
+   struct arch_timespec wall_time_coarse;
+   struct arch_timespec monotonic_time_coarse;
+};
+
+struct arch_vsyscall_gtod_data vvar_vsyscall_gtod_data
+   __attribute__((visibility(hidden)));
+#else
 struct vsyscall_gtod_data vvar_vsyscall_gtod_data
__attribute__((visibility(hidden)));
+#endif
 
 u8 hpet_page
__attribute__((visibility(hidden)));
 
 #define gtod (vvar_vsyscall_gtod_data)
 
+struct api_timeval {
+   s32 tv_sec; /* seconds */
+   s32 tv_usec;/* microseconds */
+};
+
+struct api_timespec {
+   s32 tv_sec; /* seconds */
+   s32 tv_nsec;/* microseconds */
+};
+
+typedef s32 api_time_t;
+
 #ifdef CONFIG_HPET_TIMER
 static notrace cycle_t vread_hpet(void)
 {
@@ -69,7 +130,7 @@ static notrace cycle_t vread_hpet(void)
 }
 #endif
 
-notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+notrace static long vdso_fallback_gettime(long clock, struct api_timespec *ts)
 {
long ret;
 
@@ -79,12 +140,12 @@ notrace static long vdso_fallback_gettime(long clock, 
struct timespec *ts)
call VDSO32_vsyscall \n
pop %%ebx \n
: =a (ret)
-   : 0 (__NR_clock_gettime), d (clock), c (ts)
+   : 0 (__NR_ia32_clock_gettime), d (clock), c (ts)
: memory);
return ret;
 }
 
-notrace static long vdso_fallback_gtod(struct timeval *tv,
+notrace static long vdso_fallback_gtod(struct api_timeval *tv,
struct timezone *tz)
 {
long ret;
@@ -95,7 +156,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv,
call VDSO32_vsyscall \n
pop %%ebx \n
: =a (ret)
-   : 0 (__NR_gettimeofday), d (tv), c (tz)
+   : 0 (__NR_ia32_gettimeofday), d (tv), c (tz)
: memory);
return ret;
 }
@@ -284,42 +345,48 @@ notrace static void do_monotonic_coarse(struct timespec 
*ts)
} while (unlikely(read_seqcount_retry(gtod-seq, seq)));
 }
 
-notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+notrace int __vdso_clock_gettime(clockid_t clock, struct api_timespec *ts)
 {
+   struct timespec tmp;
+
switch (clock) {
case CLOCK_REALTIME:
-   if (do_realtime(ts) == VCLOCK_NONE)
+   if (do_realtime(tmp) == 

[PATCH v10 3/8] revamp vclock_gettime.c

2014-02-02 Thread stefani
From: Stefani Seibold stef...@seibold.net

This intermediate patch revamps vclock_gettime.c by moving some functions
around. It exists only for splitting purposes, to make the whole 32 bit vdso
timer patch easier to review.

Signed-off-by: Stefani Seibold stef...@seibold.net
---
 arch/x86/vdso/vclock_gettime.c | 85 +-
 1 file changed, 42 insertions(+), 43 deletions(-)

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index eb5d7a5..bbc8065 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -26,41 +26,26 @@
 
 #define gtod (VVAR(vsyscall_gtod_data))
 
-notrace static cycle_t vread_tsc(void)
+static notrace cycle_t vread_hpet(void)
 {
-   cycle_t ret;
-   u64 last;
-
-   /*
-* Empirically, a fence (of type that depends on the CPU)
-* before rdtsc is enough to ensure that rdtsc is ordered
-* with respect to loads.  The various CPU manuals are unclear
-* as to whether rdtsc can be reordered with later loads,
-* but no one has ever seen it happen.
-*/
-   rdtsc_barrier();
-   ret = (cycle_t)vget_cycles();
-
-   last = VVAR(vsyscall_gtod_data).clock.cycle_last;
-
-   if (likely(ret = last))
-   return ret;
+   return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 
HPET_COUNTER);
+}
 
-   /*
-* GCC likes to generate cmov here, but this branch is extremely
-* predictable (it's just a funciton of time and the likely is
-* very likely) and there's a data dependence, so force GCC
-* to generate a branch instead.  I don't barrier() because
-* we don't actually need a barrier, and if this function
-* ever gets inlined it will generate worse code.
-*/
-   asm volatile ();
-   return last;
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+{
+   long ret;
+   asm(syscall : =a (ret) :
+   0 (__NR_clock_gettime), D (clock), S (ts) : memory);
+   return ret;
 }
 
-static notrace cycle_t vread_hpet(void)
+notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 {
-   return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 
HPET_COUNTER);
+   long ret;
+
+   asm(syscall : =a (ret) :
+   0 (__NR_gettimeofday), D (tv), S (tz) : memory);
+   return ret;
 }
 
 #ifdef CONFIG_PARAVIRT_CLOCK
@@ -133,23 +118,37 @@ static notrace cycle_t vread_pvclock(int *mode)
 }
 #endif
 
-notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+notrace static cycle_t vread_tsc(void)
 {
-   long ret;
-   asm(syscall : =a (ret) :
-   0 (__NR_clock_gettime),D (clock), S (ts) : memory);
-   return ret;
-}
+   cycle_t ret;
+   u64 last;
 
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
-   long ret;
+   /*
+* Empirically, a fence (of type that depends on the CPU)
+* before rdtsc is enough to ensure that rdtsc is ordered
+* with respect to loads.  The various CPU manuals are unclear
+* as to whether rdtsc can be reordered with later loads,
+* but no one has ever seen it happen.
+*/
+   rdtsc_barrier();
+   ret = (cycle_t)vget_cycles();
 
-   asm(syscall : =a (ret) :
-   0 (__NR_gettimeofday), D (tv), S (tz) : memory);
-   return ret;
-}
+   last = VVAR(vsyscall_gtod_data).clock.cycle_last;
 
+   if (likely(ret = last))
+   return ret;
+
+   /*
+* GCC likes to generate cmov here, but this branch is extremely
+* predictable (it's just a funciton of time and the likely is
+* very likely) and there's a data dependence, so force GCC
+* to generate a branch instead.  I don't barrier() because
+* we don't actually need a barrier, and if this function
+* ever gets inlined it will generate worse code.
+*/
+   asm volatile ();
+   return last;
+}
 
 notrace static inline u64 vgetsns(int *mode)
 {
-- 
1.8.5.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


<    1   2   3   4   5