[PATCH 1/4] w1: fix netlink refcnt leak on error path
If the message type is W1_MASTER_CMD or W1_SLAVE_CMD, then a reference is taken when searching for the slave or master device. If there isn't any following data m-len (mlen is a copy) is 0 and packing up the message for later execution is skipped leaving nothing to decrement the reference counts. Way back when, m-len was checked before the search that increments the reference count, but W1_LIST_MASTERS has no additional data, the check was moved in 9be62e0b2fadaf5ff causing this bug. This change reorders to put the check before the reference count is incremented avoiding the problem. Signed-off-by: David Fries da...@fries.net --- drivers/w1/w1_netlink.c | 44 ++-- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/w1/w1_netlink.c b/drivers/w1/w1_netlink.c index 5234964..a02704a 100644 --- a/drivers/w1/w1_netlink.c +++ b/drivers/w1/w1_netlink.c @@ -300,12 +300,6 @@ static int w1_process_command_root(struct cn_msg *msg, struct w1_netlink_msg *w; u32 *id; - if (mcmd-type != W1_LIST_MASTERS) { - printk(KERN_NOTICE %s: msg: %x.%x, wrong type: %u, len: %u.\n, - __func__, msg-id.idx, msg-id.val, mcmd-type, mcmd-len); - return -EPROTO; - } - cn = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!cn) return -ENOMEM; @@ -441,6 +435,9 @@ static void w1_process_cb(struct w1_master *dev, struct w1_async_cmd *async_cmd) w1_netlink_send_error(node-block-msg, node-m, cmd, node-block-portid, err); + /* ref taken in w1_search_slave or w1_search_master_id when building +* the block +*/ if (sl) w1_unref_slave(sl); else @@ -503,30 +500,42 @@ static void w1_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) msg_len = msg-len; while (msg_len !err) { - struct w1_reg_num id; - u16 mlen = m-len; dev = NULL; sl = NULL; - memcpy(id, m-id.id, sizeof(id)); -#if 0 - printk(%s: %02x.%012llx.%02x: type=%02x, len=%u.\n, - __func__, id.family, (unsigned long long)id.id, id.crc, m-type, m-len); -#endif if (m-len + sizeof(struct w1_netlink_msg) msg_len) { err = -E2BIG; break; } + /* 
execute on this thread, no need to process later */ + if (m-type == W1_LIST_MASTERS) { + err = w1_process_command_root(msg, m, nsp-portid); + goto out_cont; + } + + /* All following message types require additional data, +* check here before references are taken. +*/ + if (!m-len) { + err = -EPROTO; + goto out_cont; + } + + /* both search calls take reference counts */ if (m-type == W1_MASTER_CMD) { dev = w1_search_master_id(m-id.mst.id); } else if (m-type == W1_SLAVE_CMD) { - sl = w1_search_slave(id); + sl = w1_search_slave((struct w1_reg_num *)m-id.id); if (sl) dev = sl-master; } else { - err = w1_process_command_root(msg, m, nsp-portid); + printk(KERN_NOTICE + %s: msg: %x.%x, wrong type: %u, len: %u.\n, + __func__, msg-id.idx, msg-id.val, + m-type, m-len); + err = -EPROTO; goto out_cont; } @@ -536,8 +545,6 @@ static void w1_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) } err = 0; - if (!mlen) - goto out_cont; atomic_inc(block-refcnt); node-async.cb = w1_process_cb; @@ -557,7 +564,8 @@ out_cont: if (err) w1_netlink_send_error(msg, m, NULL, nsp-portid, err); msg_len -= sizeof(struct w1_netlink_msg) + m-len; - m = (struct w1_netlink_msg *)(((u8 *)m) + sizeof(struct w1_netlink_msg) + m-len); + m = (struct w1_netlink_msg *)(((u8 *)m) + + sizeof(struct w1_netlink_msg) + m-len); /* * Let's allow requests for nonexisting devices. -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/4] w1: document struct w1_netlink_msg and struct w1_netlink_cmd
I wasn't sure on the length, so I looked it up and documented it. Signed-off-by: David Fries da...@fries.net --- drivers/w1/w1_netlink.h | 25 + 1 file changed, 25 insertions(+) diff --git a/drivers/w1/w1_netlink.h b/drivers/w1/w1_netlink.h index 1e9504e..c646a98 100644 --- a/drivers/w1/w1_netlink.h +++ b/drivers/w1/w1_netlink.h @@ -49,6 +49,19 @@ enum w1_netlink_message_types { W1_LIST_MASTERS, }; +/** + * struct w1_netlink_msg - holds w1 message type, id, and result + * + * @type: one of enum w1_netlink_message_types + * @status: kernel feedback for success 0 or errno failure value + * @len: length of data following w1_netlink_msg + * @id: union holding master bus id (msg.id) and slave device id (id[8]). + * @data: start address of any following data + * + * The base message structure for w1 messages over netlink. + * The netlink connector data sequence is, struct nlmsghdr, struct cn_msg, + * then one or more struct w1_netlink_msg (each with optional data). + */ struct w1_netlink_msg { __u8type; @@ -66,6 +79,7 @@ struct w1_netlink_msg /** * enum w1_commands - commands available for master or slave operations + * * @W1_CMD_READ: read len bytes * @W1_CMD_WRITE: write len bytes * @W1_CMD_SEARCH: initiate a standard search, returns only the slave @@ -93,6 +107,17 @@ enum w1_commands { W1_CMD_MAX }; +/** + * struct w1_netlink_cmd - holds the command and data + * + * @cmd: one of enum w1_commands + * @res: reserved + * @len: length of data following w1_netlink_cmd + * @data: start address of any following data + * + * One or more struct w1_netlink_cmd is placed starting at w1_netlink_msg.data + * each with optional data. + */ struct w1_netlink_cmd { __u8cmd; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH,RFC] random: collect cpu randomness
On Sun, 2 February 2014 22:25:31 +0100, Stephan Mueller wrote: Am Sonntag, 2. Februar 2014, 15:36:17 schrieb Jörn Engel: Collects entropy from random behaviour all modern cpus exhibit. The scheduler and slab allocator are instrumented for this purpose. How much randomness can be gathered is clearly hardware-dependent and hard to estimate. Therefore the entropy estimate is zero, but random bits still get mixed into the pools. May I ask what the purpose of the patches is when no entropy is implied? I see that the pool is stirred more. But is that really a problem that needs addressing? For my part, I think the whole business of estimating entropy is bordering on the esoteric. If the hash on the output side is any good, you have a completely unpredictable prng once the entropy pool is unpredictable. Additional random bits are nice, but not all that useful. Blocking /dev/random based on entropy estimates is likewise not all that useful. Key phrase is once the entropy pool is unpredictable. So early in bootup it may make sense to estimate the entropy. But here the problem is that you cannot measure entropy, at least not within a single system and a reasonable amount of time. That leaves you with a heuristic that, like all heuristics, is wrong. I personally care more about generating high-quality randomness as soon as possible and with low cost to the system. Feel free to disagree or set your priorities differently. Please, do not get me wrong with the presented criticism here -- the approach in general looks interesting. However, the following patches make me wonder big time. 
extern void get_random_bytes(void *buf, int nbytes); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a88f4a485c5e..7af6389f9b9e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2511,6 +2511,7 @@ need_resched: rq = cpu_rq(cpu); rcu_note_context_switch(cpu); prev = rq-curr; + __add_cpu_randomness(__builtin_return_address(1), prev); schedule_debug(prev); diff --git a/mm/slab.c b/mm/slab.c index eb043bf05f4c..ea5a30d44ad1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3587,6 +3587,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, trace_kmalloc(caller, ret, size, cachep-size, flags); + add_cpu_randomness(__builtin_return_address(2), ret); return ret; } First, the noise source you add is constantly triggered throughout the execution of the kernel. Entropy is very important, we (who are interested in crypto) know that. But how often is entropy needed? Other folks wonder about the speed of the kernel. And with these two patches, every kmalloc and every scheduling invocation now dives into the random.c code to do something. I would think this is a bit expensive, especially to stir the pool without increasing the entropy estimator. I think entropy collection should be performed when it is needed and not throughout the lifetime of the system. Please measure how expensive it really is. My measurement gave me a doesn't matter result, surprising as it may seem. If the cost actually matters, we can either disable or rate-limit the randomness collection at some point after boot. But that would bring us back into the estimation business. Second, when I offered my initial patch which independently collects some entropy on the CPU execution timing, I got shot down with one concern raised by Ted, and that was about whether a user can influence the entropy collection process. 
When I am trying to measure CPU execution timing in the RNG, the concern was raised that the measured timing variations was due to CPU states that were influenced by users. Your patch here clearly hooks into code paths which are definitely affected by user actions. So, this patch therefore would be subject to the same concerns. I personally think that this is not so much an issue, yet it was raised previously. The nice thing about the random pool is that mixing any amount of deterministic data into it does not diminish the randomness already in it. Given that attribute, I don't understand the concern. It seems I have a bad timing, because just two days ago I released a new attempt on the CPU jitter RNG [1] with a new noise source, and I was just about to prepare a release email. With that attempt, both issues raised above are addressed, including a theoretical foundation of the noise source. [1] http://www.chronox.de/ I am not married to my patch. If the approach makes sense, let's merge it. If the approach does not make sense or there is a better alternative, drop it on the floor. The problem I see with your approach is this: The only prerequisite is the availability of a high-resolution timer that is available in modern CPUs. Given a modern CPU with a high-resolution timer, you will almost certainly collect enough randomness for good random
Re: [PATCH,RFC] random: collect cpu randomness
On 02/02/2014 05:24 PM, Jörn Engel wrote: For my part, I think the whole business of estimating entropy is bordering on the esoteric. If the hash on the output side is any good, you have a completely unpredictable prng once the entropy pool is unpredictable. Additional random bits are nice, but not all that useful. Blocking /dev/random based on entropy estimates is likewise not all that useful. Key phrase is "once the entropy pool is unpredictable." So early in bootup it may make sense to estimate the entropy. But here the problem is that you cannot measure entropy, at least not within a single system and a reasonable amount of time. That leaves you with a heuristic that, like all heuristics, is wrong. The entropy bound needs to be a conservative lower bound. Its main use is to provide backpressure (should we spend more CPU time producing entropy) although the forward pressure on /dev/random is potentially useful for high security applications. This does NOT mean that zero-credit entropy generation is useless, far from it. It just means that we are doing it on an "it can't hurt" basis, rather than "I know for sure that this is valuable". -hpa -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] ACPI / hotplug / PCI: Scan root bus under the PCI rescan-remove lock
From: Rafael J. Wysocki rafael.j.wyso...@intel.com Since acpiphp_check_bridge() called by acpiphp_check_host_bridge() does things that require PCI rescan-remove locking around it, make acpiphp_check_host_bridge() use that locking. Signed-off-by: Rafael J. Wysocki rafael.j.wyso...@intel.com --- One more thing I overlooked in the PCI rescan-remove locking patchset. I just found it sitting there in a dark dusty corner and staring at me in horror when I approached it with a vacuum cleaner ... Anyway, 3.14-rc2 material on top of patches [1-2/13] from this series: https://lkml.org/lkml/2014/2/1/123 Thanks, Rafael --- drivers/pci/hotplug/acpiphp_glue.c |4 1 file changed, 4 insertions(+) Index: linux-pm/drivers/pci/hotplug/acpiphp_glue.c === --- linux-pm.orig/drivers/pci/hotplug/acpiphp_glue.c +++ linux-pm/drivers/pci/hotplug/acpiphp_glue.c @@ -829,7 +829,11 @@ void acpiphp_check_host_bridge(acpi_hand bridge = acpiphp_handle_to_bridge(handle); if (bridge) { + pci_lock_rescan_remove(); + acpiphp_check_bridge(bridge); + + pci_unlock_rescan_remove(); put_bridge(bridge); } } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH,RFC] random: collect cpu randomness
On Sun, Feb 02, 2014 at 10:25:31PM +0100, Stephan Mueller wrote: Second, when I offered my initial patch which independently collects some entropy on the CPU execution timing, I got shot down with one concern raised by Ted, and that was about whether a user can influence the entropy collection process. Um, that wasn't my concern. After all, when we sample keyboard timing while trying to generate a GPG key, of course the user can and does influence the entropy collection process. The question is whether an attacker who has deep knowledge of how the CPU works internally, perhaps made worse with quantization effects (i.e., it doesn't matter if analog-generated settling time is measured in microseconds if the output is being clocked out in milliseconds), such that it is predictable. I really like Jörn's tests doing repeated boot testing and observing on a SMP system, the slab allocation pattern is quite deterministic. So even though the numbers might *look* random, an attacker with deep knowledge of how the kernel was compiled and what memory allocations get done during the boot sequence would be able to quite successfully measure it. I'm guessing that indeed, on a 4-CPU KVM system, what you're measuring is when the host OS happens to be scheduling the KVM threads, with some variability caused by external networking interrupts, etc. It would definitely be a good idea to retry that experiment on a real 4-CPU system to see what sort of results you might get. It might very well be that the attacker who knows the relative ordering of the slab/thread activations but for which it's not entirely clear whether one cpu will be ahead of another, that there is *some* entropy, but perhaps only a handful of bits. It's the fact that we can't be sure how much uncertainty there might be with an attacker with very deep knowledge of the CPU which is why Jörn's conservatism of not crediting the entropy counter is quite understandable. 
Of course, this doesn't help someone who is trying to speed up the time it takes GPG to generate a new key pair. But in terms of improving /dev/urandom as it is used by many crypto applications, it certainly can't hurt. The real question is how much overhead does it add, and is it worth it. Jörn, I take it that was the reason for creating an even faster, but weaker mixing function? Was the existing fast mix causing a measurable overhead, or was this just you being really paranoid about not adding anything to the various kernel fastpaths? - Ted -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[tip:x86/x32] compat: Fix sparse address space warnings
Commit-ID: dce44e03b0a3448ad11ac6c6e0cbe299e0400791 Gitweb: http://git.kernel.org/tip/dce44e03b0a3448ad11ac6c6e0cbe299e0400791 Author: H. Peter Anvin h...@linux.intel.com AuthorDate: Sun, 2 Feb 2014 17:57:28 -0800 Committer: H. Peter Anvin h...@linux.intel.com CommitDate: Sun, 2 Feb 2014 18:00:29 -0800 compat: Fix sparse address space warnings In compat_sys_old_getrlimit() we pass a kernel pointer to sys_old_getrlimit() inside a set_fs() bracket. This is okay, so we can safely cast the affected pointer to __user. In compat_clock_nanosleep_restart(), the variable rmtp holds a user pointer. Annotate it as such. Both of these warnings are ancient, but were reported by Fengguang Wu's test system due to other changes. Signed-off-by: H. Peter Anvin h...@linux.intel.com Cc: Toyo Abe to...@mvista.com Link: http://lkml.kernel.org/n/tip-507h7cq5e45eg6ygtykon...@git.kernel.org --- kernel/compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/compat.c b/kernel/compat.c index 3afc524..7076b57 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -451,7 +451,7 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, r); + ret = sys_old_getrlimit(resource, (struct rlimit __user *)r); set_fs(old_fs); if (!ret) { @@ -799,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) long err; mm_segment_t oldfs; struct timespec tu; - struct compat_timespec *rmtp = restart-nanosleep.compat_rmtp; + struct compat_timespec __user *rmtp = restart-nanosleep.compat_rmtp; restart-nanosleep.rmtp = (struct timespec __user *) tu; oldfs = get_fs(); -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH 0/3] epoll: read(),write(),ioctl() interface
Hi everyone, This patch series adds support for read(), write(), and ioctl() operations on eventpolls as well as an associated userspace structure to format the eventpoll entries delivered via read()/write() buffers. The new structure, struct epoll, differs from struct epoll_event mainly in that it also holds the associated file descriptor. Using the normal I/O interface to manipulate eventpolls is much neater than using epoll-specific syscalls while also allowing for greater flexibility (theoretically, pipes could be used to filter access). Specifically, write() creates, modifies, and/or removes event entries stored in the supplied buffer, using the userspace identifier to check whether an entry exists and removing it if no events are set to trigger it, while read() simply waits for enough events to fill the provided buffer. As timeout control is essential for polling to be practical, ioctl() is used to configure an optional timeout, which is infinite by default. Documentation/ioctl/ioctl-number.txt | 1 + fs/eventpoll.c | 534 --- include/uapi/linux/eventpoll.h | 10 + 3 files changed, 384 insertions(+), 161 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH 2/3] epoll: add struct epoll ioctl() commands
Add a new 'struct epoll' to the userspace eventpoll interface. Buffers supplied to read() and write() calls on eventpolls are interpreted as arrays of this structure. The new structure's only functional difference from epoll_event is it also holds the associated file descriptor (needed for write() to properly create events but useful information in general). Also define the ioctl() command macros to set and get the timeout of an eventpoll. Signed-off-by: Nathaniel Yazdani n1ght.4nd@gmail.com --- diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index bc81fb2..73f817c 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -56,11 +56,21 @@ #define EPOLL_PACKED #endif +/* ioctl() requests */ +#define EPIOC_GETTIMEOUT _IOR('$', 0x10, int) +#define EPIOC_SETTIMEOUT _IOW('$', 0x11, int) + struct epoll_event { __u32 events; __u64 data; } EPOLL_PACKED; +struct epoll { + int ep_fildes; /* file descriptor */ + int ep_events; /* triggering events */ + long long ep_ident; /* entry ID (cf. epoll_event-data) */ +} EPOLL_PACKED; /* A.K.A. epe for eventpoll entry */ + #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH 1/3] epoll: reserve small ioctl() space
Reserve a small ioctl() command space for eventpolls, of which only two are currently utilized. Signed-off-by: Nathaniel Yazdani n1ght.4nd@gmail.com --- diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index d7e43fa..3c6f8ac 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -81,6 +81,7 @@ Code Seq#(hex) Include FileComments 0x22 all scsi/sg.h '#'00-3F IEEE 1394 Subsystem Block for the entire subsystem '$'00-0F linux/perf_counter.h, linux/perf_event.h +'$'10-1F include/uapi/linux/eventpoll.h ''00-07 drivers/firewire/nosy-user.h '1'00-1F linux/timepps.h PPS kit from Ulrich Windl ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/ -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH 3/3] epoll: add read()/write()/ioctl() operations
The eventpoll implementation is largely interface-agnostic, aside from the userspace structure format and epoll_ctl(). Particularly as each field of the structure is handled independently, replacing usage of epoll_event internally was straighforward and clarifies the code some. As for epoll_ctl(), its functionality was moved into the new ep_eventpoll_write() function, and epoll_ctl() just hands off its work to it. The ep_eventpoll_read() function is very similar to epoll_wait(), which remains independent but shares the vast majority of code for minimal redundancy. Finally, ep_eventpoll_ioctl() is a simple interface to configure a default timeout for read() operations on the given eventpoll. Signed-off-by: Nathaniel Yazdani n1ght.4nd@gmail.com --- diff --git a/fs/eventpoll.c b/fs/eventpoll.c index af90312..7f0ce59 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -168,8 +168,11 @@ struct epitem { /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; - /* The structure that describe the interested events and the source fd */ - struct epoll_event event; + /* Interested events */ + int events; + + /* The userspace identifier for this entry */ + long long ident; }; /* @@ -216,6 +219,9 @@ struct eventpoll { struct file *file; + /* Default timeout */ + int timeout; + /* used to optimize loop detection check */ int visited; struct list_head visited_list_link; @@ -251,6 +257,13 @@ struct ep_send_events_data { struct epoll_event __user *events; }; +/* ep_scan_ready_list() callback data for ep_send_epes() */ +struct ep_send_epes_data +{ + int max; + struct epoll __user *epes; +}; + /* * Configuration options available inside /proc/sys/fs/epoll/ */ @@ -795,9 +808,9 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) { - pt-_key = epi-event.events; + pt-_key = epi-events; - return epi-ffd.file-f_op-poll(epi-ffd.file, pt) epi-event.events; + return 
epi-ffd.file-f_op-poll(epi-ffd.file, pt) epi-events; } static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, @@ -881,8 +894,8 @@ static int ep_show_fdinfo(struct seq_file *m, struct file *f) struct epitem *epi = rb_entry(rbp, struct epitem, rbn); ret = seq_printf(m, tfd: %8d events: %8x data: %16llx\n, -epi-ffd.fd, epi-event.events, -(long long)epi-event.data); +epi-ffd.fd, epi-events, +(long long)epi-ident); if (ret) break; } @@ -892,6 +905,15 @@ static int ep_show_fdinfo(struct seq_file *m, struct file *f) } #endif +static ssize_t ep_eventpoll_write(struct file *file, const char __user *buf, + size_t bufsz, loff_t *pos); + +static ssize_t ep_eventpoll_read(struct file *file, char __user *buf, +size_t bufsz, loff_t *pos); + +static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); + /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS @@ -899,6 +921,9 @@ static const struct file_operations eventpoll_fops = { #endif .release= ep_eventpoll_release, .poll = ep_eventpoll_poll, + .read = ep_eventpoll_read, + .write = ep_eventpoll_write, + .unlocked_ioctl = ep_eventpoll_ioctl, .llseek = noop_llseek, }; @@ -1025,7 +1050,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ - if (!(epi-event.events ~EP_PRIVATE_BITS)) + if (!(epi-events ~EP_PRIVATE_BITS)) goto out_unlock; /* @@ -1034,7 +1059,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * callback. We need to be able to handle both cases here, hence the * test for key != NULL before the event match test. 
*/ - if (key !((unsigned long) key epi-event.events)) + if (key !((unsigned long) key epi-events)) goto out_unlock; /* @@ -1264,7 +1289,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) /* * Must be called with mtx held. */ -static int ep_insert(struct eventpoll *ep, struct epoll_event *event, +static int ep_insert(struct eventpoll *ep, long long ident, int events, struct file *tfile, int fd, int full_check) {
Re: [livelock, 3.13.0] livelock when run out of swap space
On Mon, 3 Feb 2014, Dave Chinner wrote: Hi folks, I just had a test machine livelock when running a concurrent rm -rf workload on an XFS filesystem with 64k directory block sizes. The buffer allocation code started reporting this 5 times a second: XFS: possible memory allocation deadlock in kmem_alloc (mode:0x8250) Which is in GFP_NOFS|GFP_ZERO context. It is likely to have been a high order allocation (up to 64k), but there was still plenty of free memory available (2.8GB of 16GB): $ free total used free sharedbuffers cached Mem: 16424296 135937322830564 0136 3184 -/+ buffers/cache: 135904122833884 Swap: 497976 497976 0 $ But clearly there was no page cache being used. All of the memory in use was in the inode/dentry caches: OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME 9486678 9483271 99%1.19K 364874 26 11675968K xfs_inode 4820508 4820508 100%0.21K 130284 37 1042272K dentry 4820224 4820224 100%0.06K 75316 64301264K kmalloc-64 The issue is that memory allocation was not making progress - the shrinkers were not doing anything because they were under GFP_NOFS allocation context, and kswapd was never woken to take over. 
The system was compeltely out of swap space, and all the CPU was being burnt in this function: 44.91% [kernel] [k] scan_swap_map The typical stack trace of a looping memory allocation is this: [211699.924006] CPU: 2 PID: 21939 Comm: rm Not tainted 3.13.0-dgc+ #172 [211699.924006] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [211699.924006] task: 88041a7dde40 ti: 8803bbeec000 task.ti: 8803bbeec000 [211699.924006] RIP: 0010:[81187dd8] [81187dd8] scan_swap_map+0x118/0x520 [211699.924006] RSP: 0018:8803bbeed508 EFLAGS: 0297 [211699.924006] RAX: a1ba RBX: 0032 RCX: [211699.924006] RDX: 0001 RSI: 0001e64e RDI: 00019def [211699.924006] RBP: 8803bbeed558 R08: 002c6ba0 R09: [211699.924006] R10: 57ffb90ace3d4f80 R11: 00019def R12: 88041a682900 [211699.924006] R13: 01ff R14: 0040 R15: 88041a6829a0 [211699.924006] FS: 7fae682c8700() GS:88031bc0() knlGS: [211699.924006] CS: 0010 DS: ES: CR0: 8005003b [211699.924006] CR2: 004353b0 CR3: 0002ea498000 CR4: 06e0 [211699.924006] Stack: [211699.924006] 8803bbeed538 88031a9dc720 a1ba 4893 [211699.924006] 88041a682900 0001 [211699.924006] 88041a6829a0 8803bbeed598 8118837f [211699.924006] Call Trace: [211699.924006] [8118837f] get_swap_page+0xef/0x1e0 [211699.924006] [81184e34] add_to_swap+0x24/0x70 [211699.924006] [8115f110] shrink_page_list+0x300/0xa20 [211699.924006] [81169089] ? __mod_zone_page_state+0x49/0x50 [211699.924006] [8116a3b9] ? wait_iff_congested+0xa9/0x150 [211699.924006] [8115fe03] shrink_inactive_list+0x243/0x480 [211699.924006] [811606f1] shrink_lruvec+0x371/0x670 [211699.924006] [81cdb4ce] ? _raw_spin_unlock+0xe/0x10 [211699.924006] [81160dea] do_try_to_free_pages+0x11a/0x360 [211699.924006] [81161220] try_to_free_pages+0x110/0x190 [211699.924006] [81156422] __alloc_pages_nodemask+0x5a2/0x8a0 [211699.924006] [8118fac2] alloc_pages_current+0xb2/0x170 [211699.924006] [81151bde] __get_free_pages+0xe/0x50 [211699.924006] [8116d199] kmalloc_order_trace+0x39/0xb0 [211699.924006] [810cf4c3] ? 
finish_wait+0x63/0x80 [211699.924006] [81197156] __kmalloc+0x176/0x180 [211699.924006] [810cf520] ? __init_waitqueue_head+0x40/0x40 [211699.924006] [814a74f7] kmem_alloc+0x77/0xf0 [211699.924006] [814feb54] xfs_log_commit_cil+0x3c4/0x5a0 [211699.924006] [814a6be3] xfs_trans_commit+0xc3/0x2d0 [211699.924006] [814e913e] xfs_remove+0x3be/0x440 [211699.924006] [811b7d8d] ? __d_lookup+0x11d/0x170 [211699.924006] [8149b842] xfs_vn_unlink+0x52/0xa0 [211699.924006] [811acc22] vfs_unlink+0xf2/0x160 [211699.924006] [811acef0] do_unlinkat+0x260/0x2a0 [211699.924006] [811b003b] SyS_unlinkat+0x1b/0x40 [211699.924006] [81ce3ea9] system_call_fastpath+0x16/0x1b i.e. trying to do memory allocation during a transaction commit in XFS, and that is looping in kmem_alloc(). The problem in this case is that kswapd was not being started to free slab cache memory (i.e. to handle the deferred GFP_NOFS slab reclaim). It stayed in the livelock state for over an hour before I broke it by running echo 2
Re: [PATCH 1/2] irq_work: allow certain work in hard irq context
On Sun, 2014-02-02 at 21:10 +0100, Sebastian Andrzej Siewior wrote: So CPU5 and CPU52 were eating 100% CPU doing nothing instead of running cc1 and objdump right? Yeah. According to the backtrace both of them are trying to access the per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail to get the timer lock here. They shouldn't spin there for minutes, I have no idea why they did so… I dumped it for later-guy.. but he tends to get busy doing other crap, and just whacks my carefully saved data ;-) I guess this problem does not occur without -RT and before that patch you saw only that one warning from can_stop_full_tick()? I didn't try it without -RT, and yes, without, you just get the warning. -Mike -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 00/16] drm/nouveau: initial support for GK20A (Tegra K1)
On 02/03/2014 04:10 AM, Ilia Mirkin wrote: Hi Alexandre, On Fri, Jan 31, 2014 at 10:16 PM, Alexandre Courbot acour...@nvidia.com wrote: I guess my email address might surprise some of you, so let me anticipate some questions you might have. :P Yes, this work is endorsed by NVIDIA. Several other NVIDIAns (CC'd), including core GPU experts, have provided significant technical guidance and will continue their involvement. Special thanks go to Terje Bergstrom and Ken Adams for their invaluable GPU expertise, and Thierry Reding (at FOSDEM this weekend) for help with debugging and user-space testing. Let me also stress that although very exciting, this effort is still experimental, so I would like to make sure that nobody makes excessive expectations based on these few patches. The scope of this work is strictly limited to Tegra (although given the similarities desktop GPU support will certainly benefit from it indirectly), and we do not have any plan to work on user-space support. So do not uninstall that proprietary driver just yet. ;) With this being clarified, we are looking forward to getting your feedback and working with you guys to bring and improve Tegra K1 support into Nouveau! :) I've sent a couple of fairly trivial comments, as you saw, and I suspect that others with a better understanding of the guts will have more substantial architectural feedback, esp after the weekend/FOSDEM. However, since no one's said it already -- welcome to Nouveau! Thanks! ^_^v One beginner question: is it appropriate to send kernel patches to the nouveau list in addition to dri-devel? The moderation messages I receive make me think that this list might rather be intended for general discussion. From the looks of it, you could bring up a full open-source stack with your patches (i.e. Xorg + nouveau DDX + mesa) and use PRIME to render stuff (assuming the actual display hw has an X ddx). Although I suspect that you're going to want to use your own drivers. 
Still a little curious if you've tried the open-source stack and whether it worked. [Not sure what the status of render-node support is in mesa, but perhaps it's enough to try running piglit tests, if you can't get X going with the display HW.] We are still testing things at libdrm level, but are eventually interested in bringing up the existing open-source stack. Our guess (and hope) is that it will work nicely almost as-is, minus the fact that the display hardware is not handled by Nouveau and we only support render nodes (I have yet to look at what the state of render nodes in Mesa is). For X, Thierry is IIUC working on the display driver, and at some point these efforts should join to connect tegradrm and Nouveau using PRIME. We are not quite there yet, and since we are working with limited resources it will likely require some time, but the fact we could bring up a (seemingly) working Nouveau kernel driver with so little code is encouraging. Thanks, Alex. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [git pull] Please pull powerpc.git next branch
On Wed, 2014-01-29 at 13:29 +1100, Alistair Popple wrote: Looks like I missed the dart iommu code when changing the iommu table initialisation. The patch below should fix it, would you mind testing it Ben? Thanks. Any reason not to add the following to save ourselves in future? diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index d773dd4..6ab7b53 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -657,6 +657,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) unsigned int i; struct iommu_pool *p; + BUG_ON(!tbl-it_page_shift); + /* number of bytes needed for the bitmap */ sz = BITS_TO_LONGS(tbl-it_size) * sizeof(unsigned long); cheers -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] pci: fix kernel-doc notation warning
From: Randy Dunlap rdun...@infradead.org Fix a blank kernel-doc line to have an asterisk instead of being totally empty. This fixes the kernel-doc warning: Warning(drivers/pci/msi.c:962): bad line: Signed-off-by: Randy Dunlap rdun...@infradead.org --- drivers/pci/msi.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- lnx-314-rc1.orig/drivers/pci/msi.c +++ lnx-314-rc1/drivers/pci/msi.c @@ -959,7 +959,7 @@ EXPORT_SYMBOL(pci_disable_msi); /** * pci_msix_vec_count - return the number of device's MSI-X table entries * @dev: pointer to the pci_dev data structure of MSI-X device function - + * * This function returns the number of device's MSI-X table entries and * therefore the number of MSI-X vectors device is capable of sending. * It returns a negative errno if the device is not capable of sending MSI-X -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 00/16] drm/nouveau: initial support for GK20A (Tegra K1)
On Sun, Feb 2, 2014 at 9:44 PM, Alexandre Courbot acour...@nvidia.com wrote: One beginner question: is it appropriate to send kernel patches to the nouveau list in addition to dri-devel? The moderation messages I receive make me think that this list might rather be intended for general discussion. I usually do. The main thing is to make sure that they're To: Ben, since he's the one who will ultimately be picking them up. I think that if you're not subscribed, all the lists.freedesktop.org lists moderate you, but dri-devel is configured not to tell you about it. Also I've been getting bounce messages from nouveau@ complaining of too many cc's and so it's getting auto-moderated -- not sure who, if anyone, is an admin of the nouveau list. Hopefully someone :) -ilia -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
linux-next: Tree for Feb 3
Hi all, This tree fails (more than usual) the powerpc allyesconfig build. Changes since 20140131: Dropped tree: btrfs (needs cleaning up) The powerpc tree still had its build failure. The btrfs tree had lots of conflicts against Linus' tree so I dropped it for today. Non-merge commits (relative to Linus' tree): 983 1521 files changed, 21062 insertions(+), 7748 deletions(-) I have created today's linux-next tree at git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git (patches at http://www.kernel.org/pub/linux/kernel/next/ ). If you are tracking the linux-next tree using git, you should not use git pull to do so as that will try to merge the new linux-next release with the old one. You should use git fetch as mentioned in the FAQ on the wiki (see below). You can see which trees have been included by looking in the Next/Trees file in the source. There are also quilt-import.log and merge.log files in the Next directory. Between each merge, the tree was built with a ppc64_defconfig for powerpc and an allmodconfig for x86_64 and a multi_v7_defconfig for arm. After the final fixups (if any), it is also built with powerpc allnoconfig (32 and 64 bit), ppc44x_defconfig and allyesconfig (minus CONFIG_PROFILE_ALL_BRANCHES - this fails its final link) and i386, sparc, sparc64 and arm defconfig. These builds also have CONFIG_ENABLE_WARN_DEPRECATED, CONFIG_ENABLE_MUST_CHECK and CONFIG_DEBUG_INFO disabled when necessary. Below is a summary of the state of the merge. I am currently merging 208 trees (counting Linus' and 28 trees of patches pending for Linus' tree). Stats about the size of the tree over time can be seen at http://neuling.org/linux-next-size.html . Status of my local build tests will be at http://kisskb.ellerman.id.au/linux-next . If maintainers want to give advice about cross compilers/configs that work, we are always open to add more builds. Thanks to Randy Dunlap for doing many randconfig builds. And to Paul Gortmaker for triage and bug fixes. 
There is a wiki covering stuff to do with linux-next at http://linux.f-seidel.de/linux-next/pmwiki/ . Thanks to Frank Seidel. -- Cheers, Stephen Rothwells...@canb.auug.org.au $ git checkout master $ git reset --hard stable Merging origin/master (602456bf1699 Merge branch 'hwmon-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jdelvare/staging) Merging fixes/master (b0031f227e47 Merge tag 's2mps11-build' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator) Merging kbuild-current/rc-fixes (19514fc665ff arm, kbuild: make make install not depend on vmlinux) Merging arc-current/for-curr (7e22e91102c6 Linux 3.13-rc8) Merging arm-current/fixes (d326b65c57d6 ARM: fix building with gcc 4.6.4) Merging m68k-current/for-linus (56931d73697c m68k/mac: Make SCC reset work more reliably) Merging metag-fixes/fixes (3b2f64d00c46 Linux 3.11-rc2) Merging powerpc-merge/merge (b3084f4db3ae powerpc/thp: Fix crash on mremap) Merging sparc/master (9b0cd304f26b Merge branch 'drm-next' of git://people.freedesktop.org/~airlied/linux) Merging net/master (4fe46b9a4d0b vxlan: remove extra newline after function definition) Merging ipsec/master (965cdea82569 dccp: catch failed request_module call in dccp_probe init) Merging sound-current/for-linus (75fae117a5db ALSA: hda/hdmi - allow PIN_OUT to be dynamically enabled) Merging pci-current/for-linus (f0b75693cbb2 MAINTAINERS: Add DesignWare, i.MX6, Armada, R-Car PCI host maintainers) Merging wireless/master (7d0d46da750a Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net) Merging driver-core.current/driver-core-linus (90804ed61f24 Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs) Merging tty.current/tty-linus (413541dd66d5 Linux 3.13-rc5) Merging usb.current/usb-linus (90804ed61f24 Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs) Merging staging.current/staging-linus (77d143de7581 Merge branch 'for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/rw/uml) Merging char-misc.current/char-misc-linus (90804ed61f24 Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs) Merging input-current/for-linus (55df811f2066 Merge branch 'next' into for-linus) Merging md-current/for-linus (d47648fcf061 raid5: avoid finding discard stripe) Merging crypto-current/master (ee97dc7db4cb crypto: s390 - fix des and des3_ede ctr concurrency issue) Merging ide/master (9b0cd304f26b Merge branch 'drm-next' of git://people.freedesktop.org/~airlied/linux) Merging dwmw2/master (5950f0803ca9 pcmcia: remove RPX board stuff) Merging devicetree-current/devicetree/merge (6f041e99fc7b of: Fix NULL dereference in unflatten_and_copy()) Merging rr-fixes/fixes (7122c3e9154b scripts/link-vmlinux.sh: only filter kernel symbols for arm) Merging mfd-fixes/master (73beb63d290f mfd:
[PATCH v3 6/8] ARM: dts: sun7i: cubieboard2: Enable GMAC instead of EMAC
GMAC has better performance and fewer hardware issues. Use the GMAC in MII mode for ethernet instead of the EMAC. Signed-off-by: Chen-Yu Tsai w...@csie.org --- arch/arm/boot/dts/sun7i-a20-cubieboard2.dts | 27 --- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/arm/boot/dts/sun7i-a20-cubieboard2.dts b/arch/arm/boot/dts/sun7i-a20-cubieboard2.dts index 5c51cb8..7bf4935 100644 --- a/arch/arm/boot/dts/sun7i-a20-cubieboard2.dts +++ b/arch/arm/boot/dts/sun7i-a20-cubieboard2.dts @@ -19,21 +19,6 @@ compatible = cubietech,cubieboard2, allwinner,sun7i-a20; soc@01c0 { - emac: ethernet@01c0b000 { - pinctrl-names = default; - pinctrl-0 = emac_pins_a; - phy = phy1; - status = okay; - }; - - mdio@01c0b080 { - status = okay; - - phy1: ethernet-phy@1 { - reg = 1; - }; - }; - pinctrl@01c20800 { led_pins_cubieboard2: led_pins@0 { allwinner,pins = PH20, PH21; @@ -60,6 +45,18 @@ pinctrl-0 = i2c1_pins_a; status = okay; }; + + gmac: ethernet@01c5 { + pinctrl-names = default; + pinctrl-0 = gmac_pins_mii_a; + phy = phy1; + phy-mode = mii; + status = okay; + + phy1: ethernet-phy@1 { + reg = 1; + }; + }; }; leds { -- 1.9.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 5/8] ARM: dts: sun7i: cubietruck: Enable the GMAC
The CubieTruck uses the GMAC with an RGMII phy. Signed-off-by: Chen-Yu Tsai w...@csie.org --- arch/arm/boot/dts/sun7i-a20-cubietruck.dts | 12 1 file changed, 12 insertions(+) diff --git a/arch/arm/boot/dts/sun7i-a20-cubietruck.dts b/arch/arm/boot/dts/sun7i-a20-cubietruck.dts index f9dcb61..025ce52 100644 --- a/arch/arm/boot/dts/sun7i-a20-cubietruck.dts +++ b/arch/arm/boot/dts/sun7i-a20-cubietruck.dts @@ -51,6 +51,18 @@ pinctrl-0 = i2c2_pins_a; status = okay; }; + + gmac: ethernet@01c5 { + pinctrl-names = default; + pinctrl-0 = gmac_pins_rgmii_a; + phy = phy1; + phy-mode = rgmii; + status = okay; + + phy1: ethernet-phy@1 { + reg = 1; + }; + }; }; leds { -- 1.9.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 8/8] ARM: dts: sun7i: Add ethernet alias for GMAC
U-Boot will insert MAC address into the device tree image. It looks up ethernet[0-5] aliases to find the ethernet nodes. Alias GMAC as ethernet0, as it is the only ethernet controller used. Signed-off-by: Chen-Yu Tsai w...@csie.org --- arch/arm/boot/dts/sun7i-a20.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index 65fb8d0..c48fb11 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -17,7 +17,7 @@ interrupt-parent = gic; aliases { - ethernet0 = emac; + ethernet0 = gmac; }; cpus { -- 1.9.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 4/8] ARM: dts: sun7i: Add pin muxing options for the GMAC
The A20 has EMAC and GMAC muxed on the same pins. Add pin sets with gmac function for MII and RGMII mode to the DTSI. Signed-off-by: Chen-Yu Tsai w...@csie.org --- arch/arm/boot/dts/sun7i-a20.dtsi | 26 ++ 1 file changed, 26 insertions(+) diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index 5fbac23..65fb8d0 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -469,6 +469,32 @@ allwinner,drive = 0; allwinner,pull = 0; }; + + gmac_pins_mii_a: gmac_mii@0 { + allwinner,pins = PA0, PA1, PA2, + PA3, PA4, PA5, PA6, + PA7, PA8, PA9, PA10, + PA11, PA12, PA13, PA14, + PA15, PA16; + allwinner,function = gmac; + allwinner,drive = 0; + allwinner,pull = 0; + }; + + gmac_pins_rgmii_a: gmac_rgmii@0 { + allwinner,pins = PA0, PA1, PA2, + PA3, PA4, PA5, PA6, + PA7, PA8, PA10, + PA11, PA12, PA13, + PA15, PA16; + allwinner,function = gmac; + /* +* data lines in RGMII mode use DDR mode +* and need a higher signal drive strength +*/ + allwinner,drive = 3; + allwinner,pull = 0; + }; }; timer@01c20c00 { -- 1.9.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 7/8] ARM: dts: sun7i: olinuxino-micro: Enable GMAC instead of EMAC
GMAC has better performance and fewer hardware issues. Use the GMAC in MII mode for ethernet instead of the EMAC. Signed-off-by: Chen-Yu Tsai w...@csie.org --- arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts | 27 +++-- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts b/arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts index ead3013..b02a796 100644 --- a/arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts +++ b/arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts @@ -19,21 +19,6 @@ compatible = olimex,a20-olinuxino-micro, allwinner,sun7i-a20; soc@01c0 { - emac: ethernet@01c0b000 { - pinctrl-names = default; - pinctrl-0 = emac_pins_a; - phy = phy1; - status = okay; - }; - - mdio@01c0b080 { - status = okay; - - phy1: ethernet-phy@1 { - reg = 1; - }; - }; - pinctrl@01c20800 { led_pins_olinuxino: led_pins@0 { allwinner,pins = PH2; @@ -78,6 +63,18 @@ pinctrl-0 = i2c2_pins_a; status = okay; }; + + gmac: ethernet@01c5 { + pinctrl-names = default; + pinctrl-0 = gmac_pins_mii_a; + phy = phy1; + phy-mode = mii; + status = okay; + + phy1: ethernet-phy@1 { + reg = 1; + }; + }; }; leds { -- 1.9.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH,RFC] random: collect cpu randomness
On Sun, 2 February 2014 20:39:22 -0500, Theodore Ts'o wrote: The real question is how much overhead does it add, and is it worth it. Jörn, I take it that was the reason for creating an even faster, but weaker mixing function? Was the existing fast mix causing a measurable overhead, or was this just you being really paranoid about not adding anything to the various kernel fastpaths? It was paranoia. And I am still somewhat paranoid and don't trust my benchmark results yet. Maybe on a 1024-CPU Altix with a 100k-thread workload the overhead is too much. Just because I couldn't measure a difference on my wimpy notebook does not mean much. Jörn -- One of the painful things about our time is that those who feel certainty are stupid, and those with any imagination and understanding are filled with doubt and indecision. -- Bertrand Russell -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 0/8] Add Allwinner A20 GMAC ethernet support
Hi, This is the remaining part of v3 of the Allwinner A20 GMAC glue layer for stmmac. The stmmac driver changes have been merged through net-next. The remaining bits are clock and DT patches. The patches should be applied over my clock renaming patches. The Allwinner A20 SoC integrates an early version of dwmac IP from Synopsys. On top of that is a hardware glue layer. This layer needs to be configured before the dwmac can be used. Part of the glue layer is a clock mux, which controls the source and direction of the TX clock used by GMAC. Changes since v2: * Added more comments on GMAC clock driver * Drop CLK_SET_PARENT_GATE in GMAC clock driver * Use macro for max clock parents * Line wrapping Changes since v1: * Added optional reset control to stmmac driver core * Added non CONFIG_RESET_CONROLLER routines for the above change * Extended callback API, as discussed with Srinivas * Used new stmmac_of_data to pass features and callbacks, instead of platform data, as discussed * Seperated clock module glue layer into clock driver Cheers, ChenYu Chen-Yu Tsai (8): clk: sunxi: Add Allwinner A20/A31 GMAC clock unit ARM: dts: sun7i: Add GMAC clock node to sun7i DTSI ARM: dts: sun7i: Add GMAC controller node to sun7i DTSI ARM: dts: sun7i: Add pin muxing options for the GMAC ARM: dts: sun7i: cubietruck: Enable the GMAC ARM: dts: sun7i: cubieboard2: Enable GMAC instead of EMAC ARM: dts: sun7i: olinuxino-micro: Enable GMAC instead of EMAC ARM: dts: sun7i: Add ethernet alias for GMAC Documentation/devicetree/bindings/clock/sunxi.txt | 26 +++ arch/arm/boot/dts/sun7i-a20-cubieboard2.dts | 27 arch/arm/boot/dts/sun7i-a20-cubietruck.dts| 12 arch/arm/boot/dts/sun7i-a20-olinuxino-micro.dts | 27 arch/arm/boot/dts/sun7i-a20.dtsi | 71 ++- drivers/clk/sunxi/clk-sunxi.c | 83 +++ 6 files changed, 215 insertions(+), 31 deletions(-) -- 1.9.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 1/8] clk: sunxi: Add Allwinner A20/A31 GMAC clock unit
The Allwinner A20/A31 clock module controls the transmit clock source and interface type of the GMAC ethernet controller. Model this as a single clock for GMAC drivers to use. Signed-off-by: Chen-Yu Tsai w...@csie.org --- Documentation/devicetree/bindings/clock/sunxi.txt | 26 +++ drivers/clk/sunxi/clk-sunxi.c | 83 +++ 2 files changed, 109 insertions(+) diff --git a/Documentation/devicetree/bindings/clock/sunxi.txt b/Documentation/devicetree/bindings/clock/sunxi.txt index 0cf679b..f43b4c0 100644 --- a/Documentation/devicetree/bindings/clock/sunxi.txt +++ b/Documentation/devicetree/bindings/clock/sunxi.txt @@ -37,6 +37,7 @@ Required properties: allwinner,sun6i-a31-apb2-gates-clk - for the APB2 gates on A31 allwinner,sun4i-mod0-clk - for the module 0 family of clocks allwinner,sun7i-a20-out-clk - for the external output clocks + allwinner,sun7i-a20-gmac-clk - for the GMAC clock module on A20/A31 Required properties for all clocks: - reg : shall be the control register address for the clock. @@ -50,6 +51,9 @@ Required properties for all clocks: If the clock module only has one output, the name shall be the module name. +For allwinner,sun7i-a20-gmac-clk, the parent clocks shall be fixed rate +dummy clocks at 25 MHz and 125 MHz, respectively. See example. + Clock consumers should specify the desired clocks they use with a clocks phandle cell. Consumers that are using a gated clock should provide an additional ID in their clock property. 
This ID is the @@ -96,3 +100,25 @@ mmc0_clk: clk@01c20088 { clocks = osc24M, pll6 1, pll5 1; clock-output-names = mmc0; }; + +mii_phy_tx_clk: clk@2 { + #clock-cells = 0; + compatible = fixed-clock; + clock-frequency = 2500; + clock-output-names = mii_phy_tx; +}; + +gmac_int_tx_clk: clk@3 { + #clock-cells = 0; + compatible = fixed-clock; + clock-frequency = 12500; + clock-output-names = gmac_int_tx; +}; + +gmac_clk: clk@01c20164 { + #clock-cells = 0; + compatible = allwinner,sun7i-a20-gmac-clk; + reg = 0x01c20164 0x4; + clocks = mii_phy_tx_clk, gmac_int_tx_clk; + clock-output-names = gmac; +}; diff --git a/drivers/clk/sunxi/clk-sunxi.c b/drivers/clk/sunxi/clk-sunxi.c index 736fb60..0b361d2 100644 --- a/drivers/clk/sunxi/clk-sunxi.c +++ b/drivers/clk/sunxi/clk-sunxi.c @@ -379,6 +379,89 @@ static void sun7i_a20_get_out_factors(u32 *freq, u32 parent_rate, /** + * sun7i_a20_gmac_clk_setup - Setup function for A20/A31 GMAC clock module + * + * This clock looks something like this + * + * MII TX clock from PHY -|____| to GMAC core + * GMAC Int. RGMII TX clk |___\__/__gate---| to PHY + * Ext. 125MHz RGMII TX clk --|__divider__/| + * || + * + * The external 125 MHz reference is optional, i.e. GMAC can use its + * internal TX clock just fine. The A31 GMAC clock module does not have + * the divider controls for the external reference. + * + * To keep it simple, let the GMAC use either the MII TX clock for MII mode, + * and its internal TX clock for GMII and RGMII modes. The GMAC driver should + * select the appropriate source and gate/ungate the output to the PHY. + * + * Only the GMAC should use this clock. Altering the clock so that it doesn't + * match the GMAC's operation parameters will result in the GMAC not being + * able to send traffic out. The GMAC driver should set the clock rate and + * enable/disable this clock to configure the required state. The clock + * driver then responds by auto-reparenting the clock. 
+ */ + +#define SUN7I_A20_GMAC_GPIT2 +#define SUN7I_A20_GMAC_MASK0x3 +#define SUN7I_A20_GMAC_MAX_PARENTS 2 + +static void __init sun7i_a20_gmac_clk_setup(struct device_node *node) +{ + struct clk *clk; + struct clk_mux *mux; + struct clk_gate *gate; + const char *clk_name = node-name; + const char *parents[SUN7I_A20_GMAC_MAX_PARENTS]; + void *reg; + int i = 0; + + /* allocate mux and gate clock structs */ + mux = kzalloc(sizeof(struct clk_mux), GFP_KERNEL); + if (!mux) + return; + gate = kzalloc(sizeof(struct clk_gate), GFP_KERNEL); + if (!gate) { + kfree(mux); + return; + } + + reg = of_iomap(node, 0); + + of_property_read_string(node, clock-output-names, clk_name); + + while (i SUN7I_A20_GMAC_MAX_PARENTS + (parents[i] = of_clk_get_parent_name(node, i)) != NULL) + i++; + + /* set up gate and fixed rate properties */ + gate-reg = reg; + gate-bit_idx = SUN7I_A20_GMAC_GPIT; + gate-lock = clk_lock; + mux-reg = reg; + mux-mask = SUN7I_A20_GMAC_MASK; + mux-flags =
[PATCH v3 2/8] ARM: dts: sun7i: Add GMAC clock node to sun7i DTSI
The GMAC uses 1 of 2 sources for its transmit clock, depending on the PHY interface mode. Add both sources as dummy clocks, and as parents to the GMAC clock node. Signed-off-by: Chen-Yu Tsai w...@csie.org --- arch/arm/boot/dts/sun7i-a20.dtsi | 28 1 file changed, 28 insertions(+) diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index 1595e9a..fc7f470 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -314,6 +314,34 @@ }; /* +* The following two are dummy clocks, placeholders used +* on gmac_tx clock. The actual frequency and availability +* depends on the external PHY, operation mode and link +* speed. +*/ + mii_phy_tx_clk: clk@2 { + #clock-cells = 0; + compatible = fixed-clock; + clock-frequency = 2500; + clock-output-names = mii_phy_tx; + }; + + gmac_int_tx_clk: clk@3 { + #clock-cells = 0; + compatible = fixed-clock; + clock-frequency = 12500; + clock-output-names = gmac_int_tx; + }; + + gmac_tx_clk: clk@01c20164 { + #clock-cells = 0; + compatible = allwinner,sun7i-a20-gmac-clk; + reg = 0x01c20164 0x4; + clocks = mii_phy_tx_clk, gmac_int_tx_clk; + clock-output-names = gmac_tx; + }; + + /* * Dummy clock used by output clocks */ osc24M_32k: clk@1 { -- 1.9.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 3/8] ARM: dts: sun7i: Add GMAC controller node to sun7i DTSI
Signed-off-by: Chen-Yu Tsai w...@csie.org --- arch/arm/boot/dts/sun7i-a20.dtsi | 15 +++ 1 file changed, 15 insertions(+) diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index fc7f470..5fbac23 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -630,6 +630,21 @@ status = disabled; }; + gmac: ethernet@01c5 { + compatible = allwinner,sun7i-a20-gmac; + reg = 0x01c5 0x1; + interrupts = 0 85 4; + interrupt-names = macirq; + clocks = ahb_gates 49, gmac_tx_clk; + clock-names = stmmaceth, allwinner_gmac_tx; + snps,pbl = 2; + snps,fixed-burst; + snps,force_sf_dma_mode; + status = disabled; + #address-cells = 1; + #size-cells = 0; + }; + hstimer@01c6 { compatible = allwinner,sun7i-a20-hstimer; reg = 0x01c6 0x1000; -- 1.9.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 00/16] drm/nouveau: initial support for GK20A (Tegra K1)
On Mon, Feb 3, 2014 at 1:14 PM, Ilia Mirkin imir...@alum.mit.edu wrote: On Sun, Feb 2, 2014 at 9:44 PM, Alexandre Courbot acour...@nvidia.com wrote: One beginner question: is it appropriate to send kernel patches to the nouveau list in addition to dri-devel? The moderation messages I receive make me think that this list might rather be intended for general discussion. I usually do. The main thing is to make sure that they're To: Ben, since he's the one who will ultimately be picking them up. I think that if you're not subscribed, all the lists.freedesktop.org lists moderate you, but dri-devel is configured not to tell you about it. Also I've been getting bounce messages from nouveau@ complaining of too many cc's and so it's getting auto-moderated -- not sure who, if anyone, is an admin of the nouveau list. Hopefully someone :) The Nouveau list seems the most appropriate. There's not really any need to explicitly CC me either, I do watch the list :) -ilia ___ dri-devel mailing list dri-de...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] irq_work: allow certain work in hard irq context
On Sun, 2014-02-02 at 21:10 +0100, Sebastian Andrzej Siewior wrote: According to the backtrace both of them are trying to access the per-cpu hrtimer (sched_timer) in order to cancel but they seem to fail to get the timer lock here. They shouldn't spin there for minutes, I have no idea why they did so… Hm. per-cpu... I've been chasing an rt hotplug heisenbug that is pointing to per-cpu oddness. During sched domain re-construction while running Steven's stress script on 64 core box, we hit a freshly constructed domain with _no span_, build_sched_groups()-get_group() explodes when we meeting it. But if you try to watch the thing appear... it just doesn't. static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(d, cpu_map); if (alloc_state != sa_rootdomain) goto error; /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { sd = build_sched_domain(tl, cpu_map, attr, sd, i); BUG_ON(sd == spanless-alien) here.. if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl-flags SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd-flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } } /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd-parent) { sd-span_weight = cpumask_weight(sched_domain_span(sd)); if (sd-flags SD_OVERLAP) { if (build_overlap_sched_groups(sd, i)) goto error; } else { if (build_sched_groups(sd, i)) ..prevents meeting that alien here.. while hotplug locked. 
static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd-sd, cpu); struct sched_domain *child = sd-child; if (child) cpu = cpumask_first(sched_domain_span(child)); ^^^nr_cpus if (sg) { *sg = *per_cpu_ptr(sdd-sg, cpu); BOOM -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3 1/6] fat: add i_disksize to represent uninitialized size
Namjae Jeon linkinj...@gmail.com writes: From: Namjae Jeon namjae.j...@samsung.com Add i_disksize to represent uninitialized allocated size. And mmu_private represent initialized allocated size. Don't we need to update -i_disksize after cont_write_begin()? -- OGAWA Hirofumi hirof...@mail.parknet.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3 5/6] fat: permit to return phy block number by fibmap in fallocated region
Namjae Jeon linkinj...@gmail.com writes: From: Namjae Jeon namjae.j...@samsung.com Make the fibmap call the return the proper physical block number for any offset request in the fallocated range. Signed-off-by: Namjae Jeon namjae.j...@samsung.com Signed-off-by: Amit Sahrawat a.sahra...@samsung.com --- fs/fat/cache.c | 13 ++--- fs/fat/fat.h |3 +++ fs/fat/inode.c |3 +++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/fat/cache.c b/fs/fat/cache.c index a132666..d22c1a2 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -325,19 +325,26 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, last_block = (i_size_read(inode) + (blocksize - 1)) blocksize_bits; if (sector = last_block) { - if (!create) - return 0; - /* * Both -mmu_private and -i_disksize can access * on only allocation path. (caller must hold -i_mutex) */ last_block = (MSDOS_I(inode)-i_disksize + (blocksize - 1)) blocksize_bits; + if (!create) { + /* Map a block in fallocated region */ + if (atomic_read(MSDOS_I(inode)-beyond_isize)) + if (sector last_block) + goto out_map_cluster; + + return 0; + } + if (sector = last_block) return 0; } +out_map_cluster: cluster = sector (sbi-cluster_bits - sb-s_blocksize_bits); offset = sector (sbi-sec_per_clus - 1); cluster = fat_bmap_cluster(inode, cluster); diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 7b5851f..b884276 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -129,6 +129,9 @@ struct msdos_inode_info { struct hlist_node i_dir_hash; /* hash by i_logstart */ struct rw_semaphore truncate_lock; /* protect bmap against truncate */ struct inode vfs_inode; + + /* for getting block number beyond file size in case of fallocate */ + atomic_t beyond_isize; }; struct fat_slot_info { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 3636617..1c3192b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -256,7 +256,10 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) /* fat_get_cluster() assumes the requested blocknr 
isn't truncated. */ down_read(MSDOS_I(mapping-host)-truncate_lock); + /* To get block number beyond file size in fallocated region */ + atomic_set(MSDOS_I(mapping-host)-beyond_isize, 1); blocknr = generic_block_bmap(mapping, block, fat_get_block); + atomic_set(MSDOS_I(mapping-host)-beyond_isize, 0); up_read(MSDOS_I(mapping-host)-truncate_lock); This is racy. While user is using bmap, kernel can allocate new blocks. We should use another function for this. For example, something like fat_get_block_bmap() { [...] fat_get_block2(inode, iblock, max_blocks, bh_result, create, bmap); [...] } blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); -- OGAWA Hirofumi hirof...@mail.parknet.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3 2/6] fat: add fat_fallocate operation
Sorry for long delay. Namjae Jeon linkinj...@gmail.com writes: + if (mode FALLOC_FL_KEEP_SIZE) { + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - round_up(MSDOS_I(inode)-mmu_private, + sbi-cluster_size); This should use -i_disksize? [...] + /* Release unwritten fallocated blocks on inode eviction. */ + if (MSDOS_I(inode)-mmu_private MSDOS_I(inode)-i_disksize) { + int err; + fat_truncate_blocks(inode, MSDOS_I(inode)-mmu_private); + /* Fallocate results in updating the i_start/iogstart + * for the zero byte file. So, make it return to + * original state during evict and commit it + * synchrnously to avoid any corruption on the next + * access to the cluster chain for the file. + */ + err = fat_sync_inode(inode); Ah, good catch. We have to update i_size. I was forgetting about this. Well, sync inode unconditionally would not be good. Maybe, we better to use __fat_write_inode() with inode_needs_sync() or such. -- OGAWA Hirofumi hirof...@mail.parknet.co.jp -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] arm: document mach-virt platform.
On Thu, Jan 30, 2014 at 04:11:02PM +, Ian Campbell wrote: mach-virt has existed for a while but it is not written down what it actually consists of. Although it seems a bit unusual to document a binding for an entire platform since mach-virt is entirely virtual it is helpful to have something to refer to in the absence of a single concrete implementation. I've done my best to capture the requirements based on the git log and my memory/understanding. [...] + +The platform may also provide hypervisor specific functionality +(e.g. PV I/O), if it does so then this functionality must be +discoverable (directly or indirectly) via device tree. While this is obviously true, I'm not sure I see the value of this text. Isn't it more essential to just say that *any* functionality provided to the platform must be discoverable via device tree? -Christoffer -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] arm: document mach-virt platform.
On Thu, Jan 30, 2014 at 11:54:46AM -0500, Christopher Covington wrote: Hi Ian, On 01/30/2014 11:11 AM, Ian Campbell wrote: mach-virt has existed for a while but it is not written down what it actually consists of. Although it seems a bit unusual to document a binding for an entire platform since mach-virt is entirely virtual it is helpful to have something to refer to in the absence of a single concrete implementation. I've done my best to capture the requirements based on the git log and my memory/understanding. While here remove the xenvm dts example, the Xen tools will now build a suitable mach-virt compatible dts when launching the guest. [...] +The platform may also provide hypervisor specific functionality +(e.g. PV I/O), if it does so then this functionality must be +discoverable (directly or indirectly) via device tree. I think it would be informative to provide pointers here to commonly used paravirtualized devices, especially VirtIO PCI/MMIO. I disagree: that would only encourage limited testing or assumptions about these specific devices when really this platform is just a bare-bones platform driven by device tree which should make no preference, whatsoever, about which devices are used with the platform. -Christoffer -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHSET 0/5] tracing/uprobes: Support multi buffer and event trigger
Ping! On Fri, 17 Jan 2014 17:08:35 +0900, Namhyung Kim wrote: Hello, (Resending with LKML CC'ed) This patchset tries to add support for recent multi buffer and event trigger changes to uprobes. The multi buffer support patch is an updated version of Zovi's previous patch v6 [1]. Zovi, please tell me if you have any update and/or issues with this. Masami and Oleg, I kept your Reviewed-by's in the patch since I think it's just an rebase. Please take a look again to see whether I added some mistakes. You can also get it from 'uprobe/trigger-v1' branch in my tree git://git.kernel.org/pub/scm/linux/kernel/git/namhyung/linux-perf.git Any comments are welcome, thanks Namhyung [1] https://lkml.org/lkml/2013/7/4/165 Cc: Masami Hiramatsu masami.hiramatsu...@hitachi.com Cc: Oleg Nesterov o...@redhat.com Cc: Srikar Dronamraju sri...@linux.vnet.ibm.com Cc: zhangwei(Jovi) jovi.zhang...@huawei.com Cc: Tom Zanussi tom.zanu...@linux.intel.com Namhyung Kim (4): tracing/uprobes: Rename uprobe_{trace,perf}_print() functions tracing/uprobes: Move argument fetching to uprobe_dispatcher() tracing/uprobes: Support event triggering tracing/uprobes: Support mix of ftrace and perf zhangwei(Jovi) (1): tracing/uprobes: Support ftrace_event_file base multibuffer kernel/trace/trace_kprobe.c | 17 kernel/trace/trace_probe.h | 17 kernel/trace/trace_uprobe.c | 191 +++- 3 files changed, 151 insertions(+), 74 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 1/6] audit: Enable arm64 support
Richard, On 01/30/2014 07:36 AM, Richard Guy Briggs wrote: On 14/01/29, Richard Guy Briggs wrote: On 14/01/27, AKASHI Takahiro wrote: [To audit maintainers] On 01/23/2014 11:18 PM, Catalin Marinas wrote: On Fri, Jan 17, 2014 at 08:13:14AM +, AKASHI Takahiro wrote: --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -327,6 +327,8 @@ enum { /* distinguish syscall tables */ #define __AUDIT_ARCH_64BIT 0x8000 #define __AUDIT_ARCH_LE 0x4000 +#define AUDIT_ARCH_AARCH64 (EM_AARCH64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define AUDIT_ARCH_AARCH64EB (EM_AARCH64|__AUDIT_ARCH_64BIT) #define AUDIT_ARCH_ALPHA (EM_ALPHA|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_ARM(EM_ARM|__AUDIT_ARCH_LE) #define AUDIT_ARCH_ARMEB (EM_ARM) diff --git a/init/Kconfig b/init/Kconfig index 79383d3..3aae602 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -284,7 +284,7 @@ config AUDIT config AUDITSYSCALL bool Enable system-call auditing support - depends on AUDIT (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM AEABI !OABI_COMPAT)) + depends on AUDIT (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM AEABI !OABI_COMPAT) || ARM64) The usual comment for such changes: could you please clean this up and just use something like depends on HAVE_ARCH_AUDITSYSCALL? Do you agree to this change? If so, I can create a patch, but have some concerns: 1) I can't verify it on other architectures than (arm ) arm64. 2) Some architectures (microblaze, mips, openrisc) are not listed here, but their ptrace.c have a call to audit_syscall_entry/exit(). 
(audit_syscall_entry/exit are null if !AUDITSYSCALL, though) I can try: ppc s390 x86_64 ppc64 i686 s390x These arches above all pass compile and basic tests with the following patches applied: audit: correct a type mismatch in audit_syscall_exit() pending (already upstream) audit: Modify a set of system calls in audit class definitions (already upstream) [PATCH v3] audit: Add generic compat syscall support [PATCH v2] audit: Enable arm64 support [PATCH v2] arm64: Add regs_return_value() in syscall.h [PATCH v2] arm64: Add audit support [PATCH v2] arm64: audit: Add 32-bit (compat) syscall support [PATCH v2] arm64: audit: Add makefile rule to create unistd_32.h for compat syscalls [PATCH v2] arm64: audit: Add audit hook in ptrace/syscall_trace I think that you missed Catalin's suggestion. Please use the patch I will post after this message and try it again, please? Thanks, -Takahiro AKASHI So I'm afraid that the change might break someone's assumption. Thanks, -Takahiro AKASHI - RGB - RGB -- Richard Guy Briggs rbri...@redhat.com Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat Remote, Ottawa, Canada Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] audit: Add CONFIG_HAVE_ARCH_AUDITSYSCALL
Currently AUDITSYSCALL has a long list of architecture depencency: depends on AUDIT (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM AEABI !OABI_COMPAT)) The purpose of this patch is to replace it with HAVE_ARCH_AUDITSYSCALL for simplicity. Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org --- arch/arm/Kconfig |1 + arch/ia64/Kconfig |1 + arch/parisc/Kconfig|1 + arch/powerpc/Kconfig |1 + arch/s390/Kconfig |1 + arch/sh/Kconfig|1 + arch/sparc/Kconfig |1 + arch/um/Kconfig.common |1 + arch/x86/Kconfig |1 + init/Kconfig |5 - 10 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index c1f1a7e..cf69f89 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -23,6 +23,7 @@ config ARM select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL if (AEABI !OABI_COMPAT) select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KGDB select HAVE_ARCH_SECCOMP_FILTER if (AEABI !OABI_COMPAT) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 4e4119b..9143d91 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -43,6 +43,7 @@ config IA64 select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_AUDITSYSCALL default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index b5f1858..0821e83 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -28,6 +28,7 @@ config PARISC select CLONE_BACKWARDS select TTY # Needed for pdc_cons.c select HAVE_DEBUG_STACKOVERFLOW + select HAVE_ARCH_AUDITSYSCALL help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b44b52c..96627d6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -139,6 +139,7 @@ config PPC select OLD_SIGACTION if PPC32 select HAVE_DEBUG_STACKOVERFLOW select 
HAVE_IRQ_EXIT_ON_IRQ_STACK + select HAVE_ARCH_AUDITSYSCALL config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 1e1a03d..b3b9853 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -103,6 +103,7 @@ config S390 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 9b0979f..675fb7c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -42,6 +42,7 @@ config SUPERH select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND select OLD_SIGACTION + select HAVE_ARCH_AUDITSYSCALL help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index d4f7a6a..7f7ad7e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -76,6 +76,7 @@ config SPARC64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select HAVE_C_RECORDMCOUNT select NO_BOOTMEM + select HAVE_ARCH_AUDITSYSCALL config ARCH_DEFCONFIG string diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index 21ca44c..6915d28 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -1,6 +1,7 @@ config UML bool default y + select HAVE_ARCH_AUDITSYSCALL select HAVE_UID16 select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e903c71..6ef682f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select RTC_LIB select HAVE_DEBUG_STACKOVERFLOW select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 + select HAVE_ARCH_AUDITSYSCALL config INSTRUCTION_DECODER def_bool y diff --git a/init/Kconfig b/init/Kconfig index 79383d3..9fe22d2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -282,9 +282,12 @@ config AUDIT logging of avc messages output). 
Does not do system-call auditing without CONFIG_AUDITSYSCALL. +config HAVE_ARCH_AUDITSYSCALL + bool + config AUDITSYSCALL bool Enable system-call auditing support - depends on AUDIT (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM AEABI !OABI_COMPAT)) + depends on AUDIT HAVE_ARCH_AUDITSYSCALL default y if SECURITY_SELINUX help Enable
[PATCH] f2fs: remove the ugly pointer conversion
This patch modifies the use of bi_private to remove pointer chasing for sbi. Previously, we had a bi_private structure, but it needs memory allocation. So this patch uses bi_private by the sbi pointer and adds a completion pointer into the sbi. This can achieve no memory allocation and nice use of the bi_private. Signed-off-by: Jaegeuk Kim jaegeuk@samsung.com --- fs/f2fs/data.c | 11 +++ fs/f2fs/f2fs.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 20c3c64..d175ae3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) static void f2fs_write_end_io(struct bio *bio, int err) { - struct f2fs_sb_info *sbi = F2FS_SB(bio-bi_io_vec-bv_page-mapping-host-i_sb); + struct f2fs_sb_info *sbi = bio-bi_private; struct bio_vec *bvec; int i; @@ -61,8 +61,10 @@ static void f2fs_write_end_io(struct bio *bio, int err) dec_page_count(sbi, F2FS_WRITEBACK); } - if (bio-bi_private) - complete(bio-bi_private); + if (sbi-wait_io) { + complete(sbi-wait_io); + sbi-wait_io = NULL; + } if (!get_pages(sbi, F2FS_WRITEBACK) !list_empty(sbi-cp_wait.task_list)) @@ -85,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio-bi_bdev = sbi-sb-s_bdev; bio-bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); bio-bi_end_io = is_read ? 
f2fs_read_end_io : f2fs_write_end_io; + bio-bi_private = sbi; return bio; } @@ -112,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) */ if (fio-type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); - io-bio-bi_private = wait; + io-sbi-wait_io = wait; submit_bio(rw, io-bio); wait_for_completion(wait); } else { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 55288d2..aeff132 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -398,6 +398,7 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE];/* for write bios */ + struct completion *wait_io; /* for completion bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ -- 1.8.4.474.g128a96c -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] cpufreq: cpu0: make THERMAL_CPU support optional
Cc'ing the guy who introduced this bug.. On 2 February 2014 04:20, Rob Herring robherri...@gmail.com wrote: From: Rob Herring r...@kernel.org The addition of THERMAL and THERMAL_CPU selections causes a kconfig warning on highbank platforms: warning: (ARM_HIGHBANK_CPUFREQ) selects GENERIC_CPUFREQ_CPU0 which has unmet direct dependencies (ARCH_HAS_CPUFREQ CPU_FREQ HAVE_CLK REGULATOR OF THERMAL CPU_THERMAL) The cpufreq-cpu0 driver does not require thermal zone support, and it should be selectable independently. Add a new kconfig option to enable this feature. Reported-by: Olof Johansson o...@lixom.net Cc: Rafael J. Wysocki r...@rjwysocki.net Cc: Viresh Kumar viresh.ku...@linaro.org Cc: cpuf...@vger.kernel.org Signed-off-by: Rob Herring r...@kernel.org --- drivers/cpufreq/Kconfig | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 4b029c0..a197a04a 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -185,7 +185,7 @@ config CPU_FREQ_GOV_CONSERVATIVE config GENERIC_CPUFREQ_CPU0 tristate Generic CPU0 cpufreq driver - depends on HAVE_CLK REGULATOR OF THERMAL CPU_THERMAL + depends on HAVE_CLK REGULATOR OF That's fine as code would still compile due to dummy routines.. select PM_OPP help This adds a generic cpufreq driver for CPU0 frequency management. @@ -194,6 +194,14 @@ config GENERIC_CPUFREQ_CPU0 If in doubt, say N. +config GENERIC_CPUFREQ_CPU0_THERMAL + bool Thermal zone support for Generic CPU0 cpufreq + depends on GENERIC_CPUFREQ_CPU0 + select THERMAL + select THERMAL_CPU + help + This adds thermal support to the generic cpufreq driver for CPU0. But do we really need this? Let the platform enable THERMAL and THERMAL_CPU themselves, as this was the case currently. 
-- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH TRIVIAL] mm: vmscan: shrink_slab: rename max_pass - freeable
On Sun, 2 Feb 2014, Vladimir Davydov wrote: The name `max_pass' is misleading, because this variable actually keeps the estimate number of freeable objects, not the maximal number of objects we can scan in this pass, which can be twice that. Rename it to reflect its actual meaning. Signed-off-by: Vladimir Davydov vdavy...@parallels.com Acked-by: David Rientjes rient...@google.com -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] x86, perf, p4: Counter corruption when using lots of perf groups
On Wed, Jan 29, 2014 at 03:17:17PM -0500, Don Zickus wrote: I am not entirely sure on the corruption path, but what happens is: o perf schedules a group with p4_pmu_schedule_events() o inside p4_pmu_schedule_events(), it notices an hwc pointer is being reused but for a different cpu, so it 'swaps' the config bits and returns the updated 'assign' array with a _new_ index. o perf schedules another group with p4_pmu_schedule_events() o inside p4_pmu_schedule_events(), it notices an hwc pointer is being reused (the same one as above) but for the _same_ cpu [BUG!!], so it updates the 'assign' array to use the _old_ (wrong cpu) index because the _new_ index is in an earlier part of the 'assign' array (and hasn't been committed yet). o perf commits the transaction using the wrong index and corrupts the other cpu Thanks for the fix Don! I fear I won't be able to look precisely tonight, so could it wait until tomorrow? (If it's critical sure such fix should do the trick). There is no rush. Early next week is fine too. :-) Hi Don, sorry for delay. I thought maybe extending match_prev_assignment() would be better (ie to figure out if previous event can run without reprogramming the counter) but this makes code only harder (and what is worse -- having no physical access to p4 machine leaves no chance to test changes). So eventually I think your patch does the same thing as I had in mind but in different way. Thus Acked-by: Cyrill Gorcunov gorcu...@openvz.org thanks a lot! -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
RE: [PATCH 11/34] benet: Use pci_enable_msix_range()
-Original Message- From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org] On Behalf Of Alexander Gordeev As result of deprecation of MSI-X/MSI enablement functions pci_enable_msix() and pci_enable_msi_block() all drivers using these two interfaces need to be updated to use the new pci_enable_msi_range() and pci_enable_msix_range() interfaces. Signed-off-by: Alexander Gordeev agord...@redhat.com Acked-by: Sathya Perla sathya.pe...@emulex.com --- drivers/net/ethernet/emulex/benet/be_main.c | 31 +++--- 1 files changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 04ac9c6..f55c09b 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2505,7 +2505,7 @@ static void be_msix_disable(struct be_adapter *adapter) static int be_msix_enable(struct be_adapter *adapter) { - int i, status, num_vec; + int i, num_vec; struct device *dev = adapter-pdev-dev; /* If RoCE is supported, program the max number of NIC vectors that @@ -2521,24 +2521,11 @@ static int be_msix_enable(struct be_adapter *adapter) for (i = 0; i num_vec; i++) adapter-msix_entries[i].entry = i; - status = pci_enable_msix(adapter-pdev, adapter-msix_entries, num_vec); - if (status == 0) { - goto done; - } else if (status = MIN_MSIX_VECTORS) { - num_vec = status; - status = pci_enable_msix(adapter-pdev, adapter-msix_entries, - num_vec); - if (!status) - goto done; - } + num_vec = pci_enable_msix_range(adapter-pdev, adapter-msix_entries, + MIN_MSIX_VECTORS, num_vec); + if (num_vec 0) + goto fail; - dev_warn(dev, MSIx enable failed\n); - - /* INTx is not supported in VFs, so fail probe if enable_msix fails */ - if (!be_physfn(adapter)) - return status; - return 0; -done: if (be_roce_supported(adapter) num_vec MIN_MSIX_VECTORS) { adapter-num_msix_roce_vec = num_vec / 2; dev_info(dev, enabled %d MSI-x vector(s) for RoCE\n, @@ -2550,6 +2537,14 @@ done: 
dev_info(dev, enabled %d MSI-x vector(s) for NIC\n, adapter-num_msix_vec); return 0; + +fail: + dev_warn(dev, MSIx enable failed\n); + + /* INTx is not supported in VFs, so fail probe if enable_msix fails */ + if (!be_physfn(adapter)) + return num_vec; + return 0; } static inline int be_msix_vec_get(struct be_adapter *adapter, -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] memcg: export kmemcg cache id via cgroup fs
On Sun, 2 Feb 2014, Vladimir Davydov wrote: Per-memcg kmem caches are named as follows: global-cache-name(cgroup-kmem-id:cgroup-name) where cgroup-kmem-id is the unique id of the memcg the cache belongs to, cgroup-name is the relative name of the memcg on the cgroup fs. Cache names are exposed to userspace for debugging purposes (e.g. via sysfs in case of slub or via dmesg). Using relative names makes it impossible in general (in case the cgroup hierarchy is not flat) to find out which memcg a particular cache belongs to, because cgroup-kmem-id is not known to the user. Since using absolute cgroup names would be an overkill, let's fix this by exporting the id of kmem-active memcg via cgroup fs file memory.kmem.id. Hmm, I'm not sure exporting additional information is the best way to do it only for this purpose. I do understand the problem in naming collisions if the hierarchy isn't flat and we typically work around that by ensuring child memcgs still have a unique memcg. This isn't only a problem in slab cache naming, we also avoid printing the entire absolute names for things like the oom killer. So it would be nice to have consensus on how people are supposed to identify memcgs with a hierarchy: either by exporting information like the id like you do here (but leave the oom killer still problematic) or by insisting people name their memcgs with unique names if they care to differentiate them. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] acpi-cpufreq: De-register cpu notifier and free struct msr on error.
On 28 January 2014 09:28, Konrad Rzeszutek Wilk kon...@kernel.org wrote: If cpufreq_register_driver() fails we would free the acpi driver related structures but not free the ones allocated by acpi_cpufreq_boost_init() function. This meant that as the driver error-ed out and a CPU online/offline event came we would crash and burn as one of the CPU notifiers would point to garbage. This fixes a regression that commit cfc9c8ed03e4d908f2388af8815f44c87b503aaf acpi-cpufreq: Adjust the code to use the common boost attribute introduced. CC: Lukasz Majewski l.majew...@samsung.com CC: Myungjoo Ham myungjoo@samsung.com CC: Viresh Kumar viresh.ku...@linaro.org CC: Rafael J. Wysocki rafael.j.wyso...@intel.com CC: Boris Ostrovsky boris.ostrov...@oracle.com Signed-off-by: Konrad Rzeszutek Wilk konrad.w...@oracle.com --- drivers/cpufreq/acpi-cpufreq.c |5 +++-- 1 files changed, 3 insertions(+), 2 deletions(-) Acked-by: Viresh Kumar viresh.ku...@linaro.org -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH 3/3] idle: store the idle state index in the struct rq
Hi Daniel, On 01/31/2014 03:45 PM, Daniel Lezcano wrote: On 01/31/2014 09:45 AM, Preeti Murthy wrote: Hi, On Thu, Jan 30, 2014 at 10:55 PM, Daniel Lezcano daniel.lezc...@linaro.org wrote: On 01/30/2014 05:35 PM, Peter Zijlstra wrote: On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote: struct cpuidle_state *state = drv-states[rq-index]; And from the state, we have the following informations: struct cpuidle_state { [ ... ] unsigned intexit_latency; /* in US */ int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ [ ... ] }; Right, but can we say that a higher index will save more power and have a higher exit latency? Or is a driver free to have a random mapping from idle_index to state? If the driver does its own random mapping that will break the governor logic. So yes, the states are ordered, the higher the index is, the more you save power and the higher the exit latency is. The above point holds true for only the ladder governor which sees the idle states indexed in the increasing order of target_residency/exit_latency. The cpuidle framework has been modified for both governor, see commit 8aef33a7. The power field was initially used to do the selection, but no power value was ever used to filled this field by any hardware. So the field was arbitrarily filled with a decreasing value (-1, -2, -3 ...), and used by the governor's select function. The patch above just removed this field and the condition on power for 'select' assuming the idle state are power ordered in the array. Ok. Looking at commit id 71abbbf856a0, it looks like the primary motivation for it was the power_usage numbers of each idle state. But if that went unused, then it perhaps makes sense to revert that patch. Commit 8aef33a7 pretty much did that. However I think it overlooked the menu_select() function where the the search iterates through all the idle states introduced by the above mentioned commit again. 
Since its purpose is outdated as per what you say, its best if we correct this now as per the below post that you have pointed to. [RFC PATCH] cpuidle: reduce unnecessary loop in c-state selection However this is not true as far as I can see in the menu governor. It acknowledges the dynamic ordering of idle states as can be seen in the menu_select() function in the menu governor, where the idle state for the CPU gets chosen. You will notice that, even if it is found that the predicted idle time of the CPU is smaller than the target residency of an idle state, the governor continues to search for suitable idle states in the higher indexed states although it should have halted if the idle states' were ordered according to their target residency.. The same holds for exit_latency. I am not sure to get the point. Actually, this loop should be just optimized to backward search the idle state like cpuidle_play_dead does There is also a patch proposed by Alex Shi about this loop. [RFC PATCH] cpuidle: reduce unnecessary loop in c-state selection http://comments.gmane.org/gmane.linux.power-management.general/42124 But again if we are copying the exit_latency and target_residency numbers of the idle state entered, into the rq as soon as the idle state for the CPU is chosen, as per the discussion on this thread, then I guess the ordering of the idle states in the cpuidle state table does not matter. Thanks Regards Preeti U Murthy -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 6/6] mm, hugetlb: improve page-fault scalability
On Fri, Jan 31, 2014 at 09:36:46AM -0800, Davidlohr Bueso wrote: From: Davidlohr Bueso davidl...@hp.com The kernel can currently only handle a single hugetlb page fault at a time. This is due to a single mutex that serializes the entire path. This lock protects from spurious OOM errors under conditions of low availability of free hugepages. This problem is specific to hugepages, because it is normal to want to use every single hugepage in the system - with normal pages we simply assume there will always be a few spare pages which can be used temporarily until the race is resolved. Address this problem by using a table of mutexes, allowing a better chance of parallelization, where each hugepage is individually serialized. The hash key is selected depending on the mapping type. For shared ones it consists of the address space and file offset being faulted; while for private ones the mm and virtual address are used. The size of the table is selected based on a compromise of collisions and memory footprint of a series of database workloads. Hello, Thanks for doing this patchset. :) Just one question! Why do we need a separate hash key depending on the mapping type? Thanks. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] mmc:sdhci: handle busy-end interrupt during command
It is fully legal for a controller to start handling busy-end interrupt before it has signaled that the command has completed. So make sure we do things in the proper order; otherwise the command interrupt is ignored, which can cause unexpected operations. This was found on some Toshiba eMMC devices with the warning below. mmc0: Got command interrupt 0x0001 even though no command operation was in progress. Signed-off-by: Hankyung Yu hankyung...@lge.com Signed-off-by: Chanho Min chanho@lge.com --- drivers/mmc/host/sdhci.c | 17 +++-- include/linux/mmc/sdhci.h |1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index bd8a098..21f98e7 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1016,6 +1016,7 @@ void sdhci_send_command(struct sdhci_host *host, struct mmc_command *cmd) mod_timer(host-timer, jiffies + 10 * HZ); host-cmd = cmd; + host-busy_handle = 0; sdhci_prepare_data(host, cmd); @@ -2271,8 +2272,12 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask) if (host-cmd-data) DBG(Cannot wait for busy signal when also doing a data transfer); - else if (!(host-quirks SDHCI_QUIRK_NO_BUSY_IRQ)) + else if (!(host-quirks SDHCI_QUIRK_NO_BUSY_IRQ) +!host-busy_handle) { + /* Mark that command complete before busy is ended */ + host-busy_handle = 1; return; + } /* The controller does not support the end-of-busy IRQ, * fall through and take the SDHCI_INT_RESPONSE */ @@ -2335,7 +2340,15 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) */ if (host-cmd (host-cmd-flags MMC_RSP_BUSY)) { if (intmask SDHCI_INT_DATA_END) { - sdhci_finish_command(host); + /* +* Some cards handle busy-end interrupt +* before the command completed, so make +* sure we do things in the proper order.
+*/ + if (host-busy_handle) + sdhci_finish_command(host); + else + host-busy_handle = 1; return; } } diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 3e781b8..0118020 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -148,6 +148,7 @@ struct sdhci_host { struct mmc_command *cmd;/* Current command */ struct mmc_data *data; /* Current data request */ unsigned int data_early:1; /* Data finished before cmd */ + unsigned int busy_handle:1; /* Handling the order of Busy-end */ struct sg_mapping_iter sg_miter;/* SG state for PIO */ unsigned int blocks;/* remaining PIO blocks */ -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Is it ok for deferrable timer wakeup the idle cpu?
Sorry was away for short vacation. On 28 January 2014 19:20, Frederic Weisbecker fweis...@gmail.com wrote: On Thu, Jan 23, 2014 at 07:50:40PM +0530, Viresh Kumar wrote: Wait, I got the wrong code here. That's wasn't my initial intention. I actually wanted to write something like this: - wake_up_nohz_cpu(cpu); + if (!tbase_get_deferrable(timer-base) || idle_cpu(cpu)) + wake_up_nohz_cpu(cpu); Will that work? Something is seriously wrong with me, again wrote rubbish code. Let me phrase what I wanted to write :) don't send IPI to a idle CPU for a deferrable timer. Probably I code it correctly this time atleast. - wake_up_nohz_cpu(cpu); + if (!(tbase_get_deferrable(timer-base) idle_cpu(cpu))) + wake_up_nohz_cpu(cpu); Well, this is going to wake up the target from its idle state, which is what we want to avoid if the timer is deferrable, right? Yeah, sorry for doing it for second time :( The simplest thing we want is: if (!tbase_get_deferrable(timer-base) || tick_nohz_full_cpu(cpu)) wake_up_nohz_cpu(cpu); This spares the IPI for the common case where the timer is deferrable and we run in periodic or dynticks-idle mode (which should be 99.99% of the existing workloads). I wasn't looking at this problem with NO_HZ_FULL in mind. As I thought its only about if the CPU is idle or not. And so the solution I was talking about was: don't send IPI to a idle CPU for a deferrable timer. But I see that still failing with the code you wrote. For normal cases where we don't enable NO_HZ_FULL, we will still end up waking up idle CPUs which is what Lei Wen reported initially. Also if a CPU is marked for NO_HZ_FULL and is not idle currently then we wouldn't send a IPI for a deferrable timer. But we actually need that, so that we can reevaluate the timers order again? Then we can later optimize that and spare the IPI on full dynticks CPUs when they run idle, but that require some special care about subtle races which can't be dealt with a simple test on idle_cpu(target). 
And power consumption in full dynticks is already very suboptimized anyway. So I suggest we start simple with the above test, and a big fat comment which explains what we are doing and what needs to be done in the future. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 0/3] Deferrable timers support for timerfd API
Dear John, hello could we figure out without Thomas advice? Maybe it worth to propose timerfd and posix timer flag unification patch? On 01/21/2014 11:12 PM, John Stultz wrote: On 01/13/2014 02:43 AM, Alexey Perevalov wrote: Hello dear community. This is reworked patch set of original Anton's Vorontsov proposal regarding unified deferrable timers in the user space. http://lwn.net/Articles/514707/ I decided to resubmit it due we found it usefull for us too. timerfd was modified since Anton's commit, Alarm support was added. This isn't only rebase. Anton's previous version used deferrable timer in couple with hrtimer. This version uses only deferrable timer. It mean the behaviour of overrun number is different. e.g. if you don't poll one second timer for a 10 seconds - you'll get 10 overruns with hrtimer, but for deferrable timer it could be another value. Sorry, last week was a little crazy and I didn't get a chance to closely review this. But looking at this my major conceptual objection with the previous patchset (introducing the new clockid) is gone. My remaining conceptual concern here is that the TIMER_DEFERRABLE flag is a timerfd only construct here, and I worry we should make sure we think this through well enough that the same functionality can be supported via other timer interfaces (like clock_nanosleep, etc), which may mean the functionality should be pushed more deeply into the hrtimer subsystem. So main suggestion here is to make sure you cc Thomas Gleixner on future iterations, so he can provide some thoughts on what the best approach might be here. I know he also has some plans that might collide with the jiffies_to_ktime work. Thomas: Any thought here? Should we be trying to unify the timerfd flags and the posix timer flags (specifically things like TIMER_CANCEL_ON_SET, which is currently timerfd-only)? Should a deferrable flag be added to the hrtimer core or left to the timer wheel? 
thanks -john -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- Best regards, Alexey Perevalov, Leading software engineer, phone: +7 (495) 797 25 00 ext 3969 e-mail: a.pereva...@samsung.com mailto:a.pereva...@samsumng.com Mobile group, Samsung RD Institute Rus 12 Dvintsev street, building 1 127018, Moscow, Russian Federation -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v4 0/1] audit: generic compat system call support
Arm64 supports 32-bit mode (AArch32) and 64-bit mode (AArch64). To enable audit on arm64, we want to use lib/audit.c and re-work it to support compat system calls as well without copying it under arch sub-directory. Since this patch is implemented in much the same way as on existing bi-architectures (ie. ppc, s390, sparc and x86), it's not difficult for them to utilize this generic code instead of their own implementation. The code was tested on armv8 fast model with 64-bit and 32-bit userland by using modified audit-test-code. As this patch is mandatory for my system call audit support for arm64 patch, please review it as well for better understanding. Changes v2 -> v3: * Specify AUDIT_CLASS_XYZ_32 instead of AUDIT_CLASS_XYZ when registering compat syscalls (bug fix) Changes v3 -> v4: * Add CONFIG_AUDIT_COMPAT_GENERIC to compile in compat_audit.c * Re-define audit_is_compat() in generic way in order to eliminate necessity of asm/audit.h. AKASHI Takahiro (1): audit: Add generic compat syscall support include/linux/audit.h |8 +++ include/uapi/linux/audit.h |6 ++ lib/Kconfig|5 + lib/Makefile |1 + lib/audit.c| 15 - lib/compat_audit.c | 50 6 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 lib/compat_audit.c -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v4 1/1] audit: Add generic compat syscall support
lib/audit.c provides a generic definition for auditing system calls. This patch extends it for compat syscall support on bi-architectures (32/64-bit) by adding lib/compat_audit.c. What is required to support this feature are: * add asm/unistd32.h for compat system call names * enable CONFIG_AUDIT_COMPAT_GENERIC Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org --- include/linux/audit.h |8 +++ include/uapi/linux/audit.h |6 ++ lib/Kconfig|5 + lib/Makefile |1 + lib/audit.c| 15 - lib/compat_audit.c | 50 6 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 lib/compat_audit.c diff --git a/include/linux/audit.h b/include/linux/audit.h index bf1ef22..b5d5cca 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -78,6 +78,14 @@ extern int is_audit_feature_set(int which); extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); +/* only for compat system calls */ +extern unsigned compat_write_class[]; +extern unsigned compat_read_class[]; +extern unsigned compat_dir_class[]; +extern unsigned compat_chattr_class[]; +extern unsigned compat_signal_class[]; + +extern int __weak audit_classify_compat_syscall(int abi, unsigned syscall); /* audit_names-type values */ #defineAUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 44b05a0..0a73cf3 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -355,6 +355,12 @@ enum { #define AUDIT_ARCH_SPARC64 (EM_SPARCV9|__AUDIT_ARCH_64BIT) #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#ifdef CONFIG_COMPAT +#define audit_is_compat(arch) (!((arch) __AUDIT_ARCH_64BIT)) +#else +#define audit_is_compat(arch) false +#endif + #define AUDIT_PERM_EXEC1 #define AUDIT_PERM_WRITE 2 #define AUDIT_PERM_READ4 diff --git a/lib/Kconfig b/lib/Kconfig index 991c98b..48896db 100644 --- a/lib/Kconfig 
+++ b/lib/Kconfig @@ -182,6 +182,11 @@ config AUDIT_GENERIC depends on AUDIT !AUDIT_ARCH default y +config AUDIT_COMPAT_GENERIC + bool + depends on AUDIT_GENERIC COMPAT + default y + config RANDOM32_SELFTEST bool PRNG perform self test on init default n diff --git a/lib/Makefile b/lib/Makefile index a459c31..972552b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o obj-$(CONFIG_SMP) += percpu_counter.o obj-$(CONFIG_AUDIT_GENERIC) += audit.o +obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o diff --git a/lib/audit.c b/lib/audit.c index 76bbed4..1d726a2 100644 --- a/lib/audit.c +++ b/lib/audit.c @@ -30,11 +30,17 @@ static unsigned signal_class[] = { int audit_classify_arch(int arch) { - return 0; + if (audit_is_compat(arch)) + return 1; + else + return 0; } int audit_classify_syscall(int abi, unsigned syscall) { + if (audit_is_compat(abi)) + return audit_classify_compat_syscall(abi, syscall); + switch(syscall) { #ifdef __NR_open case __NR_open: @@ -57,6 +63,13 @@ int audit_classify_syscall(int abi, unsigned syscall) static int __init audit_classes_init(void) { +#ifdef CONFIG_AUDIT_COMPAT_GENERIC + audit_register_class(AUDIT_CLASS_WRITE_32, compat_write_class); + audit_register_class(AUDIT_CLASS_READ_32, compat_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, compat_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, compat_chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL_32, compat_signal_class); +#endif audit_register_class(AUDIT_CLASS_WRITE, write_class); audit_register_class(AUDIT_CLASS_READ, read_class); audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); diff --git a/lib/compat_audit.c b/lib/compat_audit.c new file mode 100644 index 000..873f75b --- /dev/null +++ b/lib/compat_audit.c @@ -0,0 +1,50 @@ +#include linux/init.h +#include linux/types.h +#include 
asm/unistd32.h + +unsigned compat_dir_class[] = { +#include asm-generic/audit_dir_write.h +~0U +}; + +unsigned compat_read_class[] = { +#include asm-generic/audit_read.h +~0U +}; + +unsigned compat_write_class[] = { +#include asm-generic/audit_write.h +~0U +}; + +unsigned compat_chattr_class[] = { +#include asm-generic/audit_change_attr.h +~0U +}; + +unsigned compat_signal_class[] = { +#include asm-generic/audit_signal.h +~0U +}; + +int audit_classify_compat_syscall(int abi, unsigned syscall) +{ + switch (syscall) { +#ifdef __NR_open
[PATCH v3 0/3] arm64: Add audit support
This patchset adds system call audit support on arm64. Both 32-bit (AUDIT_ARCH_ARM) and 64-bit tasks (AUDIT_ARCH_AARCH64) are supported. Since arm64 has the exact same set of system calls on LE and BE, we don't care about endianness (or more specifically __AUDIT_ARCH_64BIT bit in AUDIT_ARCH_*). There are some prerequisites for this patch to work correctly: * generic compat system call audit support patch * correct a type mismatch in audit_syscall_exit() patch (already accepted and queued in 3.14) * Modify a set of system calls in audit class patch (already accepted and queued in 3.14) * __NR_* definitions for compat syscalls patch from Catalin * userspace audit tool (v2.3.2 + my patch for arm64) Please review them as well for better understandings. This code was tested on both 32-bit and 64-bit LE userland in the following two ways: 1) basic operations with auditctl/autrace # auditctl -a exit,always -S openat -F path=/etc/inittab # auditctl -a exit,always -F dir=/tmp -F perm=rw # auditctl -a task,always # autrace /bin/ls by comparing output from autrace with one from strace 2) audit-test-code (+ my workarounds for arm/arm64) by running audit-tool, filter and syscalls test categories. Changes v1 - v2: * Modified to utilize generic compat system call audit [3/6, 4/6, 5/6] Please note that a required header, unistd_32.h, is automatically generated from unistd32.h. * Refer to regs-orig_x0 instead of regs-x0 as the first argument of system call in audit_syscall_entry() [6/6] * Include Add regs_return_value() in syscall.h patch [2/6], which was not intentionally included in v1 because it could be added by kprobes support. Changes v2 - v3: * Remove asm/audit.h. See generic compat syscall audit support patch v4 * Remove endianness dependency, ie. AUDIT_ARCH_ARMEB/AARCH64EB. * Remove kernel/syscalls/Makefile which was used to create unistd32.h. 
See Catalin's Add __NR_* definitions for compat syscalls patch AKASHI Takahiro (3): arm64: Add regs_return_value() in syscall.h arm64: Add audit support arm64: audit: Add audit hook in ptrace/syscall_trace arch/arm64/Kconfig |1 + arch/arm64/include/asm/ptrace.h |5 + arch/arm64/include/asm/syscall.h | 15 +++ arch/arm64/include/asm/thread_info.h |1 + arch/arm64/kernel/entry.S|3 +++ arch/arm64/kernel/ptrace.c | 10 ++ include/uapi/linux/audit.h |1 + 7 files changed, 36 insertions(+) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 1/3] arm64: Add regs_return_value() in syscall.h
This macro, regs_return_value, is used mainly for audit to record system call's results, but may also be used in test_kprobes.c. Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org --- arch/arm64/include/asm/ptrace.h |5 + 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 0e7fa49..5800ec1 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -134,6 +134,11 @@ struct pt_regs { #define user_stack_pointer(regs) \ ((regs)-sp) +static inline unsigned long regs_return_value(struct pt_regs *regs) +{ + return regs-regs[0]; +} + /* * Are the current registers suitable for user mode? (used to maintain * security in signal handlers) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] Make math_state_restore() save and restore the interrupt flag
On Sun, 2014-02-02 at 11:15 -0800, Linus Torvalds wrote: On Sat, Feb 1, 2014 at 11:19 PM, Suresh Siddha sbsid...@gmail.com wrote: The real fix for Nate's problem will be coming from Linus, with a slightly modified option-b that Linus proposed. Linus, please let me know if you want me to spin it. I can do it sunday night. Please do it, since clearly I wasn't aware enough about the whole non-TS-checking FPU state details. Also, since this issue doesn't seem to be a recent regression, I'm not going to take this patch directly (even though I'm planning on doing -rc1 in a few hours), and expect that I'll get it through the normal channels (presumably together with the __kernel_fpu_end cleanups). Ok with everybody? Here is the second patch, which should fix the issue reported in this thread. Maarten, Nate, George, please give this patch a try as is and see if it helps address the issue you ran into. And please ack/review with your test results. Other patch which cleans up the irq_enable/disable logic in math_state_restore() has been sent yesterday. You can run your experiments with both these patches if you want. But your issue should get fixed with just the appended patch here. Peter, Please push both these patches through normal channels depending on the results. thanks, suresh --- From: Suresh Siddha sbsid...@gmail.com Subject: x86, fpu: check tsk_used_math() in kernel_fpu_end() for eager fpu For non-eager fpu mode, thread's fpu state is allocated during the first fpu usage (in the context of device not available exception). This (math_state_restore()) can be a blocking call and hence we enable interrupts (which were originally disabled when the exception happened), allocate memory and disable interrupts etc. But the eager-fpu mode, call's the same math_state_restore() from kernel_fpu_end(). 
The assumption being that tsk_used_math() is always set for the eager-fpu mode and thus avoid the code path of enabling interrupts, allocating fpu state using blocking call and disable interrupts etc. But the below issue was noticed by Maarten Baert, Nate Eldredge and few others: If a user process dumps core on an ecrypt fs while aesni-intel is loaded, we get a BUG() in __find_get_block() complaining that it was called with interrupts disabled; then all further accesses to our ecrypt fs hang and we have to reboot. The aesni-intel code (encrypting the core file that we are writing) needs the FPU and quite properly wraps its code in kernel_fpu_{begin,end}(), the latter of which calls math_state_restore(). So after kernel_fpu_end(), interrupts may be disabled, which nobody seems to expect, and they stay that way until we eventually get to __find_get_block() which barfs. For eager fpu, most the time, tsk_used_math() is true. At few instances during thread exit, signal return handling etc, tsk_used_math() might be false. In kernel_fpu_end(), for eager-fpu, call math_state_restore() only if tsk_used_math() is set. Otherwise, don't bother. Kernel code path which cleared tsk_used_math() knows what needs to be done with the fpu state. Reported-by: Maarten Baert maarten-ba...@hotmail.com Reported-by: Nate Eldredge n...@thatsmathematics.com Suggested-by: Linus Torvalds torva...@linux-foundation.org Signed-off-by: Suresh Siddha sbsid...@gmail.com Cc: George Spelvin li...@horizon.com --- arch/x86/kernel/i387.c | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 4e5f770..670bba1 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -87,10 +87,19 @@ EXPORT_SYMBOL(__kernel_fpu_begin); void __kernel_fpu_end(void) { - if (use_eager_fpu()) - math_state_restore(); - else + if (use_eager_fpu()) { + /* +* For eager fpu, most the time, tsk_used_math() is true. 
+* Restore the user math as we are done with the kernel usage. +* At few instances during thread exit, signal handling etc, +* tsk_used_math() is false. Those few places will take proper +* actions, so we don't need to restore the math here. +*/ + if (likely(tsk_used_math(current))) + math_state_restore(); + } else { stts(); + } } EXPORT_SYMBOL(__kernel_fpu_end); -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 2/3] arm64: Add audit support
On AArch64, audit is supported through generic lib/audit.c and compat_audit.c, and so this patch adds arch specific definitions required. Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org --- arch/arm64/Kconfig |1 + arch/arm64/include/asm/syscall.h | 15 +++ include/uapi/linux/audit.h |1 + 3 files changed, 17 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6d4dd22..3c21405 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -19,6 +19,7 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_TRACEHOOK select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index 70ba9d4..6900183 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -16,7 +16,9 @@ #ifndef __ASM_SYSCALL_H #define __ASM_SYSCALL_H +#include linux/audit.h #include linux/err.h +#include asm/compat.h static inline int syscall_get_nr(struct task_struct *task, @@ -104,4 +106,17 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(regs-regs[i], args, n * sizeof(args[0])); } +/* + * We don't care about endianness (__AUDIT_ARCH_LE bit) here because + * AArch64 has the same system calls both on little- and big- endian. 
+ */ +static inline int syscall_get_arch(struct task_struct *task, + struct pt_regs *regs) +{ + if (is_compat_thread(task_thread_info(task))) + return AUDIT_ARCH_ARM; + + return AUDIT_ARCH_AARCH64; +} + #endif /* __ASM_SYSCALL_H */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 0a73cf3..cf27cae 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -327,6 +327,7 @@ enum { /* distinguish syscall tables */ #define __AUDIT_ARCH_64BIT 0x8000 #define __AUDIT_ARCH_LE 0x4000 +#define AUDIT_ARCH_AARCH64 (EM_AARCH64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_ALPHA (EM_ALPHA|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_ARM (EM_ARM|__AUDIT_ARCH_LE) #define AUDIT_ARCH_ARMEB (EM_ARM) -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 3/3] arm64: audit: Add audit hook in ptrace/syscall_trace
This patch adds auditing functions on entry to or exit from every system call invocation. Signed-off-by: AKASHI Takahiro takahiro.aka...@linaro.org --- arch/arm64/include/asm/thread_info.h |1 + arch/arm64/kernel/entry.S|3 +++ arch/arm64/kernel/ptrace.c | 10 ++ 3 files changed, 14 insertions(+) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 720e70b..7468388 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -101,6 +101,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_SYSCALL_TRACE 8 +#define TIF_SYSCALL_AUDIT 9 #define TIF_POLLING_NRFLAG 16 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FREEZE 19 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 827cbad..83c4b29 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -630,6 +630,9 @@ el0_svc_naked: // compat entry point get_thread_info tsk ldr x16, [tsk, #TI_FLAGS] // check for syscall tracing tbnzx16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls? +#ifdef CONFIG_AUDITSYSCALL + tbnzx16, #TIF_SYSCALL_AUDIT, __sys_trace // auditing syscalls? +#endif adr lr, ret_fast_syscall// return address cmp scno, sc_nr // check upper syscall limit b.hsni_sys diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 6777a21..75a3f23 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -19,6 +19,7 @@ * along with this program. If not, see http://www.gnu.org/licenses/. 
*/ +#include linux/audit.h #include linux/kernel.h #include linux/sched.h #include linux/mm.h @@ -38,6 +39,7 @@ #include asm/compat.h #include asm/debug-monitors.h #include asm/pgtable.h +#include asm/syscall.h #include asm/traps.h #include asm/system_misc.h @@ -1064,6 +1066,14 @@ asmlinkage int syscall_trace(int dir, struct pt_regs *regs) { unsigned long saved_reg; + if (dir) + audit_syscall_exit(regs); + else + audit_syscall_entry(syscall_get_arch(current, regs), + (int)regs-syscallno, + regs-orig_x0, regs-regs[1], + regs-regs[2], regs-regs[3]); + if (!test_thread_flag(TIF_SYSCALL_TRACE)) return regs-syscallno; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] memcg: export kmemcg cache id via cgroup fs
On 02/03/2014 10:21 AM, David Rientjes wrote: On Sun, 2 Feb 2014, Vladimir Davydov wrote: Per-memcg kmem caches are named as follows: global-cache-name(cgroup-kmem-id:cgroup-name) where cgroup-kmem-id is the unique id of the memcg the cache belongs to, cgroup-name is the relative name of the memcg on the cgroup fs. Cache names are exposed to userspace for debugging purposes (e.g. via sysfs in case of slub or via dmesg). Using relative names makes it impossible in general (in case the cgroup hierarchy is not flat) to find out which memcg a particular cache belongs to, because cgroup-kmem-id is not known to the user. Since using absolute cgroup names would be an overkill, let's fix this by exporting the id of kmem-active memcg via cgroup fs file memory.kmem.id. Hmm, I'm not sure exporting additional information is the best way to do it only for this purpose. I do understand the problem in naming collisions if the hierarchy isn't flat and we typically work around that by ensuring child memcgs still have a unique memcg. This isn't only a problem in slab cache naming, me also avoid printing the entire absolute names for things like the oom killer. AFAIU, cgroup identifiers dumped on oom (cgroup paths, currently) and memcg slab cache names serve for different purposes. The point is oom is a perfectly normal situation for the kernel, and info dumped to dmesg is for admin to find out the cause of the problem (a greedy user or cgroup). On the other hand, slab cache names are dumped to dmesg only on extraordinary situations - like bugs in slab implementation, or double free, or detected memory leaks - where we usually do not need the name of the memcg that triggered the problem, because the bug is likely to be in the kernel subsys using the cache. Plus, the names are exported to sysfs in case of slub, again for debugging purposes, AFAIK. 
So IMO the use cases for oom vs slab names are completely different - information vs debugging - and I want to export kmem.id only for the ability of debugging kmemcg and slab subsystems. So it would be nice to have consensus on how people are supposed to identify memcgs with a hierarchy: either by exporting information like the id like you do here (but leave the oom killer still problematic) or by insisting people name their memcgs with unique names if they care to differentiate them. Anyway, I agree with you that this needs a consensus, because this is a functional change. Thanks. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 10/10] watchdog: xilinx: Enable this driver for Zynq
On 01/31/2014 03:52 PM, Guenter Roeck wrote: On 01/31/2014 06:18 AM, Michal Simek wrote: Enable this driver for Zynq. Move it to architecture independent Kconfig part. Signed-off-by: Michal Simek michal.si...@xilinx.com --- Build tested by zero day testing system. --- drivers/watchdog/Kconfig | 22 +- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 9db5d3c..6120403 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -111,6 +111,15 @@ config WM8350_WATCHDOG Support for the watchdog in the WM8350 AudioPlus PMIC. When the watchdog triggers the system will be reset. +config XILINX_WATCHDOG +tristate Xilinx Watchdog timer +select WATCHDOG_CORE This needs to depend on HAS_IOMEM. Are you sure? I have no problem to do this change. Zero day testing system doesn't report any problem with it. I have checked dependencies and only score, tile and um has NO_IOMEM option enables. And below in log is tile with allyesconfig that's why I believe this driver has been also tested without any issue. 
Thanks, Michal git://git.monstr.eu/linux-2.6-microblaze xnext/watchdog f7bdfada576e93eaab8f6dc2ecd881da8f43911c watchdog: xilinx: Enable this driver for Zynq elapsed time: 81m configs tested: 122 alpha defconfig pariscallnoconfig parisc b180_defconfig pariscc3000_defconfig parisc defconfig arm allnoconfig arm almodconfig arm at91_dt_defconfig arm imx_v6_v7_defconfig arm marzen_defconfig arm omap2plus_defconfig arm prima2_defconfig arm s3c2410_defconfig arm spear13xx_defconfig arm tegra_defconfig m32r m32104ut_defconfig m32r mappi3.smp_defconfig m32r opsput_defconfig m32r usrv_defconfig xtensa common_defconfig xtensa iss_defconfig x86_64allnoconfig shallnoconfig sh rsk7269_defconfig sh sh7785lcr_32bit_defconfig shtitan_defconfig x86_64 randconfig-c0-0131 x86_64 randconfig-c1-0131 x86_64 randconfig-c2-0131 x86_64 randconfig-c3-0131 x86_64 randconfig-c4-0131 x86_64 randconfig-c5-0131 x86_64 randconfig-c6-0131 x86_64 randconfig-c7-0131 x86_64 randconfig-c8-0131 x86_64 randconfig-c9-0131 x86_64 allyesconfig alphaallyesconfig avr32allyesconfig blackfin allyesconfig cris allyesconfig ia64 allyesconfig m68k allyesconfig mips allyesconfig parisc allyesconfig powerpc allyesconfig s390 allyesconfig sh allyesconfig sparcallyesconfig sparc64 allyesconfig tile allyesconfig xtensa allyesconfig ia64 alldefconfig ia64 allmodconfig ia64 allnoconfig ia64defconfig x86_64lkp powerpc chroma_defconfig powerpc corenet64_smp_defconfig powerpcgamecube_defconfig powerpc linkstation_defconfig powerpc wii_defconfig x86_64 randconfig-j0-0131 x86_64 randconfig-j1-0131 x86_64 randconfig-j2-0131 x86_64 randconfig-j3-0131 x86_64 randconfig-j4-0131 x86_64 randconfig-j5-0131 m68k allmodconfig m68k amiga_defconfig m68k m5475evb_defconfig m68k multi_defconfig blackfinBF526-EZBRD_defconfig blackfinBF533-EZKIT_defconfig blackfinBF561-EZKIT-SMP_defconfig blackfin TCM-BF537_defconfig cris etrax-100lx_v2_defconfig i386 randconfig-r0-0131 i386
Re: [PATCH] ipv6: default route for link local address is not added while assigning a address
On Wednesday 29 January 2014 04:08 PM, Nicolas Dichtel wrote: Le 29/01/2014 07:41, Sohny Thomas a écrit : Resending this on netdev mailing list: Default route for link local address is configured automatically if NETWORKING_IPV6=yes is in ifcfg-eth*. When the route table for the interface is flushed and a new address is added to the same device without removing linklocal addr, default route for link local address has to be added by default. I have found the issue to be caused by this checkin http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/net/ipv6?id=62b54dd91567686a1cb118f76a72d5f4764a86dd According to this change : He removes adding a link local route if any other address is added , applicable across all interfaces though there's mentioned only lo interface So below patch fixes for other devices Signed-off-by: Sohny THomas sohth...@linux.vnet.ibm.com Your email client has corrupted the patch, it cannot be applied. Please read Documentation/email-clients.txt Sorry about that. Will resend again About the patch, I still think that the flush is too aggressive. Link local routes are marked as 'proto kernel', removing them without the link local address is wrong. With this patch, you will add a link local route even if you don't have a link local address. I think it wouldn't hurt to have a Link local route for NDP in case the routes become unreachable -Regards, Sohny -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Linux 3.14-rc1 is out
Is it OK/wanted to note a possibly wide regression as Reply-To in the announce thread? If your USB 3.0 stopped working with 3.14-rc1, please note it's an already tracked regression reported in: xhci regression since xhci: replace xhci_write_64() with writeq() - devices not detected http://www.spinics.net/lists/linux-usb/msg101628.html http://comments.gmane.org/gmane.linux.usb.general/102295 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] edac/85xx: Remove deprecated IRQF_DISABLED
On Tue, Jan 21, 2014 at 09:42:27AM +0100, Johannes Thumshirn wrote: Remove IRQF_DISABLED as it is a NOOP. Signed-off-by: Johannes Thumshirn johannes.thumsh...@men.de --- drivers/edac/mpc85xx_edac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 8f918217..f4aec2e 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -357,7 +357,7 @@ int mpc85xx_pci_err_probe(struct platform_device *op) pdata-irq = irq_of_parse_and_map(op-dev.of_node, 0); res = devm_request_irq(op-dev, pdata-irq, mpc85xx_pci_isr, -IRQF_DISABLED | IRQF_SHARED, +IRQF_SHARED, [EDAC] PCI err, pci); if (res 0) { printk(KERN_ERR @@ -633,7 +633,7 @@ static int mpc85xx_l2_err_probe(struct platform_device *op) if (edac_op_state == EDAC_OPSTATE_INT) { pdata-irq = irq_of_parse_and_map(op-dev.of_node, 0); res = devm_request_irq(op-dev, pdata-irq, -mpc85xx_l2_isr, IRQF_DISABLED, +mpc85xx_l2_isr, 0, [EDAC] L2 err, edac_dev); if (res 0) { printk(KERN_ERR @@ -1133,7 +1133,7 @@ static int mpc85xx_mc_err_probe(struct platform_device *op) pdata-irq = irq_of_parse_and_map(op-dev.of_node, 0); res = devm_request_irq(op-dev, pdata-irq, mpc85xx_mc_isr, - IRQF_DISABLED | IRQF_SHARED, +IRQF_SHARED, [EDAC] MC err, mci); if (res 0) { printk(KERN_ERR %s: Unable to request irq %d for -- 1.8.5.2 Boris, Mauro: Ping? Johannes -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ipv6: default route for link local address is not added while assigning a address
Actually I am not so sure, there is no defined semantic of flush. I would be ok with all three solutions: leave it as is, always add link-local address (it does not matter if we don't have a link-local address on that interface, as a global scoped one is just fine enough) or make flush not remove the link-local address (but this seems a bit too special cased for me). 1) In case if we leave it as it is, there is rfc 6724 rule 2 to be considered ( previously rfc 3484) Rule 2: Prefer appropriate scope. If Scope(SA) < Scope(SB): If Scope(SA) < Scope(D), then prefer SB and otherwise prefer SA. Similarly, if Scope(SB) < Scope(SA): If Scope(SB) < Scope(D), then prefer SA and otherwise prefer SB. Test: Destination: fe80::2(LS) Candidate Source Addresses: 3ffe::1(GS) or fec0::1(SS) or LLA(LS) Result: LLA(LS) Scope(LLA) < Scope(fec0::1): If Scope(LLA) < Scope(fe80::2), no, prefer LLA Scope(LLA) < Scope(3ffe::1): If Scope(LLA) < Scope(fe80::2), no, prefer LLA Now the above test fails since the route itself is not present, and the test assumes that the route gets added since the LLA is not removed during the test 2) having a LLA always helps in NDP i think 3) making flush not remove link-local address will be changing functionality of ip flush command Regards, Sohny Greetings, Hannes -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] memcg: export kmemcg cache id via cgroup fs
[adding Johannes Weiner and Hugh Dickins to cc in case they have something to object against this] On 02/03/2014 10:57 AM, Vladimir Davydov wrote: On 02/03/2014 10:21 AM, David Rientjes wrote: On Sun, 2 Feb 2014, Vladimir Davydov wrote: Per-memcg kmem caches are named as follows: global-cache-name(cgroup-kmem-id:cgroup-name) where cgroup-kmem-id is the unique id of the memcg the cache belongs to, cgroup-name is the relative name of the memcg on the cgroup fs. Cache names are exposed to userspace for debugging purposes (e.g. via sysfs in case of slub or via dmesg). Using relative names makes it impossible in general (in case the cgroup hierarchy is not flat) to find out which memcg a particular cache belongs to, because cgroup-kmem-id is not known to the user. Since using absolute cgroup names would be an overkill, let's fix this by exporting the id of kmem-active memcg via cgroup fs file memory.kmem.id. Hmm, I'm not sure exporting additional information is the best way to do it only for this purpose. I do understand the problem in naming collisions if the hierarchy isn't flat and we typically work around that by ensuring child memcgs still have a unique memcg. This isn't only a problem in slab cache naming, me also avoid printing the entire absolute names for things like the oom killer. AFAIU, cgroup identifiers dumped on oom (cgroup paths, currently) and memcg slab cache names serve for different purposes. The point is oom is a perfectly normal situation for the kernel, and info dumped to dmesg is for admin to find out the cause of the problem (a greedy user or cgroup). On the other hand, slab cache names are dumped to dmesg only on extraordinary situations - like bugs in slab implementation, or double free, or detected memory leaks - where we usually do not need the name of the memcg that triggered the problem, because the bug is likely to be in the kernel subsys using the cache. 
Plus, the names are exported to sysfs in case of slub, again for debugging purposes, AFAIK. So IMO the use cases for oom vs slab names are completely different - information vs debugging - and I want to export kmem.id only for the ability of debugging kmemcg and slab subsystems. So it would be nice to have consensus on how people are supposed to identify memcgs with a hierarchy: either by exporting information like the id like you do here (but leave the oom killer still problematic) or by insisting people name their memcgs with unique names if they care to differentiate them. Anyway, I agree with you that this needs a consensus, because this is a functional change. Thanks. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH] powerpc: add ioremap_wt
Allow for IO memory to be mapped cacheable for performing PCI read bursts. Signed-off-by: Michael Moese michael.mo...@men.de --- arch/powerpc/include/asm/io.h | 3 +++ arch/powerpc/mm/pgtable_32.c | 8 2 files changed, 11 insertions(+) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 45698d5..9591fff 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -631,6 +631,8 @@ static inline void iosync(void) * * * ioremap_wc enables write combining * + * * ioremap_wc enables write thru + * * * iounmap undoes such a mapping and can be hooked * * * __ioremap_at (and the pending __iounmap_at) are low level functions to @@ -652,6 +654,7 @@ extern void __iomem *ioremap(phys_addr_t address, unsigned long size); extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, unsigned long flags); extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); +extern void __iomem *ioremap_wt(phys_addr_t address, unsigned long size); #define ioremap_nocache(addr, size)ioremap((addr), (size)) extern void iounmap(volatile void __iomem *addr); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 51f8795..9ab0a54 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -141,6 +141,14 @@ ioremap_wc(phys_addr_t addr, unsigned long size) EXPORT_SYMBOL(ioremap_wc); void __iomem * +ioremap_wt(phys_addr_t addr, unsigned long size) +{ + return __ioremap_caller(addr, size, _PAGE_WRITETHRU, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { /* writeable implies dirty for kernel addresses */ -- 1.8.5.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH -tip v2 1/8] [BUGFIX] perf-probe: Fix to do exit call for symbol maps
Hi Masami, On Wed, 29 Jan 2014 09:14:52 +, Masami Hiramatsu wrote: Some perf-probe commands do symbol_init() but doesn't do exit call. This fixes that to call symbol_exit() and relase machine if needed. This also merges init_vmlinux() and init_user_exec() because both of them are doing similar things. (init_user_exec() just skips init vmlinux related symbol maps) Signed-off-by: Masami Hiramatsu masami.hiramatsu...@hitachi.com --- tools/perf/util/probe-event.c | 110 +++-- 1 file changed, 61 insertions(+), 49 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index a8a9b6c..14c649df 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -73,31 +73,35 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp); static int convert_name_to_addr(struct perf_probe_event *pev, const char *exec); static void clear_probe_trace_event(struct probe_trace_event *tev); -static struct machine machine; +static struct machine *host_machine; /* Initialize symbol maps and path of vmlinux/modules */ -static int init_vmlinux(void) +static int init_symbol_maps(bool user_only) { int ret; symbol_conf.sort_by_name = true; - if (symbol_conf.vmlinux_name == NULL) - symbol_conf.try_vmlinux_path = true; - else - pr_debug(Use vmlinux: %s\n, symbol_conf.vmlinux_name); + if (user_only) + symbol_conf.try_vmlinux_path = false; + else { + if (symbol_conf.vmlinux_name == NULL) + symbol_conf.try_vmlinux_path = true; This looks unnecessary and duplicate since we already have following code in __cmd_probe(). /* * Only consider the user's kernel image path if given. 
*/ symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); Thanks, Namhyung + else + pr_debug(Use vmlinux: %s\n, symbol_conf.vmlinux_name); + } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 7/8] Add 32 bit VDSO time support for 32 bit kernel
Am Sonntag, den 02.02.2014, 16:12 -0800 schrieb Andy Lutomirski: On Sun, Feb 2, 2014 at 1:39 PM, Stefani Seibold stef...@seibold.net wrote: Am Sonntag, den 02.02.2014, 08:46 -0800 schrieb Andy Lutomirski: On Sun, Feb 2, 2014 at 3:27 AM, stef...@seibold.net wrote: From: Stefani Seibold stef...@seibold.net This patch add the time support for 32 bit a VDSO to a 32 bit kernel. [...] Can you address the review comments from last time around? For example, this still seems to have redundant vvar and hpet mappings, it doesn't use the VVAR macro, it moves the 32-bit compat vDSO, etc. I will address the compat VDSO issue. But the VVAR macro will be not a part of this patch set. If you depend on this, feel free to create one. From my point of view this is not feasible without a macro hacking, because the address accessing the vvar area differs in kernel and VDSO user mode. Sorry, but I will make the code messier for no apparent reason and I will not offer to fix it in the same series gets my NAK. Hint: I'm talking about two or three lines of code in vvar.h. A hint back: if you threat me with a NAK for a requested code sequence which currently no user, this is far away from professional. I am not your trainee. BTW: If it is so easy, send me the two or three lines and i will merge it ;-) I also see no redundant mapping. There are two modes, one is the map of the kernel area the other maps the VDSO into the user space area. This is exactly the behaviour of the origin VDSO implementation. No. In your series there are *three* mappings. There are: - The linear mapping that the kernel loader sets up (the writable mapping used in the kernel). This is implicit and, of course, fine. - There's the fixmap page, which aliases the normal kernel mapping at a fixed address with the user, ro, and nx attributes. The 64-bit vDSO uses that mapping. See vdso.h -- it's all arranged pretty clearly. Your code, for no discernible reason, sets up a fixmap entry on *32-bit* kernels. 
- The vma that you're setting up adjacent to the actual vdso text. This is what you are using. Please choose *one* user-readable mapping for the 32-bit vdso and stick with it. If the 64-bit vdso can use it to and userspace doesn't break, even better. But a pointless set of extra fixmap entries is not okay. Again: I wrote that there are two modes for a 32 bit kernel and therefore there are two mappings at the same time. Since there are both ways available in a 32 bit kernel via the vdso32= kernel parameter, both must be supported. Due the lack of a real fixmap for a 32 bit kernel (FIXADDR_TOP is a variable), the HPET and VVAR Page can only relative addressed. So this pages must located before or after the VDSO. This is why i need to setup this pages into the fixmap area, this is the compat mode vdso32=2. For vdso32=1 i need to map the VDSO Page together with the HPET and VVAR into the user space. For compability reasons both mappings are required. There is only one binary for the VDSO page, regardless of the vdso= kernel parameter and this code can only do a relative addressing. A 64 bit kernel can do it in an other way, because there is a real fixmap area, so this special handling is not needed. - Stefani -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/7] improve robustness on handling migratetype
On Wed, Jan 29, 2014 at 05:52:41PM +0100, Vlastimil Babka wrote: On 01/10/2014 09:48 AM, Joonsoo Kim wrote: On Thu, Jan 09, 2014 at 09:27:20AM +, Mel Gorman wrote: On Thu, Jan 09, 2014 at 04:04:40PM +0900, Joonsoo Kim wrote: Hello, I found some weaknesses on handling migratetype during code review and testing CMA. First, we don't have any synchronization method on get/set pageblock migratetype. When we change migratetype, we hold the zone lock. So writer-writer race doesn't exist. But while someone changes migratetype, others can get migratetype. This may introduce totally unintended value as migratetype. Although I haven't heard of any problem report about that, it is better to protect properly. This is deliberate. The migratetypes for the majority of users are advisory and aimed for fragmentation avoidance. It was important that the cost of that be kept as low as possible and the general case is that migration types change very rarely. In many cases, the zone lock is held. In other cases, such as splitting free pages, the cost is simply not justified. I doubt there is any amount of data you could add in support that would justify hammering the free fast paths (which call get_pageblock_type). Hello, Mel. There is a possibility that we can get unintended value such as 6 as migratetype if reader-writer (get/set pageblock_migratetype) race happends. It can be possible, because we read the value without any synchronization method. And this migratetype, 6, has no place in buddy freelist, so array index overrun can be possible and the system can break, although I haven't heard that it occurs. Hello, it seems this can indeed happen. I'm working on memory compaction improvements and in a prototype patch, I'm basically adding calls of start_isolate_page_range() undo_isolate_page_range() some functions under compact_zone(). With this I've seen occurrences of NULL pointers in move_freepages(), free_one_page() in places where free_list[migratetype] is manipulated by e.g. 
list_move(). That lead me to question the value of migratetype and I found this thread. Adding some debugging in get_pageblock_migratetype() and voila, I get a value of 6 being read. So is it just my patch adding a dangerous situation, or does it exist in mainline as well? By looking at free_one_page(), it uses zone-lock, but get_pageblock_migratetype() is called by its callers (free_hot_cold_page() or __free_pages_ok()) outside of the lock. This determined migratetype is then used under free_one_page() to access a free_list. It seems that this could race with set_pageblock_migratetype() called from try_to_steal_freepages() (despite the latter being properly locked). There are also other callers but those seem to be either limited to initialization and isolation, which should be rare (?). However, try_to_steal_freepages can occur repeatedly. So I assume that the race happens but never manifests as a fatal error as long as MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE and MIGRATE_MOVABLE values are used. Only MIGRATE_CMA and MIGRATE_ISOLATE have values with bit 4 enabled and can thus result in invalid values due to non-atomic access. Does that make sense to you and should we thus proceed with patching this race? Hello, This race is possible without your prototype patch, however, on very low probability. Some codes related to memory failure use set_migratetype_isolate() which could result in this race. Although it may be very rare case and not critical, it is better to fix this race. I prefer that we don't depend on luck. :) Mel's suggestion looks good to me. Do you have another idea? Thanks. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v10 6/8] cleanup __vdso_gettimeofday
From: Stefani Seibold stef...@seibold.net This patch does a little cleanup for the __vdso_gettimeofday() function. It kicks out an unneeded ret local variable and makes the code faster if only the timezone is needed. Signed-off-by: Stefani Seibold stef...@seibold.net --- arch/x86/vdso/vclock_gettime.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 743f277..bf969a0 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -259,13 +259,12 @@ int clock_gettime(clockid_t, struct timespec *) notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { - long ret = VCLOCK_NONE; - if (likely(tv != NULL)) { BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != offsetof(struct timespec, tv_nsec) || sizeof(*tv) != sizeof(struct timespec)); - ret = do_realtime((struct timespec *)tv); + if (do_realtime((struct timespec *)tv) == VCLOCK_NONE) + return vdso_fallback_gtod(tv, tz); tv-tv_usec /= 1000; } if (unlikely(tz != NULL)) { @@ -274,8 +273,6 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) tz-tz_dsttime = gtod-sys_tz.tz_dsttime; } - if (ret == VCLOCK_NONE) - return vdso_fallback_gtod(tv, tz); return 0; } int gettimeofday(struct timeval *, struct timezone *) -- 1.8.5.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v10 1/8] Make vsyscall_gtod_data handling x86 generic
From: Stefani Seibold stef...@seibold.net This patch move the vsyscall_gtod_data handling out of vsyscall_64.c into an additonal file vsyscall_gtod.c to make the functionality available for x86 32 bit kernel. It also adds a new vsyscall_32.c which setup the VVAR page. Signed-off-by: Stefani Seibold stef...@seibold.net --- arch/x86/Kconfig | 4 +-- arch/x86/include/asm/clocksource.h | 4 --- arch/x86/include/asm/fixmap.h | 2 ++ arch/x86/include/asm/vvar.h| 4 +++ arch/x86/kernel/Makefile | 3 +- arch/x86/kernel/hpet.c | 4 --- arch/x86/kernel/setup.c| 2 -- arch/x86/kernel/tsc.c | 2 -- arch/x86/kernel/vmlinux.lds.S | 3 -- arch/x86/kernel/vsyscall_32.c | 24 +++ arch/x86/kernel/vsyscall_64.c | 44 arch/x86/kernel/vsyscall_gtod.c| 60 ++ 12 files changed, 94 insertions(+), 62 deletions(-) create mode 100644 arch/x86/kernel/vsyscall_32.c create mode 100644 arch/x86/kernel/vsyscall_gtod.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 940e50e..b556f00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -107,9 +107,9 @@ config X86 select HAVE_ARCH_SOFT_DIRTY select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS - select ARCH_CLOCKSOURCE_DATA if X86_64 + select ARCH_CLOCKSOURCE_DATA select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 X86_LOCAL_APIC) - select GENERIC_TIME_VSYSCALL if X86_64 + select GENERIC_TIME_VSYSCALL select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 16a57f4..eda81dc 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,8 +3,6 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#ifdef CONFIG_X86_64 - #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. 
*/ @@ -14,6 +12,4 @@ struct arch_clocksource_data { int vclock_mode; }; -#endif /* CONFIG_X86_64 */ - #endif /* _ASM_X86_CLOCKSOURCE_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7252cd3..094d0cc 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -75,6 +75,8 @@ enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_HOLE, FIX_VDSO, + VVAR_PAGE, + VSYSCALL_HPET, #else VSYSCALL_LAST_PAGE, VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index d76ac40..c442782 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -17,7 +17,11 @@ */ /* Base address of vvars. This is not ABI. */ +#ifdef CONFIG_X86_64 #define VVAR_ADDRESS (-10*1024*1024 - 4096) +#else +#define VVAR_ADDRESS 0xd000 +#endif #if defined(__VVAR_KERNEL_LDS) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cb648c8..3282eda 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-y += syscall_$(BITS).o +obj-y += syscall_$(BITS).o vsyscall_gtod.o +obj-$(CONFIG_X86_32) += vsyscall_32.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-$(CONFIG_SYSFS)+= ksysfs.o diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index da85a8e..54263f0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -74,9 +74,7 @@ static inline void hpet_writel(unsigned int d, unsigned int a) static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); -#ifdef CONFIG_X86_64 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); -#endif } static inline void hpet_clear_mapping(void) @@ -752,9 +750,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, 
.flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, -#ifdef CONFIG_X86_64 .archdata = { .vclock_mode = VCLOCK_HPET }, -#endif }; static int hpet_clocksource_register(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 06853e6..56ff330 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1182,9 +1182,7 @@ void __init setup_arch(char **cmdline_p) tboot_probe(); -#ifdef CONFIG_X86_64 map_vsyscall(); -#endif generic_apic_probe(); diff --git a/arch/x86/kernel/tsc.c
[PATCH v10 5/8] replace VVAR(vsyscall_gtod_data) by gtod macro
From: Stefani Seibold stef...@seibold.net There are currently more than 30 users of the gtod macro, so replace the last VVAR(vsyscall_gtod_data) by the gtod macro. Signed-off-by: Stefani Seibold stef...@seibold.net --- arch/x86/vdso/vclock_gettime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index fd074dd..743f277 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -109,7 +109,7 @@ static notrace cycle_t vread_pvclock(int *mode) *mode = VCLOCK_NONE; /* refer to tsc.c read_tsc() comment for rationale */ - last = VVAR(vsyscall_gtod_data).clock.cycle_last; + last = gtod-clock.cycle_last; if (likely(ret = last)) return ret; @@ -133,7 +133,7 @@ notrace static cycle_t vread_tsc(void) rdtsc_barrier(); ret = (cycle_t)vget_cycles(); - last = VVAR(vsyscall_gtod_data).clock.cycle_last; + last = gtod-clock.cycle_last; if (likely(ret = last)) return ret; @@ -288,7 +288,7 @@ int gettimeofday(struct timeval *, struct timezone *) notrace time_t __vdso_time(time_t *t) { /* This is atomic on x86_64 so we don't need any locks. */ - time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); + time_t result = ACCESS_ONCE(gtod-wall_time_sec); if (t) *t = result; -- 1.8.5.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v10 4/8] vclock_gettime.c __vdso_clock_gettime cleanup
From: Stefani Seibold stef...@seibold.net This patch is a small code cleanup for the __vdso_clock_gettime() function. It removes the unneeded return values from do_monotonic_coarse() and do_realtime_coarse() and add a fallback label for doing the kernel gettimeofday() system call. Signed-off-by: Stefani Seibold stef...@seibold.net --- arch/x86/vdso/vclock_gettime.c | 27 ++- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index bbc8065..fd074dd 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -209,7 +209,7 @@ notrace static int do_monotonic(struct timespec *ts) return mode; } -notrace static int do_realtime_coarse(struct timespec *ts) +notrace static void do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { @@ -217,10 +217,9 @@ notrace static int do_realtime_coarse(struct timespec *ts) ts-tv_sec = gtod-wall_time_coarse.tv_sec; ts-tv_nsec = gtod-wall_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); - return 0; } -notrace static int do_monotonic_coarse(struct timespec *ts) +notrace static void do_monotonic_coarse(struct timespec *ts) { unsigned long seq; do { @@ -228,30 +227,32 @@ notrace static int do_monotonic_coarse(struct timespec *ts) ts-tv_sec = gtod-monotonic_time_coarse.tv_sec; ts-tv_nsec = gtod-monotonic_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(gtod-seq, seq))); - - return 0; } notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - int ret = VCLOCK_NONE; - switch (clock) { case CLOCK_REALTIME: - ret = do_realtime(ts); + if (do_realtime(ts) == VCLOCK_NONE) + goto fallback; break; case CLOCK_MONOTONIC: - ret = do_monotonic(ts); + if (do_monotonic(ts) == VCLOCK_NONE) + goto fallback; break; case CLOCK_REALTIME_COARSE: - return do_realtime_coarse(ts); + do_realtime_coarse(ts); + break; case CLOCK_MONOTONIC_COARSE: - return do_monotonic_coarse(ts); + do_monotonic_coarse(ts); 
+ break; + default: + goto fallback; } - if (ret == VCLOCK_NONE) - return vdso_fallback_gettime(clock, ts); return 0; +fallback: + return vdso_fallback_gettime(clock, ts); } int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias(__vdso_clock_gettime))); -- 1.8.5.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v10 7/8] Add 32 bit VDSO time support for 32 bit kernel
From: Stefani Seibold stef...@seibold.net This patch add the time support for 32 bit a VDSO to a 32 bit kernel. For 32 bit programs running on a 32 bit kernel, the same mechanism is used as for 64 bit programs running on a 64 bit kernel. Signed-off-by: Stefani Seibold stef...@seibold.net --- arch/x86/include/asm/vdso.h | 3 ++ arch/x86/include/asm/vdso32.h | 11 +++ arch/x86/vdso/Makefile| 7 + arch/x86/vdso/vclock_gettime.c| 59 +-- arch/x86/vdso/vdso-layout.lds.S | 22 + arch/x86/vdso/vdso32-setup.c | 55 arch/x86/vdso/vdso32/vclock_gettime.c | 16 ++ arch/x86/vdso/vdso32/vdso32.lds.S | 9 ++ 8 files changed, 174 insertions(+), 8 deletions(-) create mode 100644 arch/x86/include/asm/vdso32.h create mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index fddb53d..fe3cef9 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -2,6 +2,9 @@ #define _ASM_X86_VDSO_H #if defined CONFIG_X86_32 || defined CONFIG_COMPAT + +#include asm/vdso32.h + extern const char VDSO32_PRELINK[]; /* diff --git a/arch/x86/include/asm/vdso32.h b/arch/x86/include/asm/vdso32.h new file mode 100644 index 000..37e1a75 --- /dev/null +++ b/arch/x86/include/asm/vdso32.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_VDSO32_H +#define _ASM_X86_VDSO32_H + +#define VDSO_BASE_PAGE 0 +#define VDSO_VVAR_PAGE 1 +#define VDSO_HPET_PAGE 2 +#defineVDSO_PAGES 3 +#define VDSO_PREV_PAGES2 +#defineVDSO_OFFSET(x) ((x) * PAGE_SIZE) + +#endif diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index fd14be1..1ff5b0a 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -145,8 +145,15 @@ KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) $(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 
:= $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=3 -freg-struct-return -fpic +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ +$(obj)/vdso32/vclock_gettime.o \ $(obj)/vdso32/note.o \ $(obj)/vdso32/%.o $(call if_changed,vdso) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index bf969a0..42f641c 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -4,6 +4,9 @@ * * Fast user context implementation of clock_gettime, gettimeofday, and time. * + * 32 Bit compat layer by Stefani Seibold stef...@seibold.net + * sponsored by Rohde Schwarz GmbH Co. KG Munich/Germany + * * The code should have no internal unresolved relocations. * Check with readelf after changing. */ @@ -24,6 +27,8 @@ #include asm/io.h #include asm/pvclock.h +#ifndef BUILD_VDSO32 + #define gtod (VVAR(vsyscall_gtod_data)) static notrace cycle_t vread_hpet(void) @@ -47,6 +52,54 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) 0 (__NR_gettimeofday), D (tv), S (tz) : memory); return ret; } +#else + +struct vsyscall_gtod_data vvar_vsyscall_gtod_data + __attribute__((visibility(hidden))); + +u8 hpet_page + __attribute__((visibility(hidden))); + +#define gtod (vvar_vsyscall_gtod_data) + +#ifdef CONFIG_HPET_TIMER +static notrace cycle_t vread_hpet(void) +{ + return readl(hpet_page + HPET_COUNTER); +} +#endif + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + + asm( + push %%ebx \n + mov %2,%%ebx \n + call VDSO32_vsyscall \n + pop %%ebx \n + : =a (ret) + : 0 (__NR_clock_gettime), d (clock), c (ts) + : memory); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, + struct timezone *tz) +{ + long ret; + + asm( + push %%ebx \n + mov %2,%%ebx \n + call VDSO32_vsyscall \n + pop %%ebx \n + : =a (ret) + 
: 0 (__NR_gettimeofday), d (tv), c (tz) + : memory); + return ret; +} +#endif #ifdef CONFIG_PARAVIRT_CLOCK @@ -152,12 +205,14 @@ notrace static cycle_t vread_tsc(void) notrace static inline u64 vgetsns(int *mode) { - long v; + u64 v; cycles_t cycles; if (gtod-clock.vclock_mode == VCLOCK_TSC)
[PATCH v9 0/8] Add 32 bit VDSO time function support
From: Stefani Seibold stef...@seibold.net This patch adds the functions vdso_gettimeofday(), vdso_clock_gettime() and vdso_time() to the 32 bit VDSO. The reason to do this was to get a fast, reliable time stamp. Many developers use the TSC to get a fast time stamp, without knowing the pitfalls. The VDSO time functions are a fast and reliable way, because the kernel knows the best time source and the P- and C-state of the CPU. The helper library to use the VDSO functions can be downloaded at http://seibold.net/vdso.c The library is very small, only 228 lines of code. Compile it with gcc -Wall -O3 -fpic vdso.c -lrt -shared -o libvdso.so and use it with LD_PRELOAD=path/libvdso.so This kind of helper must be integrated into glibc; for x86 64 bit and PowerPC it is already there. Some linux 32 bit kernel benchmark results (all measurements are in nanoseconds): Intel(R) Celeron(TM) CPU 400MHz Average time kernel call: gettimeofday(): 1039 clock_gettime(): 1578 time(): 526 Average time VDSO call: gettimeofday(): 378 clock_gettime(): 303 time(): 60 Celeron(R) Dual-Core CPU T3100 1.90GHz Average time kernel call: gettimeofday(): 209 clock_gettime(): 406 time(): 135 Average time VDSO call: gettimeofday(): 51 clock_gettime(): 43 time(): 10 So you can see a performance increase by a factor of between 4 and 13, depending on the CPU and the function. The address layout of the VDSO has changed, because there is no fixed address space available on an x86 32 bit kernel, despite the name: someone decided to add an offset to __FIXADDR_TOP for virtualization. Also the IA32 Emulation uses the whole 4 GB address space, so there is no fixed address available. This was the reason not to depend on this kind of address and to change the layout of the VDSO. 
The VDSO for a 32 bit application now has three pages:

^ Higher Address
|
+----------------------------------------+
+ VDSO page (includes code) ro+x         +
+----------------------------------------+
+ VVAR page (export kernel variables) ro +
+----------------------------------------+
+ HPET page (mapped registers) ro        +
+----------------------------------------+
|
^ Lower Address

The VDSO page for a 32 bit application still resides at 0xe000; the VVAR and HPET pages are mapped before it. In the non compat mode the VMA of the VDSO is now 3 pages for a 32 bit kernel. So this decreases the available logical address space by 2 pages. The patch is against kernel 3.14 (e7651b819e90da924991d727d3c007200a18670d) Changelog: 25.11.2012 - first release and proof of concept for linux 3.4 11.12.2012 - Port to linux 3.7 and code cleanup 12.12.2012 - fixes suggested by Andy Lutomirski - fixes suggested by John Stultz - use call VDSO32_vsyscall instead of int 80 - code cleanup 17.12.2012 - support for IA32_EMULATION, this includes - code cleanup - include cleanup to fix compile warnings and errors - move out seqcount from seqlock, enable use in VDSO - map FIXMAP and HPET into the 32 bit address space 18.12.2012 - split into separate patches 30.01.2014 - revamp the code - code clean up - VDSO layout changed - no fixed addresses - port to 3.14 01.02.2014 - code cleanup 02.02.2014 - code cleanup - split into more patches - use HPET_COUNTER instead of hard coded value - fix changelog to the right year ;-) 02.02.2014 - reverse the mapping, this makes the new VDSO 32 bit support fully compatible. 03.02.2014 - code cleanup - fix comment - fix ABI break in vdso32.lds.S -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v10 2/8] Add new func _install_special_mapping() to mmap.c
From: Stefani Seibold stef...@seibold.net _install_special_mapping() is the new base function for install_special_mapping(). This function returns a pointer to the created VMA, or an error code in an ERR_PTR(). This new function will be needed by the vdso 32 bit support to map the additional vvar and hpet pages into the 32 bit address space. This will be done with io_remap_pfn_range() and remap_pfn_range(), which require a vm_area_struct. Signed-off-by: Stefani Seibold stef...@seibold.net --- include/linux/mm.h | 3 +++ mm/mmap.c | 20 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f28f46e..55342aa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1740,6 +1740,9 @@ extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); +extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long flags, struct page **pages); extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); diff --git a/mm/mmap.c b/mm/mmap.c index 20ff0c3..81ba54f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2918,7 +2918,7 @@ static const struct vm_operations_struct special_mapping_vmops = { * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. 
*/ -int install_special_mapping(struct mm_struct *mm, +struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { @@ -2927,7 +2927,7 @@ int install_special_mapping(struct mm_struct *mm, vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (unlikely(vma == NULL)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(vma-anon_vma_chain); vma-vm_mm = mm; @@ -2948,11 +2948,23 @@ int install_special_mapping(struct mm_struct *mm, perf_event_mmap(vma); - return 0; + return vma; out: kmem_cache_free(vm_area_cachep, vma); - return ret; + return ERR_PTR(ret); +} + +int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, struct page **pages) +{ + struct vm_area_struct *vma = _install_special_mapping(mm, + addr, len, vm_flags, pages); + + if (IS_ERR(vma)) + return PTR_ERR(vma); + return 0; } static DEFINE_MUTEX(mm_all_locks_mutex); -- 1.8.5.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH -tip v2 4/8] perf-probe: Use _stext based address instead of the symbol name
On Wed, 29 Jan 2014 09:14:59 +, Masami Hiramatsu wrote: diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 4a9f43b..120954b 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -387,6 +387,44 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs, return ret; } +/* Post processing the probe events */ +static int post_process_probe_trace_events(struct probe_trace_event *tevs, +int ntevs, const char *module, +bool uprobe) +{ + struct symbol *sym; + struct map *map; + unsigned long stext = 0; + char *tmp; + int i; + + if (uprobe) + return add_exec_to_probe_trace_events(tevs, ntevs, module); + + /* Note that currently _stext based probe is not for drivers */ + if (module) + return add_module_to_probe_trace_events(tevs, ntevs, module); + + sym = __find_kernel_function_by_name(_stext, map); Couldn't we just use kmap-ref_reloc_sym instead of the hard-coded _stext? You might want to check the Adrian's recent kaslr fixes (now in tip/perf/urgent). Thanks, Namhyung + if (!sym) { + pr_debug(Failed to find _stext. Use original symbol name.\n); + return 0; + } + stext = map-unmap_ip(map, sym-start); + + for (i = 0; i ntevs; i++) { + if (tevs[i].point.address) { + tmp = strdup(_stext); + if (!tmp) + return -ENOMEM; + free(tevs[i].point.symbol); + tevs[i].point.symbol = tmp; + tevs[i].point.offset = tevs[i].point.address - stext; + } + } + return 0; +} + -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v10 8/8] Add 32 bit VDSO time support for 64 bit kernel
From: Stefani Seibold stef...@seibold.net This patch add the VDSO time support for the IA32 Emulation Layer. Due the nature of the kernel headers and the LP64 compiler where the size of a long and a pointer differs against a 32 bit compiler, there is a lot of type hacking necessary. Signed-off-by: Stefani Seibold stef...@seibold.net --- arch/x86/vdso/vclock_gettime.c| 109 +++--- arch/x86/vdso/vdso32/vclock_gettime.c | 7 +++ 2 files changed, 95 insertions(+), 21 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 42f641c..8ba8db8 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -31,12 +31,24 @@ #define gtod (VVAR(vsyscall_gtod_data)) +struct api_timeval { + longtv_sec; /* seconds */ + longtv_usec;/* microseconds */ +}; + +struct api_timespec { + longtv_sec; /* seconds */ + longtv_nsec;/* nanoseconds */ +}; + +typedef long api_time_t; + static notrace cycle_t vread_hpet(void) { return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); } -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static long vdso_fallback_gettime(long clock, struct api_timespec *ts) { long ret; asm(syscall : =a (ret) : @@ -44,7 +56,8 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) return ret; } -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +notrace static long vdso_fallback_gtod(struct api_timeval *tv, + struct timezone *tz) { long ret; @@ -54,14 +67,62 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) } #else +#ifdef CONFIG_IA32_EMULATION +typedef s64arch_time_t; + +struct arch_timespec { + s64 tv_sec; + s64 tv_nsec; +}; + +#define ALIGN8 __attribute__ ((aligned (8))) + +struct arch_vsyscall_gtod_data { + seqcount_t seq ALIGN8; + + struct { /* extract of a clocksource struct */ + int vclock_mode ALIGN8; + cycle_t cycle_last ALIGN8; + cycle_t mask ALIGN8; + u32 
mult; + u32 shift; + } clock; + + /* open coded 'struct timespec' */ + arch_time_t wall_time_sec; + u64 wall_time_snsec; + u64 monotonic_time_snsec; + arch_time_t monotonic_time_sec; + + struct timezone sys_tz; + struct arch_timespec wall_time_coarse; + struct arch_timespec monotonic_time_coarse; +}; + +struct arch_vsyscall_gtod_data vvar_vsyscall_gtod_data + __attribute__((visibility(hidden))); +#else struct vsyscall_gtod_data vvar_vsyscall_gtod_data __attribute__((visibility(hidden))); +#endif u8 hpet_page __attribute__((visibility(hidden))); #define gtod (vvar_vsyscall_gtod_data) +struct api_timeval { + s32 tv_sec; /* seconds */ + s32 tv_usec;/* microseconds */ +}; + +struct api_timespec { + s32 tv_sec; /* seconds */ + s32 tv_nsec;/* microseconds */ +}; + +typedef s32 api_time_t; + #ifdef CONFIG_HPET_TIMER static notrace cycle_t vread_hpet(void) { @@ -69,7 +130,7 @@ static notrace cycle_t vread_hpet(void) } #endif -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static long vdso_fallback_gettime(long clock, struct api_timespec *ts) { long ret; @@ -79,12 +140,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) call VDSO32_vsyscall \n pop %%ebx \n : =a (ret) - : 0 (__NR_clock_gettime), d (clock), c (ts) + : 0 (__NR_ia32_clock_gettime), d (clock), c (ts) : memory); return ret; } -notrace static long vdso_fallback_gtod(struct timeval *tv, +notrace static long vdso_fallback_gtod(struct api_timeval *tv, struct timezone *tz) { long ret; @@ -95,7 +156,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, call VDSO32_vsyscall \n pop %%ebx \n : =a (ret) - : 0 (__NR_gettimeofday), d (tv), c (tz) + : 0 (__NR_ia32_gettimeofday), d (tv), c (tz) : memory); return ret; } @@ -284,42 +345,48 @@ notrace static void do_monotonic_coarse(struct timespec *ts) } while (unlikely(read_seqcount_retry(gtod-seq, seq))); } -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +notrace int 
__vdso_clock_gettime(clockid_t clock, struct api_timespec *ts) { + struct timespec tmp; + switch (clock) { case CLOCK_REALTIME: - if (do_realtime(ts) == VCLOCK_NONE) + if (do_realtime(tmp) ==
[PATCH v10 3/8] revamp vclock_gettime.c
From: Stefani Seibold stef...@seibold.net This intermediate patch revamps vclock_gettime.c by moving some functions around. It exists only to split up the changes, to make the whole 32 bit vdso timer patch easier to review. Signed-off-by: Stefani Seibold stef...@seibold.net --- arch/x86/vdso/vclock_gettime.c | 85 +- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index eb5d7a5..bbc8065 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -26,41 +26,26 @@ #define gtod (VVAR(vsyscall_gtod_data)) -notrace static cycle_t vread_tsc(void) +static notrace cycle_t vread_hpet(void) { - cycle_t ret; - u64 last; - - /* -* Empirically, a fence (of type that depends on the CPU) -* before rdtsc is enough to ensure that rdtsc is ordered -* with respect to loads. The various CPU manuals are unclear -* as to whether rdtsc can be reordered with later loads, -* but no one has ever seen it happen. -*/ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - - last = VVAR(vsyscall_gtod_data).clock.cycle_last; - - if (likely(ret = last)) - return ret; + return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); +} - /* -* GCC likes to generate cmov here, but this branch is extremely -* predictable (it's just a funciton of time and the likely is -* very likely) and there's a data dependence, so force GCC -* to generate a branch instead. I don't barrier() because -* we don't actually need a barrier, and if this function -* ever gets inlined it will generate worse code. 
-*/ - asm volatile (); - return last; +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm(syscall : =a (ret) : + 0 (__NR_clock_gettime), D (clock), S (ts) : memory); + return ret; } -static notrace cycle_t vread_hpet(void) +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { - return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); + long ret; + + asm(syscall : =a (ret) : + 0 (__NR_gettimeofday), D (tv), S (tz) : memory); + return ret; } #ifdef CONFIG_PARAVIRT_CLOCK @@ -133,23 +118,37 @@ static notrace cycle_t vread_pvclock(int *mode) } #endif -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static cycle_t vread_tsc(void) { - long ret; - asm(syscall : =a (ret) : - 0 (__NR_clock_gettime),D (clock), S (ts) : memory); - return ret; -} + cycle_t ret; + u64 last; -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; + /* +* Empirically, a fence (of type that depends on the CPU) +* before rdtsc is enough to ensure that rdtsc is ordered +* with respect to loads. The various CPU manuals are unclear +* as to whether rdtsc can be reordered with later loads, +* but no one has ever seen it happen. +*/ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); - asm(syscall : =a (ret) : - 0 (__NR_gettimeofday), D (tv), S (tz) : memory); - return ret; -} + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + if (likely(ret = last)) + return ret; + + /* +* GCC likes to generate cmov here, but this branch is extremely +* predictable (it's just a funciton of time and the likely is +* very likely) and there's a data dependence, so force GCC +* to generate a branch instead. I don't barrier() because +* we don't actually need a barrier, and if this function +* ever gets inlined it will generate worse code. 
+*/ + asm volatile (); + return last; +} notrace static inline u64 vgetsns(int *mode) { -- 1.8.5.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/