[tip:perf/urgent] perf data: Fix 'strncat may truncate' build failure with recent gcc
Commit-ID: 97acec7df172cd1e450f81f5e293c0aa145a2797 Gitweb: https://git.kernel.org/tip/97acec7df172cd1e450f81f5e293c0aa145a2797 Author: Shawn Landden AuthorDate: Sat, 18 May 2019 15:32:38 -0300 Committer: Arnaldo Carvalho de Melo CommitDate: Tue, 28 May 2019 09:49:03 -0300 perf data: Fix 'strncat may truncate' build failure with recent gcc This strncat() is safe because the buffer was allocated with zalloc(), however gcc doesn't know that. Since the string always has 4 non-null bytes, just use memcpy() here. CC /home/shawn/linux/tools/perf/util/data-convert-bt.o In file included from /usr/include/string.h:494, from /home/shawn/linux/tools/lib/traceevent/event-parse.h:27, from util/data-convert-bt.c:22: In function ‘strncat’, inlined from ‘string_set_value’ at util/data-convert-bt.c:274:4: /usr/include/powerpc64le-linux-gnu/bits/string_fortified.h:136:10: error: ‘__builtin_strncat’ output may be truncated copying 4 bytes from a string of length 4 [-Werror=stringop-truncation] 136 | return __builtin___strncat_chk (__dest, __src, __len, __bos (__dest)); | ^~ Signed-off-by: Shawn Landden Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan LPU-Reference: 20190518183238.10954-1-sh...@git.icu Link: https://lkml.kernel.org/n/tip-289f1jice17ta7tr3tstm...@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data-convert-bt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index e0311c9750ad..9097543a818b 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -271,7 +271,7 @@ static int string_set_value(struct bt_ctf_field *field, const char *string) if (i > 0) strncpy(buffer, string, i); } - strncat(buffer + p, numstr, 4); + memcpy(buffer + p, numstr, 4); p += 3; } }
Re: [PATCH 05/44] perf data: Fix 'strncat may truncate' build failure with recent gcc
On Tue, May 28, 2019 at 8:04 AM Arnaldo Carvalho de Melo wrote: > > Em Mon, May 27, 2019 at 05:46:26PM -0500, Shawn Landden escreveu: > > On Mon, May 27, 2019 at 5:38 PM Arnaldo Carvalho de Melo > > wrote: > > > > > > From: Shawn Landden > > > > > > This strncat() is safe because the buffer was allocated with zalloc(), > > > however gcc doesn't know that. Since the string always has 4 non-null > > > bytes, just use memcpy() here. > > > > > > CC /home/shawn/linux/tools/perf/util/data-convert-bt.o > > > In file included from /usr/include/string.h:494, > > >from > > > /home/shawn/linux/tools/lib/traceevent/event-parse.h:27, > > >from util/data-convert-bt.c:22: > > > In function ‘strncat’, > > > inlined from ‘string_set_value’ at util/data-convert-bt.c:274:4: > > > /usr/include/powerpc64le-linux-gnu/bits/string_fortified.h:136:10: > > > error: ‘__builtin_strncat’ output may be truncated copying 4 bytes from a > > > string of length 4 [-Werror=stringop-truncation] > > > 136 | return __builtin___strncat_chk (__dest, __src, __len, __bos > > > (__dest)); > > > | > > > ^~ > > > > > > Signed-off-by: Shawn Landden > > > Cc: Adrian Hunter > > > Cc: Jiri Olsa > > > Cc: Namhyung Kim > > > Cc: Wang Nan > > > LPU-Reference: 20190518183238.10954-1-sh...@git.icu > > > Link: > > > https://lkml.kernel.org/n/tip-289f1jice17ta7tr3tstm...@git.kernel.org > > > Signed-off-by: Arnaldo Carvalho de Melo > > > --- > > > tools/perf/util/data-convert-bt.c | 2 +- > > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > > > diff --git a/tools/perf/util/data-convert-bt.c > > > b/tools/perf/util/data-convert-bt.c > > > index e0311c9750ad..9097543a818b 100644 > > > --- a/tools/perf/util/data-convert-bt.c > > > +++ b/tools/perf/util/data-convert-bt.c > > > @@ -271,7 +271,7 @@ static int string_set_value(struct bt_ctf_field > > > *field, const char *string) > > > if (i > 0) > > > strncpy(buffer, string, i); > > > } > > > - strncat(buffer + p, numstr, 4); > > > + memcpy(buffer + p, numstr, 4); > > I 
took care to have enough context in my patch that you could see what > > was going on. I wonder if there is a way to make that care > > propagate when people add Signed-off-by: lines. > > I just checked and the patch is the same, the description I only changed > the subject line, so that when one uses: Functionally, yes. However look at how my version has enough context that you can immediately know that the patch is correct (instead of the default of 5 lines): https://www.spinics.net/lists/linux-perf-users/msg08563.html > >git log --oneline > > we can know what is the component and what kind of build failure was > that. > > - Arnaldo > > > > p += 3; > > > } > > > } > > > -- > > > 2.20.1 > > > > > -- > > - Arnaldo
Re: [PATCH 05/44] perf data: Fix 'strncat may truncate' build failure with recent gcc
On Mon, May 27, 2019 at 5:38 PM Arnaldo Carvalho de Melo wrote: > > From: Shawn Landden > > This strncat() is safe because the buffer was allocated with zalloc(), > however gcc doesn't know that. Since the string always has 4 non-null > bytes, just use memcpy() here. > > CC /home/shawn/linux/tools/perf/util/data-convert-bt.o > In file included from /usr/include/string.h:494, >from > /home/shawn/linux/tools/lib/traceevent/event-parse.h:27, >from util/data-convert-bt.c:22: > In function ‘strncat’, > inlined from ‘string_set_value’ at util/data-convert-bt.c:274:4: > /usr/include/powerpc64le-linux-gnu/bits/string_fortified.h:136:10: error: > ‘__builtin_strncat’ output may be truncated copying 4 bytes from a string of > length 4 [-Werror=stringop-truncation] > 136 | return __builtin___strncat_chk (__dest, __src, __len, __bos > (__dest)); > | > ^~~~~~~~~~ > > Signed-off-by: Shawn Landden > Cc: Adrian Hunter > Cc: Jiri Olsa > Cc: Namhyung Kim > Cc: Wang Nan > LPU-Reference: 20190518183238.10954-1-sh...@git.icu > Link: https://lkml.kernel.org/n/tip-289f1jice17ta7tr3tstm...@git.kernel.org > Signed-off-by: Arnaldo Carvalho de Melo > --- > tools/perf/util/data-convert-bt.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/tools/perf/util/data-convert-bt.c > b/tools/perf/util/data-convert-bt.c > index e0311c9750ad..9097543a818b 100644 > --- a/tools/perf/util/data-convert-bt.c > +++ b/tools/perf/util/data-convert-bt.c > @@ -271,7 +271,7 @@ static int string_set_value(struct bt_ctf_field *field, > const char *string) > if (i > 0) > strncpy(buffer, string, i); > } > - strncat(buffer + p, numstr, 4); > + memcpy(buffer + p, numstr, 4); I took care to have enough context in my patch that you could see what was going on. I wonder if there is a way to make that care propate when people add Signed-off-by: lines. > p += 3; > } > } > -- > 2.20.1 >
[PATCH RESEND] powerpc: add simd.h implementation specific to PowerPC
It is safe to do SIMD in an interrupt on PowerPC. Only disable when there is no SIMD available (and this is a static branch). Tested and works with the WireGuard (Zinc) patch I wrote that needs this. Also improves performance of the crypto subsystem that checks this. Re-sending because this linuxppc-dev didn't seem to accept it. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=203571 Signed-off-by: Shawn Landden --- arch/powerpc/include/asm/simd.h | 15 +++ 1 file changed, 15 insertions(+) create mode 100644 arch/powerpc/include/asm/simd.h diff --git a/arch/powerpc/include/asm/simd.h b/arch/powerpc/include/asm/simd.h new file mode 100644 index 0..b3fecb95a --- /dev/null +++ b/arch/powerpc/include/asm/simd.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include + +/* + * may_use_simd - whether it is allowable at this time to issue SIMD + *instructions or access the SIMD register file + * + * As documented in Chapter 6.2.1 Machine Status Save/Restore Registers + * of Power ISA (2.07 and 3.0), all registers are saved/restored in an interrupt. + */ +static inline bool may_use_simd(void) +{ + return !cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE); +} -- 2.21.0.1020.gf2820cf01a
[PATCH] powerpc: add simd.h implementation specific to PowerPC
It is safe to do SIMD in an interrupt on PowerPC. Only disable when there is no SIMD available (and this is a static branch). Tested and works with the WireGuard (Zinc) patch I wrote that needs this. Also improves performance of the crypto subsystem that checks this. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=203571 Signed-off-by: Shawn Landden --- arch/powerpc/include/asm/simd.h | 15 +++ 1 file changed, 15 insertions(+) create mode 100644 arch/powerpc/include/asm/simd.h diff --git a/arch/powerpc/include/asm/simd.h b/arch/powerpc/include/asm/simd.h new file mode 100644 index 0..b3fecb95a --- /dev/null +++ b/arch/powerpc/include/asm/simd.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include + +/* + * may_use_simd - whether it is allowable at this time to issue SIMD + *instructions or access the SIMD register file + * + * As documented in Chapter 6.2.1 Machine Status Save/Restore Registers + * of Power ISA (2.07 and 3.0), all registers are saved/restored in an interrupt. + */ +static inline bool may_use_simd(void) +{ + return !cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE); +} -- 2.21.0.1020.gf2820cf01a
[PATCH] new flag COPY_FILE_RANGE_FILESIZE for copy_file_range()
If flags includes COPY_FILE_RANGE_FILESIZE then the length copied is the length of the file. off_in and off_out are ignored. len must be 0 or the file size. This implementation saves a call to stat() in the common case of copying files. It does not fix any race conditions, but that is possible in the future with this interface. EAGAIN: If COPY_FILE_RANGE_FILESIZE was passed and len is not 0 or the file size. Signed-off-by: Shawn Landden CC: --- fs/read_write.c | 14 +- include/uapi/linux/stat.h | 4 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/read_write.c b/fs/read_write.c index 61b43ad7608e..6d06361f0856 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1557,7 +1557,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_out = file_inode(file_out); ssize_t ret; - if (flags != 0) + if ((flags & ~COPY_FILE_RANGE_FILESIZE) != 0) return -EINVAL; if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) @@ -1565,6 +1565,18 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; + if (flags & COPY_FILE_RANGE_FILESIZE) { + struct kstat stat; + int error; + error = vfs_getattr(_in->f_path, , + STATX_SIZE, 0); + if (error < 0) + return error; + if (!(len == 0 || len == stat.size)) + return -EAGAIN; + len = stat.size; + } + ret = rw_verify_area(READ, file_in, _in, len); if (unlikely(ret)) return ret; diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 7b35e98d3c58..1075aa4666ef 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -170,5 +170,9 @@ struct statx { #define STATX_ATTR_AUTOMOUNT 0x1000 /* Dir: Automount trigger */ +/* + * Flags for copy_file_range() + */ +#define COPY_FILE_RANGE_FILESIZE 0x0001 /* Copy the full length of the input file */ #endif /* _UAPI_LINUX_STAT_H */ -- 2.20.1
[PATCH] new flag COPY_FILE_RANGE_FILESIZE for copy_file_range()
If flags includes COPY_FILE_RANGE_FILESIZE then the length copied is the length of the file. off_in and off_out are ignored. len must be 0 or the file size. This implementation saves a call to stat() in the common case of copying files. It does not fix any race conditions, but that is possible in the future with this interface. EAGAIN: If COPY_FILE_RANGE_FILESIZE was passed and len is not 0 or the file size. Signed-off-by: Shawn Landden CC: --- fs/read_write.c | 14 +- include/uapi/linux/stat.h | 4 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/read_write.c b/fs/read_write.c index 61b43ad7608e..6d06361f0856 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1557,7 +1557,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_out = file_inode(file_out); ssize_t ret; - if (flags != 0) + if ((flags & ~COPY_FILE_RANGE_FILESIZE) != 0) return -EINVAL; if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) @@ -1565,6 +1565,18 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; + if (flags & COPY_FILE_RANGE_FILESIZE) { + struct kstat stat; + int error; + error = vfs_getattr(_in->f_path, , + STATX_SIZE, 0); + if (error < 0) + return error; + if (!(len == 0 || len == stat.size)) + return -EAGAIN; + len = stat.size; + } + ret = rw_verify_area(READ, file_in, _in, len); if (unlikely(ret)) return ret; diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 7b35e98d3c58..1075aa4666ef 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -170,5 +170,9 @@ struct statx { #define STATX_ATTR_AUTOMOUNT 0x1000 /* Dir: Automount trigger */ +/* + * Flags for copy_file_range() + */ +#define COPY_FILE_RANGE_FILESIZE 0x0001 /* Copy the full length of the input file */ #endif /* _UAPI_LINUX_STAT_H */ -- 2.20.1
Re: [RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends ma
On Mon, Nov 20, 2017 at 9:16 PM, Shawn Landden <sland...@gmail.com> wrote: > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl > > Android uses this memory model for all programs, and having it in the > kernel will enable integration with the page cache (not in this > series). What about having a dedicated way to kill these type of processes, instead of overloading the OOM killer? This was suggested by Colin Walters <walt...@verbum.org>
Re: [RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends ma
On Mon, Nov 20, 2017 at 9:16 PM, Shawn Landden wrote: > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl > > Android uses this memory model for all programs, and having it in the > kernel will enable integration with the page cache (not in this > series). What about having a dedicated way to kill these type of processes, instead of overloading the OOM killer? This was suggested by Colin Walters
[RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may no
See my systemd patch: https://github.com/shawnl/systemd/tree/prctl Android uses this memory model for all programs, and having it in the kernel will enable integration with the page cache (not in this series). v2 switch to prctl, memcg support v3 use put OOM after constraint checking v4 ignore memcg OOMs as should have been all along (sry for the noise) --- fs/eventpoll.c | 9 + fs/proc/array.c| 7 +++ include/linux/memcontrol.h | 1 + include/linux/oom.h| 4 include/linux/sched.h | 1 + include/uapi/linux/prctl.h | 4 kernel/exit.c | 1 + kernel/sys.c | 9 + mm/oom_kill.c | 43 +++ 9 files changed, 79 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..5b3f084b22d5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include /* * LOCKING: @@ -1761,6 +1763,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; + DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback); + + if (current->oom_target) + add_wait_queue(oom_target_get_wait(), _target_wait); if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); @@ -1850,6 +1856,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; + if (current->oom_target) + remove_wait_queue(oom_target_get_wait(), _target_wait); + return res; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 9390032a11e1..1954ae87cb88 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_putc(m, '\n'); } +static inline void task_idle(struct seq_file *m, struct task_struct *p) +{ + seq_put_decimal_ull(m, "Idle:\t", p->oom_target); + seq_putc(m, '\n'); +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -381,6 +387,7 @@ 
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_sig(m, task); task_cap(m, task); task_seccomp(m, task); + task_idle(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..471d1d52ae72 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -30,6 +30,7 @@ #include #include #include +#include struct mem_cgroup; struct page; diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..88acea9e0a59 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -102,6 +102,10 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern void exit_oom_target(void); +struct wait_queue_head *oom_target_get_wait(void); +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index fdf74f27acf1..51b0e5987e8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -652,6 +652,7 @@ struct task_struct { /* disallow userland-initiated cgroup migration */ unsignedno_cgroup_migration:1; #endif + unsignedoom_target:1; unsigned long atomic_flags; /* Flags requiring atomic access. 
*/ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index b640071421f7..94868317c6f2 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -198,4 +198,8 @@ struct prctl_mm_map { # define PR_CAP_AMBIENT_LOWER 3 # define PR_CAP_AMBIENT_CLEAR_ALL 4 +#define PR_SET_IDLE48 +#define PR_GET_IDLE49 +# define PR_IDLE_MODE_KILLME 1 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/exit.c b/kernel/exit.c index f6cad39f35df..2788fbdae267 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 524a4cb9bbe2..e1eb049a85e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; + case PR_SET_IDLE: +
[RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may no
See my systemd patch: https://github.com/shawnl/systemd/tree/prctl Android uses this memory model for all programs, and having it in the kernel will enable integration with the page cache (not in this series). v2 switch to prctl, memcg support v3 use put OOM after constraint checking v4 ignore memcg OOMs as should have been all along (sry for the noise) --- fs/eventpoll.c | 9 + fs/proc/array.c| 7 +++ include/linux/memcontrol.h | 1 + include/linux/oom.h| 4 include/linux/sched.h | 1 + include/uapi/linux/prctl.h | 4 kernel/exit.c | 1 + kernel/sys.c | 9 + mm/oom_kill.c | 43 +++ 9 files changed, 79 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..5b3f084b22d5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include /* * LOCKING: @@ -1761,6 +1763,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; + DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback); + + if (current->oom_target) + add_wait_queue(oom_target_get_wait(), _target_wait); if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); @@ -1850,6 +1856,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; + if (current->oom_target) + remove_wait_queue(oom_target_get_wait(), _target_wait); + return res; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 9390032a11e1..1954ae87cb88 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_putc(m, '\n'); } +static inline void task_idle(struct seq_file *m, struct task_struct *p) +{ + seq_put_decimal_ull(m, "Idle:\t", p->oom_target); + seq_putc(m, '\n'); +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -381,6 +387,7 @@ 
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_sig(m, task); task_cap(m, task); task_seccomp(m, task); + task_idle(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..471d1d52ae72 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -30,6 +30,7 @@ #include #include #include +#include struct mem_cgroup; struct page; diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..88acea9e0a59 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -102,6 +102,10 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern void exit_oom_target(void); +struct wait_queue_head *oom_target_get_wait(void); +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index fdf74f27acf1..51b0e5987e8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -652,6 +652,7 @@ struct task_struct { /* disallow userland-initiated cgroup migration */ unsignedno_cgroup_migration:1; #endif + unsignedoom_target:1; unsigned long atomic_flags; /* Flags requiring atomic access. 
*/ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index b640071421f7..94868317c6f2 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -198,4 +198,8 @@ struct prctl_mm_map { # define PR_CAP_AMBIENT_LOWER 3 # define PR_CAP_AMBIENT_CLEAR_ALL 4 +#define PR_SET_IDLE48 +#define PR_GET_IDLE49 +# define PR_IDLE_MODE_KILLME 1 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/exit.c b/kernel/exit.c index f6cad39f35df..2788fbdae267 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 524a4cb9bbe2..e1eb049a85e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; + case PR_SET_IDLE: +
Re: [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends ma
On Mon, Nov 20, 2017 at 8:49 PM, Shawn Landden <sland...@gmail.com> wrote: > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl > > Android uses this memory model for all programs, and having it in the > kernel will enable integration with the page cache (not in this > series). > > v2 > switch to prctl, memcg support > > v3 > use > put OOM after constraint checking > --- > fs/eventpoll.c | 27 > fs/proc/array.c| 7 ++ > include/linux/memcontrol.h | 3 +++ > include/linux/oom.h| 4 +++ > include/linux/sched.h | 1 + > include/uapi/linux/prctl.h | 4 +++ > kernel/cgroup/cgroup.c | 61 > ++ > kernel/exit.c | 1 + > kernel/sys.c | 9 +++ > mm/memcontrol.c| 2 ++ > mm/oom_kill.c | 47 +++ > 11 files changed, 166 insertions(+) > > diff --git a/fs/eventpoll.c b/fs/eventpoll.c > index 2fabd19cdeea..745662f9a7e1 100644 > --- a/fs/eventpoll.c > +++ b/fs/eventpoll.c > @@ -43,6 +43,8 @@ > #include > #include > #include > +#include > +#include > > /* > * LOCKING: > @@ -1761,6 +1763,19 @@ static int ep_poll(struct eventpoll *ep, struct > epoll_event __user *events, > u64 slack = 0; > wait_queue_entry_t wait; > ktime_t expires, *to = NULL; > + DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback); > + DEFINE_WAIT_FUNC(oom_target_wait_mcg, oom_target_callback); > + > + if (current->oom_target) { > +#ifdef CONFIG_MEMCG > + struct mem_cgroup *mcg; > + > + mcg = mem_cgroup_from_task(current); > + if (mcg) > + add_wait_queue(>oom_target, > _target_wait_mcg); > +#endif > + add_wait_queue(oom_target_get_wait(), _target_wait); > + } > > if (timeout > 0) { > struct timespec64 end_time = ep_set_mstimeout(timeout); > @@ -1850,6 +1865,18 @@ static int ep_poll(struct eventpoll *ep, struct > epoll_event __user *events, > !(res = ep_send_events(ep, events, maxevents)) && !timed_out) > goto fetch_events; > > + if (current->oom_target) { > +#ifdef CONFIG_MEMCG > + struct mem_cgroup *mcg; > + > + mcg = mem_cgroup_from_task(current); > + if (mcg) > + remove_wait_queue(>oom_target, > + 
_target_wait_mcg); > +#endif > + remove_wait_queue(oom_target_get_wait(), _target_wait); > + } > + > return res; > } > > diff --git a/fs/proc/array.c b/fs/proc/array.c > index 9390032a11e1..1954ae87cb88 100644 > --- a/fs/proc/array.c > +++ b/fs/proc/array.c > @@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, > struct task_struct *p) > seq_putc(m, '\n'); > } > > +static inline void task_idle(struct seq_file *m, struct task_struct *p) > +{ > + seq_put_decimal_ull(m, "Idle:\t", p->oom_target); > + seq_putc(m, '\n'); > +} > + > static inline void task_context_switch_counts(struct seq_file *m, > struct task_struct *p) > { > @@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct > pid_namespace *ns, > task_sig(m, task); > task_cap(m, task); > task_seccomp(m, task); > + task_idle(m, task); > task_cpus_allowed(m, task); > cpuset_task_status_allowed(m, task); > task_context_switch_counts(m, task); > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 69966c461d1c..02eb92e7eff5 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -30,6 +30,7 @@ > #include > #include > #include > +#include > > struct mem_cgroup; > struct page; > @@ -261,6 +262,8 @@ struct mem_cgroup { > struct list_head event_list; > spinlock_t event_list_lock; > > + wait_queue_head_t oom_target; > + > struct mem_cgroup_per_node *nodeinfo[0]; > /* WARNING: nodeinfo must be the last member here */ > }; > diff --git a/include/linux/oom.h b/include/linux/oom.h > index 01c91d874a57..88acea9e0a59 100644 > --- a/include/linux/oom.h > +++ b/include/linux/oom.h > @@ -102,6 +102,10 @@ extern void oom_killer_enable(void); > > extern struct task_struct *find_lock_task_mm(struct task_struct *p); > > +extern void exit_oom_target(voi
Re: [RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends ma
On Mon, Nov 20, 2017 at 8:49 PM, Shawn Landden wrote: > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl > > Android uses this memory model for all programs, and having it in the > kernel will enable integration with the page cache (not in this > series). > > v2 > switch to prctl, memcg support > > v3 > use > put OOM after constraint checking > --- > fs/eventpoll.c | 27 > fs/proc/array.c| 7 ++ > include/linux/memcontrol.h | 3 +++ > include/linux/oom.h| 4 +++ > include/linux/sched.h | 1 + > include/uapi/linux/prctl.h | 4 +++ > kernel/cgroup/cgroup.c | 61 > ++ > kernel/exit.c | 1 + > kernel/sys.c | 9 +++ > mm/memcontrol.c| 2 ++ > mm/oom_kill.c | 47 +++ > 11 files changed, 166 insertions(+) > > diff --git a/fs/eventpoll.c b/fs/eventpoll.c > index 2fabd19cdeea..745662f9a7e1 100644 > --- a/fs/eventpoll.c > +++ b/fs/eventpoll.c > @@ -43,6 +43,8 @@ > #include > #include > #include > +#include > +#include > > /* > * LOCKING: > @@ -1761,6 +1763,19 @@ static int ep_poll(struct eventpoll *ep, struct > epoll_event __user *events, > u64 slack = 0; > wait_queue_entry_t wait; > ktime_t expires, *to = NULL; > + DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback); > + DEFINE_WAIT_FUNC(oom_target_wait_mcg, oom_target_callback); > + > + if (current->oom_target) { > +#ifdef CONFIG_MEMCG > + struct mem_cgroup *mcg; > + > + mcg = mem_cgroup_from_task(current); > + if (mcg) > + add_wait_queue(>oom_target, > _target_wait_mcg); > +#endif > + add_wait_queue(oom_target_get_wait(), _target_wait); > + } > > if (timeout > 0) { > struct timespec64 end_time = ep_set_mstimeout(timeout); > @@ -1850,6 +1865,18 @@ static int ep_poll(struct eventpoll *ep, struct > epoll_event __user *events, > !(res = ep_send_events(ep, events, maxevents)) && !timed_out) > goto fetch_events; > > + if (current->oom_target) { > +#ifdef CONFIG_MEMCG > + struct mem_cgroup *mcg; > + > + mcg = mem_cgroup_from_task(current); > + if (mcg) > + remove_wait_queue(>oom_target, > + _target_wait_mcg); > +#endif 
> + remove_wait_queue(oom_target_get_wait(), _target_wait); > + } > + > return res; > } > > diff --git a/fs/proc/array.c b/fs/proc/array.c > index 9390032a11e1..1954ae87cb88 100644 > --- a/fs/proc/array.c > +++ b/fs/proc/array.c > @@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, > struct task_struct *p) > seq_putc(m, '\n'); > } > > +static inline void task_idle(struct seq_file *m, struct task_struct *p) > +{ > + seq_put_decimal_ull(m, "Idle:\t", p->oom_target); > + seq_putc(m, '\n'); > +} > + > static inline void task_context_switch_counts(struct seq_file *m, > struct task_struct *p) > { > @@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct > pid_namespace *ns, > task_sig(m, task); > task_cap(m, task); > task_seccomp(m, task); > + task_idle(m, task); > task_cpus_allowed(m, task); > cpuset_task_status_allowed(m, task); > task_context_switch_counts(m, task); > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 69966c461d1c..02eb92e7eff5 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -30,6 +30,7 @@ > #include > #include > #include > +#include > > struct mem_cgroup; > struct page; > @@ -261,6 +262,8 @@ struct mem_cgroup { > struct list_head event_list; > spinlock_t event_list_lock; > > + wait_queue_head_t oom_target; > + > struct mem_cgroup_per_node *nodeinfo[0]; > /* WARNING: nodeinfo must be the last member here */ > }; > diff --git a/include/linux/oom.h b/include/linux/oom.h > index 01c91d874a57..88acea9e0a59 100644 > --- a/include/linux/oom.h > +++ b/include/linux/oom.h > @@ -102,6 +102,10 @@ extern void oom_killer_enable(void); > > extern struct task_struct *find_lock_task_mm(struct task_struct *p); > > +extern void exit_oom_target(void); > +struct wait_queue_head *oom_target_get_
[RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may no
See my systemd patch: https://github.com/shawnl/systemd/tree/prctl Android uses this memory model for all programs, and having it in the kernel will enable integration with the page cache (not in this series). v2 switch to prctl, memcg support v3 use put OOM after constraint checking --- fs/eventpoll.c | 27 fs/proc/array.c| 7 ++ include/linux/memcontrol.h | 3 +++ include/linux/oom.h| 4 +++ include/linux/sched.h | 1 + include/uapi/linux/prctl.h | 4 +++ kernel/cgroup/cgroup.c | 61 ++ kernel/exit.c | 1 + kernel/sys.c | 9 +++ mm/memcontrol.c| 2 ++ mm/oom_kill.c | 47 +++ 11 files changed, 166 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..745662f9a7e1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include /* * LOCKING: @@ -1761,6 +1763,19 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; + DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback); + DEFINE_WAIT_FUNC(oom_target_wait_mcg, oom_target_callback); + + if (current->oom_target) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *mcg; + + mcg = mem_cgroup_from_task(current); + if (mcg) + add_wait_queue(>oom_target, _target_wait_mcg); +#endif + add_wait_queue(oom_target_get_wait(), _target_wait); + } if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); @@ -1850,6 +1865,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; + if (current->oom_target) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *mcg; + + mcg = mem_cgroup_from_task(current); + if (mcg) + remove_wait_queue(>oom_target, + _target_wait_mcg); +#endif + remove_wait_queue(oom_target_get_wait(), _target_wait); + } + return res; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 9390032a11e1..1954ae87cb88 100644 --- a/fs/proc/array.c +++ 
b/fs/proc/array.c @@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_putc(m, '\n'); } +static inline void task_idle(struct seq_file *m, struct task_struct *p) +{ + seq_put_decimal_ull(m, "Idle:\t", p->oom_target); + seq_putc(m, '\n'); +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_sig(m, task); task_cap(m, task); task_seccomp(m, task); + task_idle(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..02eb92e7eff5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -30,6 +30,7 @@ #include #include #include +#include struct mem_cgroup; struct page; @@ -261,6 +262,8 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; + wait_queue_head_t oom_target; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..88acea9e0a59 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -102,6 +102,10 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern void exit_oom_target(void); +struct wait_queue_head *oom_target_get_wait(void); +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index fdf74f27acf1..51b0e5987e8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -652,6 +652,7 @@ struct task_struct { /* disallow userland-initiated cgroup migration */ unsignedno_cgroup_migration:1; #endif + unsignedoom_target:1; 
unsigned long atomic_flags; /* Flags requiring atomic access. */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index
[RFC v3] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight.
See my systemd patch: https://github.com/shawnl/systemd/tree/prctl Android uses this memory model for all programs, and having it in the kernel will enable integration with the page cache (not in this series). v2 switch to prctl, memcg support v3 use put OOM after constraint checking --- fs/eventpoll.c | 27 fs/proc/array.c| 7 ++ include/linux/memcontrol.h | 3 +++ include/linux/oom.h| 4 +++ include/linux/sched.h | 1 + include/uapi/linux/prctl.h | 4 +++ kernel/cgroup/cgroup.c | 61 ++ kernel/exit.c | 1 + kernel/sys.c | 9 +++ mm/memcontrol.c| 2 ++ mm/oom_kill.c | 47 +++ 11 files changed, 166 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..745662f9a7e1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include /* * LOCKING: @@ -1761,6 +1763,19 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; + DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback); + DEFINE_WAIT_FUNC(oom_target_wait_mcg, oom_target_callback); + + if (current->oom_target) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *mcg; + + mcg = mem_cgroup_from_task(current); + if (mcg) + add_wait_queue(>oom_target, _target_wait_mcg); +#endif + add_wait_queue(oom_target_get_wait(), _target_wait); + } if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); @@ -1850,6 +1865,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; + if (current->oom_target) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *mcg; + + mcg = mem_cgroup_from_task(current); + if (mcg) + remove_wait_queue(>oom_target, + _target_wait_mcg); +#endif + remove_wait_queue(oom_target_get_wait(), _target_wait); + } + return res; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 9390032a11e1..1954ae87cb88 100644 --- a/fs/proc/array.c +++ 
b/fs/proc/array.c @@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_putc(m, '\n'); } +static inline void task_idle(struct seq_file *m, struct task_struct *p) +{ + seq_put_decimal_ull(m, "Idle:\t", p->oom_target); + seq_putc(m, '\n'); +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_sig(m, task); task_cap(m, task); task_seccomp(m, task); + task_idle(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..02eb92e7eff5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -30,6 +30,7 @@ #include #include #include +#include struct mem_cgroup; struct page; @@ -261,6 +262,8 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; + wait_queue_head_t oom_target; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..88acea9e0a59 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -102,6 +102,10 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern void exit_oom_target(void); +struct wait_queue_head *oom_target_get_wait(void); +int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index fdf74f27acf1..51b0e5987e8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -652,6 +652,7 @@ struct task_struct { /* disallow userland-initiated cgroup migration */ unsignedno_cgroup_migration:1; #endif + unsignedoom_target:1; 
unsigned long atomic_flags; /* Flags requiring atomic access. */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index
Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
On Mon, Nov 20, 2017 at 12:35 AM, Michal Hocko <mho...@kernel.org> wrote: > On Fri 17-11-17 20:45:03, Shawn Landden wrote: >> On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko <mho...@kernel.org> wrote: >> >> > On Thu 02-11-17 23:35:44, Shawn Landden wrote: >> > > It is common for services to be stateless around their main event loop. >> > > If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it >> > > signals to the kernel that epoll_wait() and friends may not complete, >> > > and the kernel may send SIGKILL if resources get tight. >> > > >> > > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl >> > > >> > > Android uses this memory model for all programs, and having it in the >> > > kernel will enable integration with the page cache (not in this >> > > series). >> > > >> > > 16 bytes per process is kinda spendy, but I want to keep >> > > lru behavior, which mem_score_adj does not allow. When a supervisor, >> > > like Android's user input is keeping track this can be done in >> > user-space. >> > > It could be pulled out of task_struct if an cross-indexing additional >> > > red-black tree is added to support pid-based lookup. >> > >> > This is still an abuse and the patch is wrong. We really do have an API >> > to use I fail to see why you do not use it. >> > >> When I looked at wait_queue_head_t it was 20 bytes. > > I do not understand. What I meant to say is that we do have a proper > user api to hint OOM killer decisions. This is a FIFO queue, rather than a heuristic, which is all you get with the current API. > -- > Michal Hocko > SUSE Labs
Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
On Mon, Nov 20, 2017 at 12:35 AM, Michal Hocko wrote: > On Fri 17-11-17 20:45:03, Shawn Landden wrote: >> On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko wrote: >> >> > On Thu 02-11-17 23:35:44, Shawn Landden wrote: >> > > It is common for services to be stateless around their main event loop. >> > > If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it >> > > signals to the kernel that epoll_wait() and friends may not complete, >> > > and the kernel may send SIGKILL if resources get tight. >> > > >> > > See my systemd patch: https://github.com/shawnl/systemd/tree/prctl >> > > >> > > Android uses this memory model for all programs, and having it in the >> > > kernel will enable integration with the page cache (not in this >> > > series). >> > > >> > > 16 bytes per process is kinda spendy, but I want to keep >> > > lru behavior, which mem_score_adj does not allow. When a supervisor, >> > > like Android's user input is keeping track this can be done in >> > user-space. >> > > It could be pulled out of task_struct if an cross-indexing additional >> > > red-black tree is added to support pid-based lookup. >> > >> > This is still an abuse and the patch is wrong. We really do have an API >> > to use I fail to see why you do not use it. >> > >> When I looked at wait_queue_head_t it was 20 byes. > > I do not understand. What I meant to say is that we do have a proper > user api to hint OOM killer decisions. This is a FIFO queue, rather than a heuristic, which is all you get with the current API. > -- > Michal Hocko > SUSE Labs
Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko <mho...@kernel.org> wrote: > On Thu 02-11-17 23:35:44, Shawn Landden wrote: >> 16 bytes per process is kinda spendy, but I want to keep >> lru behavior, which mem_score_adj does not allow. When a supervisor, >> like Android's user input is keeping track this can be done in user-space. >> It could be pulled out of task_struct if an cross-indexing additional >> red-black tree is added to support pid-based lookup. > > This is still an abuse and the patch is wrong. We really do have an API > to use I fail to see why you do not use it. When I looked at wait_queue_head_t it was 20 bytes.
Re: [RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
On Fri, Nov 3, 2017 at 2:09 AM, Michal Hocko wrote: > On Thu 02-11-17 23:35:44, Shawn Landden wrote: >> 16 bytes per process is kinda spendy, but I want to keep >> lru behavior, which mem_score_adj does not allow. When a supervisor, >> like Android's user input is keeping track this can be done in user-space. >> It could be pulled out of task_struct if an cross-indexing additional >> red-black tree is added to support pid-based lookup. > > This is still an abuse and the patch is wrong. We really do have an API > to use I fail to see why you do not use it. When I looked at wait_queue_head_t it was 20 bytes.
[RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight. See my systemd patch: https://github.com/shawnl/systemd/tree/prctl Android uses this memory model for all programs, and having it in the kernel will enable integration with the page cache (not in this series). 16 bytes per process is kinda spendy, but I want to keep lru behavior, which mem_score_adj does not allow. When a supervisor, like Android's user input is keeping track this can be done in user-space. It could be pulled out of task_struct if an cross-indexing additional red-black tree is added to support pid-based lookup. v2 switch to prctl, memcg support --- fs/eventpoll.c | 17 + fs/proc/array.c| 7 ++ include/linux/memcontrol.h | 3 +++ include/linux/oom.h| 4 include/linux/sched.h | 4 include/uapi/linux/prctl.h | 4 kernel/cgroup/cgroup.c | 12 ++ kernel/exit.c | 2 ++ kernel/sys.c | 9 +++ mm/memcontrol.c| 4 mm/oom_kill.c | 60 ++ 11 files changed, 126 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..04011fca038b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -43,6 +43,7 @@ #include #include #include +#include /* * LOCKING: @@ -1762,6 +1763,14 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, wait_queue_entry_t wait; ktime_t expires, *to = NULL; + if (current->oom_target) { + spin_lock(oom_target_get_spinlock(current)); + list_add(>se.oom_target_queue, +oom_target_get_queue(current)); + current->se.oom_target_on_queue = 1; + spin_unlock(oom_target_get_spinlock(current)); + } + if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); @@ -1783,6 +1792,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (!ep_events_available(ep)) ep_busy_loop(ep, timed_out); + spin_lock_irqsave(>lock, flags); if 
(!ep_events_available(ep)) { @@ -1850,6 +1860,13 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; + if (current->oom_target) { + spin_lock(oom_target_get_spinlock(current)); + list_del(>se.oom_target_queue); + current->se.oom_target_on_queue = 0; + spin_unlock(oom_target_get_spinlock(current)); + } + return res; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 77a8eacbe032..cab009727a7f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -349,6 +349,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_putc(m, '\n'); } +static inline void task_idle(struct seq_file *m, struct task_struct *p) +{ + seq_put_decimal_ull(m, "Idle:\t", p->oom_target); + seq_putc(m, '\n'); +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -380,6 +386,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_sig(m, task); task_cap(m, task); task_seccomp(m, task); + task_idle(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..40a2db8ae522 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -261,6 +261,9 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; + struct list_headoom_target_queue; + spinlock_t oom_target_spinlock; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; diff --git a/include/linux/oom.h b/include/linux/oom.h index 76aac4ce39bc..a5d16eb05297 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -101,6 +101,10 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern void exit_oom_target(void); +struct list_head *oom_target_get_queue(struct 
task_struct *ts); +spinlock_t *oom_target_get_spinlock(struct task_struct *ts); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a7df4e558c..2b110c4d7357 100644 ---
[RFC v2] prctl: prctl(PR_SET_IDLE, PR_IDLE_MODE_KILLME), for stateless idle loops
It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may not complete, and the kernel may send SIGKILL if resources get tight. See my systemd patch: https://github.com/shawnl/systemd/tree/prctl Android uses this memory model for all programs, and having it in the kernel will enable integration with the page cache (not in this series). 16 bytes per process is kinda spendy, but I want to keep lru behavior, which mem_score_adj does not allow. When a supervisor, like Android's user input is keeping track this can be done in user-space. It could be pulled out of task_struct if an cross-indexing additional red-black tree is added to support pid-based lookup. v2 switch to prctl, memcg support --- fs/eventpoll.c | 17 + fs/proc/array.c| 7 ++ include/linux/memcontrol.h | 3 +++ include/linux/oom.h| 4 include/linux/sched.h | 4 include/uapi/linux/prctl.h | 4 kernel/cgroup/cgroup.c | 12 ++ kernel/exit.c | 2 ++ kernel/sys.c | 9 +++ mm/memcontrol.c| 4 mm/oom_kill.c | 60 ++ 11 files changed, 126 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..04011fca038b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -43,6 +43,7 @@ #include #include #include +#include /* * LOCKING: @@ -1762,6 +1763,14 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, wait_queue_entry_t wait; ktime_t expires, *to = NULL; + if (current->oom_target) { + spin_lock(oom_target_get_spinlock(current)); + list_add(>se.oom_target_queue, +oom_target_get_queue(current)); + current->se.oom_target_on_queue = 1; + spin_unlock(oom_target_get_spinlock(current)); + } + if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); @@ -1783,6 +1792,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (!ep_events_available(ep)) ep_busy_loop(ep, timed_out); + spin_lock_irqsave(>lock, flags); if 
(!ep_events_available(ep)) { @@ -1850,6 +1860,13 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; + if (current->oom_target) { + spin_lock(oom_target_get_spinlock(current)); + list_del(>se.oom_target_queue); + current->se.oom_target_on_queue = 0; + spin_unlock(oom_target_get_spinlock(current)); + } + return res; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 77a8eacbe032..cab009727a7f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -349,6 +349,12 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_putc(m, '\n'); } +static inline void task_idle(struct seq_file *m, struct task_struct *p) +{ + seq_put_decimal_ull(m, "Idle:\t", p->oom_target); + seq_putc(m, '\n'); +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -380,6 +386,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_sig(m, task); task_cap(m, task); task_seccomp(m, task); + task_idle(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..40a2db8ae522 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -261,6 +261,9 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; + struct list_headoom_target_queue; + spinlock_t oom_target_spinlock; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; diff --git a/include/linux/oom.h b/include/linux/oom.h index 76aac4ce39bc..a5d16eb05297 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -101,6 +101,10 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern void exit_oom_target(void); +struct list_head *oom_target_get_queue(struct 
task_struct *ts); +spinlock_t *oom_target_get_spinlock(struct task_struct *ts); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a7df4e558c..2b110c4d7357 100644 ---
[RFC] EPOLL_KILLME: New flag to epoll_wait() that subscribes process to death row (new syscall)
It is common for services to be stateless around their main event loop. If a process passes the EPOLL_KILLME flag to epoll_wait5() then it signals to the kernel that epoll_wait5() may not complete, and the kernel may send SIGKILL if resources get tight. See my systemd patch: https://github.com/shawnl/systemd/tree/killme Android uses this memory model for all programs, and having it in the kernel will enable integration with the page cache (not in this series). --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/eventpoll.c | 74 +- include/linux/eventpoll.h | 2 + include/linux/sched.h | 3 ++ include/uapi/asm-generic/unistd.h | 5 ++- include/uapi/linux/eventpoll.h | 3 ++ kernel/exit.c | 2 + mm/oom_kill.c | 17 9 files changed, 105 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 448ac2161112..040e5d02bdcc 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -391,3 +391,4 @@ 382i386pkey_free sys_pkey_free 383i386statx sys_statx 384i386arch_prctl sys_arch_prctl compat_sys_arch_prctl +385i386epoll_wait5 sys_epoll_wait5 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5aef183e2f85..c72802e8cf65 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -339,6 +339,7 @@ 330common pkey_alloc sys_pkey_alloc 331common pkey_free sys_pkey_free 332common statx sys_statx +333common epoll_wait5 sys_epoll_wait5 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..76d1c91d940b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -297,6 +297,14 @@ static LIST_HEAD(visited_list); */ static LIST_HEAD(tfile_check_list); +static LIST_HEAD(deathrow_q); +static long deathrow_len __read_mostly; + +/* TODO: Can this lock be removed by using atomic 
instructions to update + * queue? + */ +static DEFINE_MUTEX(deathrow_mutex); + #ifdef CONFIG_SYSCTL #include @@ -314,6 +322,15 @@ struct ctl_table epoll_table[] = { .extra1 = , .extra2 = _max, }, + { + .procname = "deathrow_size", + .data = _len, + .maxlen = sizeof(deathrow_len), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + .extra1 = , + .extra2 = _max, + }, { } }; #endif /* CONFIG_SYSCTL */ @@ -2164,9 +2181,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). + * + * A flags argument cannot be added to epoll_pwait cause it already has + * the maximum number of arguments (6). Can this be fixed? */ -SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, - int, maxevents, int, timeout) +SYSCALL_DEFINE5(epoll_wait5, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout, int, flags) { int error; struct fd f; @@ -2199,14 +2219,44 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, */ ep = f.file->private_data; + /* Check the EPOLL_* constants for conflicts. */ + BUILD_BUG_ON(EPOLL_KILLME == EPOLL_CLOEXEC); + + if (flags & ~EPOLL_KILLME) + return -EINVAL; + + if (flags & EPOLL_KILLME) { + /* Put process on death row. */ + mutex_lock(_mutex); + deathrow_len++; + list_add(>se.deathrow, _q); + current->se.on_deathrow = 1; + mutex_unlock(_mutex); + } + /* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, timeout); + if (flags & EPOLL_KILLME) { + /* Remove process from death row. 
*/ + mutex_lock(_mutex); + current->se.on_deathrow = 0; + list_del(>se.deathrow); + deathrow_len--; + mutex_unlock(_mutex); + } + error_fput: fdput(f); return error; } +SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout) +{ + return sys_epoll_wait5(epfd, events, maxevents, timeout, 0); +} + /* * Implement the event wait interface for the
[RFC] EPOLL_KILLME: New flag to epoll_wait() that subscribes process to death row (new syscall)
It is common for services to be stateless around their main event loop. If a process passes the EPOLL_KILLME flag to epoll_wait5() then it signals to the kernel that epoll_wait5() may not complete, and the kernel may send SIGKILL if resources get tight. See my systemd patch: https://github.com/shawnl/systemd/tree/killme Android uses this memory model for all programs, and having it in the kernel will enable integration with the page cache (not in this series). --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/eventpoll.c | 74 +- include/linux/eventpoll.h | 2 + include/linux/sched.h | 3 ++ include/uapi/asm-generic/unistd.h | 5 ++- include/uapi/linux/eventpoll.h | 3 ++ kernel/exit.c | 2 + mm/oom_kill.c | 17 9 files changed, 105 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 448ac2161112..040e5d02bdcc 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -391,3 +391,4 @@ 382i386pkey_free sys_pkey_free 383i386statx sys_statx 384i386arch_prctl sys_arch_prctl compat_sys_arch_prctl +385i386epoll_wait5 sys_epoll_wait5 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5aef183e2f85..c72802e8cf65 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -339,6 +339,7 @@ 330common pkey_alloc sys_pkey_alloc 331common pkey_free sys_pkey_free 332common statx sys_statx +333common epoll_wait5 sys_epoll_wait5 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..76d1c91d940b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -297,6 +297,14 @@ static LIST_HEAD(visited_list); */ static LIST_HEAD(tfile_check_list); +static LIST_HEAD(deathrow_q); +static long deathrow_len __read_mostly; + +/* TODO: Can this lock be removed by using atomic 
instructions to update + * queue? + */ +static DEFINE_MUTEX(deathrow_mutex); + #ifdef CONFIG_SYSCTL #include @@ -314,6 +322,15 @@ struct ctl_table epoll_table[] = { .extra1 = , .extra2 = _max, }, + { + .procname = "deathrow_size", + .data = _len, + .maxlen = sizeof(deathrow_len), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + .extra1 = , + .extra2 = _max, + }, { } }; #endif /* CONFIG_SYSCTL */ @@ -2164,9 +2181,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). + * + * A flags argument cannot be added to epoll_pwait cause it already has + * the maximum number of arguments (6). Can this be fixed? */ -SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, - int, maxevents, int, timeout) +SYSCALL_DEFINE5(epoll_wait5, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout, int, flags) { int error; struct fd f; @@ -2199,14 +2219,44 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, */ ep = f.file->private_data; + /* Check the EPOLL_* constants for conflicts. */ + BUILD_BUG_ON(EPOLL_KILLME == EPOLL_CLOEXEC); + + if (flags & ~EPOLL_KILLME) + return -EINVAL; + + if (flags & EPOLL_KILLME) { + /* Put process on death row. */ + mutex_lock(_mutex); + deathrow_len++; + list_add(>se.deathrow, _q); + current->se.on_deathrow = 1; + mutex_unlock(_mutex); + } + /* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, timeout); + if (flags & EPOLL_KILLME) { + /* Remove process from death row. 
*/ + mutex_lock(_mutex); + current->se.on_deathrow = 0; + list_del(>se.deathrow); + deathrow_len--; + mutex_unlock(_mutex); + } + error_fput: fdput(f); return error; } +SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout) +{ + return sys_epoll_wait5(epfd, events, maxevents, timeout, 0); +} + /* * Implement the event wait interface for the
[PATCH] timerfd: show procfs fdinfo helper, and now accepts write()
| pos: 0 | flags:02004002 | clockid: 0 | ticks:6 Cc: Thomas Gleixner Cc: Alexander Viro Signed-off-by: Shawn Landden --- fs/timerfd.c | 64 1 file changed, 64 insertions(+) diff --git a/fs/timerfd.c b/fs/timerfd.c index 9293121..2e81bdb 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -25,6 +25,7 @@ #include #include #include +#include struct timerfd_ctx { union { @@ -284,10 +285,73 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return res; } +#ifdef CONFIG_PROC_FS +static int timerfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct timerfd_ctx *ctx = f->private_data; + + seq_printf(m, "clockid:\t%d\n" + "ticks:\t%llu\n", ctx->clockid, ctx->ticks); + + return 0; +} +#endif + +static ssize_t timerfd_write(struct file *file, const char __user *buf, size_t count, +loff_t *ppos) +{ + struct timerfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 ucnt; + DECLARE_WAITQUEUE(wait, current); + + if (count < sizeof(ucnt)) + return -EINVAL; + if (copy_from_user(, buf, sizeof(ucnt))) + return -EFAULT; + if (ucnt == ULLONG_MAX) + return -EINVAL; + spin_lock_irq(>wqh.lock); + res = -EAGAIN; + if (ULLONG_MAX - ctx->ticks > ucnt) + res = sizeof(ucnt); + else if (!(file->f_flags & O_NONBLOCK)) { + __add_wait_queue(>wqh, ); + for (res = 0;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (ULLONG_MAX - ctx->ticks > ucnt) { + res = sizeof(ucnt); + break; + } + if (signal_pending(current)) { + res = -ERESTARTSYS; + break; + } + spin_unlock_irq(>wqh.lock); + schedule(); + spin_lock_irq(>wqh.lock); + } + __remove_wait_queue(>wqh, ); + __set_current_state(TASK_RUNNING); + } + if (likely(res > 0)) { + ctx->ticks += ucnt; + if (waitqueue_active(>wqh)) + wake_up_locked_poll(>wqh, POLLIN); + } + spin_unlock_irq(>wqh.lock); + + return res; +} + static const struct file_operations timerfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo= timerfd_show_fdinfo, +#endif .release= timerfd_release, .poll = timerfd_poll, .read = timerfd_read, + .write = 
timerfd_write, .llseek = noop_llseek, }; -- 1.8.5.2.297.g3e57c29 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timerfd: show procfs fdinfo helper, and now accepts write()
| pos: 0 | flags:02004002 | clockid: 0 | ticks:6 Cc: Thomas Gleixner t...@linutronix.de Cc: Alexander Viro v...@zeniv.linux.org.uk Signed-off-by: Shawn Landden sh...@churchofgit.com --- fs/timerfd.c | 64 1 file changed, 64 insertions(+) diff --git a/fs/timerfd.c b/fs/timerfd.c index 9293121..2e81bdb 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -25,6 +25,7 @@ #include linux/syscalls.h #include linux/compat.h #include linux/rcupdate.h +#include linux/seq_file.h struct timerfd_ctx { union { @@ -284,10 +285,73 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return res; } +#ifdef CONFIG_PROC_FS +static int timerfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct timerfd_ctx *ctx = f-private_data; + + seq_printf(m, clockid:\t%d\n + ticks:\t%llu\n, ctx-clockid, ctx-ticks); + + return 0; +} +#endif + +static ssize_t timerfd_write(struct file *file, const char __user *buf, size_t count, +loff_t *ppos) +{ + struct timerfd_ctx *ctx = file-private_data; + ssize_t res; + __u64 ucnt; + DECLARE_WAITQUEUE(wait, current); + + if (count sizeof(ucnt)) + return -EINVAL; + if (copy_from_user(ucnt, buf, sizeof(ucnt))) + return -EFAULT; + if (ucnt == ULLONG_MAX) + return -EINVAL; + spin_lock_irq(ctx-wqh.lock); + res = -EAGAIN; + if (ULLONG_MAX - ctx-ticks ucnt) + res = sizeof(ucnt); + else if (!(file-f_flags O_NONBLOCK)) { + __add_wait_queue(ctx-wqh, wait); + for (res = 0;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (ULLONG_MAX - ctx-ticks ucnt) { + res = sizeof(ucnt); + break; + } + if (signal_pending(current)) { + res = -ERESTARTSYS; + break; + } + spin_unlock_irq(ctx-wqh.lock); + schedule(); + spin_lock_irq(ctx-wqh.lock); + } + __remove_wait_queue(ctx-wqh, wait); + __set_current_state(TASK_RUNNING); + } + if (likely(res 0)) { + ctx-ticks += ucnt; + if (waitqueue_active(ctx-wqh)) + wake_up_locked_poll(ctx-wqh, POLLIN); + } + spin_unlock_irq(ctx-wqh.lock); + + return res; +} + static const struct file_operations timerfd_fops = { 
+#ifdef CONFIG_PROC_FS + .show_fdinfo= timerfd_show_fdinfo, +#endif .release= timerfd_release, .poll = timerfd_poll, .read = timerfd_read, + .write = timerfd_write, .llseek = noop_llseek, }; -- 1.8.5.2.297.g3e57c29 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timerfd: show procfs fdinfo helper
| pos: 0 | flags:02004002 | clockid: 0 Cc: Thomas Gleixner Cc: Alexander Viro Signed-off-by: Shawn Landden --- fs/timerfd.c | 17 + 1 file changed, 17 insertions(+) diff --git a/fs/timerfd.c b/fs/timerfd.c index 9293121..e5fa587 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -25,6 +25,7 @@ #include #include #include +#include struct timerfd_ctx { union { @@ -284,7 +285,23 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return res; } +#ifdef CONFIG_PROC_FS +static int timerfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct timerfd_ctx *ctx = f->private_data; + int clockid; + + clockid = ctx->clockid; + seq_printf(m, "clockid:\t%d\n", clockid); + + return 0; +} +#endif + static const struct file_operations timerfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo= timerfd_show_fdinfo, +#endif .release= timerfd_release, .poll = timerfd_poll, .read = timerfd_read, -- 1.8.5.2.297.g3e57c29 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] timerfd: show procfs fdinfo helper
| pos: 0 | flags:02004002 | clockid: 0 Cc: Thomas Gleixner t...@linutronix.de Cc: Alexander Viro v...@zeniv.linux.org.uk Signed-off-by: Shawn Landden sh...@churchofgit.com --- fs/timerfd.c | 17 + 1 file changed, 17 insertions(+) diff --git a/fs/timerfd.c b/fs/timerfd.c index 9293121..e5fa587 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -25,6 +25,7 @@ #include linux/syscalls.h #include linux/compat.h #include linux/rcupdate.h +#include linux/seq_file.h struct timerfd_ctx { union { @@ -284,7 +285,23 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return res; } +#ifdef CONFIG_PROC_FS +static int timerfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct timerfd_ctx *ctx = f-private_data; + int clockid; + + clockid = ctx-clockid; + seq_printf(m, clockid:\t%d\n, clockid); + + return 0; +} +#endif + static const struct file_operations timerfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo= timerfd_show_fdinfo, +#endif .release= timerfd_release, .poll = timerfd_poll, .read = timerfd_read, -- 1.8.5.2.297.g3e57c29 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] tty: only print sysrq help for handlers that are enabled
Also print out a notice when sysrq is in selective mode. Signed-off-by: Shawn Landden --- drivers/tty/sysrq.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ce396ec..4eee0e4 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -548,9 +548,14 @@ void __handle_sysrq(int key, bool check_mask) ; if (j != i) continue; - printk("%s ", sysrq_key_table[i]->help_msg); + /* only print if handler is enabled */ + if (sysrq_enabled & 1 || + sysrq_enabled & sysrq_key_table[i]->enable_mask) + printk("%s ", sysrq_key_table[i]->help_msg); } } + if (!(sysrq_enabled & 1)) + printk("(some options are disabled)"); printk("\n"); console_loglevel = orig_log_level; } -- 1.8.4.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] tty: only print sysrq help for handlers that are enabled
Also print out a notice when sysrq is in selective mode. Signed-off-by: Shawn Landden sh...@churchofgit.com --- drivers/tty/sysrq.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ce396ec..4eee0e4 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -548,9 +548,14 @@ void __handle_sysrq(int key, bool check_mask) ; if (j != i) continue; - printk(%s , sysrq_key_table[i]-help_msg); + /* only print if handler is enabled */ + if (sysrq_enabled 1 || + sysrq_enabled sysrq_key_table[i]-enable_mask) + printk(%s , sysrq_key_table[i]-help_msg); } } + if (!(sysrq_enabled 1)) + printk((some options are disabled)); printk(\n); console_loglevel = orig_log_level; } -- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] update consumers of MSG_MORE to recognize MSG_SENDPAGE_NOTLAST
On Mon, Nov 25, 2013 at 7:36 AM, Shawn Landden wrote: > Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) > added an internal flag MSG_SENDPAGE_NOTLAST, similar to > MSG_MORE. > > algif_hash, algif_skcipher, and udp used MSG_MORE from tcp_sendpages() > and need to see the new flag as identical to MSG_MORE. > > This fixes sendfile() on AF_ALG. > > v3: also fix udp > > Cc: Tom Herbert > Cc: Eric Dumazet > Cc: David S. Miller > Cc: # 3.4.x + 3.2.x > Reported-and-tested-by: Shawn Landden > Original-patch: Richard Weinberger > Signed-off-by: Shawn Landden >May I ask why you took over the my patch without even CC'in me nor >replying to the original >thread "[PATCH] pipe_to_sendpage: Ensure that MSG_MORE is set if we >set MSG_SENDPAGE_NOTLAST"? >You are acting very rude. Not CCing you was an oversight. >The discussion at the original thread is not done. >Does skcipher_sendpage() really also need fixing? or UDP? UDP needs it or it will send out packets mid-sendfile. skcipher needs it or it will produce incorrect output like hash. >I didn't send another patch because I'm waiting for Eric's answer first. Eric forgot to update internal consumers of MSG_MORE when he created this distinction in sendpage. That became clear when he first responded. Changing this where the consumers are makes sense. I just want this bug fixed, feel free to send your next version. >Thanks, >//richard -Shawn -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: disappearing listen()ed SO_REUSEPORT sockets across fork() when using epoll
On Mon, Nov 25, 2013 at 12:05 PM, Mateusz Guzik wrote: > On Mon, Nov 25, 2013 at 11:53:24AM -0800, Shawn Landden wrote: >> On Mon, Nov 25, 2013 at 10:05 AM, Jason Baron wrote: >> > On 11/22/2013 12:53 PM, Shawn Landden wrote: >> >> Hello, when running the attached program on 3.12 child processes >> >> are missing a socket fd opened, set with SO_REUSEPORT, listen()ed to, >> >> and added to epoll_ctl(). >> >> >> >> This is the output I get when pointing "wget http://localhost:/; >> >> at the attached program: >> >> >> >> main PID 31591 >> >> PID 31634 started >> >> PID 31634 accept()ed connection >> >> PID 31635 started >> >> PID 31636 started >> >> PID 31635 accept() failed: Bad file descriptor >> >> PID 31636 accept() failed: Bad file descriptor >> >> PID 31634 accept()ed connection >> >> PID 31634 accept()ed connection >> >> PID 31634 accept()ed connection >> >> PID 31634 accept()ed connection >> >> >> >> >> >> While I would expect something like: >> >> >> >> main PID 31591 >> >> PID 31634 started >> >> PID 31634 accept()ed connection >> >> PID 31635 started >> >> PID 31636 started >> >> PID 31635 accept()ed connection >> >> PID 31636 accept()ed connection >> >> >> >> -more new processes, but inversely proportional to number of listening >> >> processes >> >> -accept() always returns successfully >> >> >> >> >> > >> > The 'close(sockfd);' looks to be racing with the accept() calls. Removing >> > seems >> > to get the result you are looking for. >> Interesting. That works, but it shouldn't. The close() is operating in >> the parent, so it shouldn't affect the child, >> there is a leak here of process separation. >> > > You fork, then close sockfd in the parent. Thus, the very first child > can accept connectins just fine. > > Subsequent forks give you children without sockfd, thus accept fails. > The first child continues to work just fine. Now I feel like an idiot. 
-- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: disappearing listen()ed SO_REUSEPORT sockets across fork() when using epoll
On Mon, Nov 25, 2013 at 10:05 AM, Jason Baron wrote: > On 11/22/2013 12:53 PM, Shawn Landden wrote: >> Hello, when running the attached program on 3.12 child processes >> are missing a socket fd opened, set with SO_REUSEPORT, listen()ed to, >> and added to epoll_ctl(). >> >> This is the output I get when pointing "wget http://localhost:/; >> at the attached program: >> >> main PID 31591 >> PID 31634 started >> PID 31634 accept()ed connection >> PID 31635 started >> PID 31636 started >> PID 31635 accept() failed: Bad file descriptor >> PID 31636 accept() failed: Bad file descriptor >> PID 31634 accept()ed connection >> PID 31634 accept()ed connection >> PID 31634 accept()ed connection >> PID 31634 accept()ed connection >> >> >> While I would expect something like: >> >> main PID 31591 >> PID 31634 started >> PID 31634 accept()ed connection >> PID 31635 started >> PID 31636 started >> PID 31635 accept()ed connection >> PID 31636 accept()ed connection >> >> -more new processes, but inversely proportional to number of listening >> processes >> -accept() always returns successfully >> >> > > The 'close(sockfd);' looks to be racing with the accept() calls. Removing > seems > to get the result you are looking for. Interesting. That works, but it shouldn't. The close() is operating in the parent, so it shouldn't affect the child, there is a leak here of process separation. New version with pid set to volatile, and making sure that we are in the parent. 
> > Thanks, > > -Jason > -- --- Shawn Landden +1 360 389 3001 (SMS preferred) #include #include #include #include #include #include #include int main( int argc, char *argv[]) { int sockfd, epollfd, acceptfd, portno; struct epoll_event event = {EPOLLIN, NULL}, gotevent; char buffer[256]; struct sockaddr_in serv_addr; int n; volatile pid_t pid; printf("main PID %d\n", getpid()); memset((char *) _addr, 0, sizeof(serv_addr)); portno = ; serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = INADDR_ANY; serv_addr.sin_port = htons(portno); if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) return printf("socket() failed: %m\n"); int optval = 1; if ((setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, , sizeof(optval))) < 0) return printf("setsockopt() failed: %m"); if (bind(sockfd, (struct sockaddr *) _addr, sizeof(serv_addr)) < 0) return printf("bind() failed: %m\n"); if (listen(sockfd, SOMAXCONN) < 0) return printf("listen() failed: %m\n"); if ((epollfd = epoll_create1(0)) < 0) return printf("epoll_create1() failed: %m\n"); if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, ) < 0) return printf("epoll_ctl() failed: %m\n"); while (1) { if (epoll_wait(epollfd, , 1, -1) != 1) return printf("epoll_wait() failed: %m\n"); pid = fork(); if (pid == 0) { printf("PID %d started\n", getpid()); while(1) { struct sockaddr_in cli_addr; socklen_t cli_size = sizeof cli_addr; if ((acceptfd = accept(sockfd, (struct sockaddr *)_addr, _size)) < 0) { printf("PID %d accept() failed: %m\n", getpid()); sleep(60); return 1; } printf("PID %d accept()ed connection\n", getpid()); if (close(acceptfd) < 0) return printf("close() failed: %m\n"); } } else if (pid > 0) close(sockfd); } }
Re: disappearing listen()ed SO_REUSEPORT sockets across fork() when using epoll
On Mon, Nov 25, 2013 at 10:05 AM, Jason Baron jba...@akamai.com wrote: On 11/22/2013 12:53 PM, Shawn Landden wrote: Hello, when running the attached program on 3.12 child processes are missing a socket fd opened, set with SO_REUSEPORT, listen()ed to, and added to epoll_ctl(). This is the output I get when pointing wget http://localhost:/; at the attached program: main PID 31591 PID 31634 started PID 31634 accept()ed connection PID 31635 started PID 31636 started PID 31635 accept() failed: Bad file descriptor PID 31636 accept() failed: Bad file descriptor PID 31634 accept()ed connection PID 31634 accept()ed connection PID 31634 accept()ed connection PID 31634 accept()ed connection While I would expect something like: main PID 31591 PID 31634 started PID 31634 accept()ed connection PID 31635 started PID 31636 started PID 31635 accept()ed connection PID 31636 accept()ed connection -more new processes, but inversely proportional to number of listening processes -accept() always returns successfully The 'close(sockfd);' looks to be racing with the accept() calls. Removing seems to get the result you are looking for. Interesting. That works, but it shouldn't. The close() is operating in the parent, so it shouldn't affect the child, there is a leak here of process separation. New version with pid set to volatile, and making sure that we are in the parent. 
Thanks, -Jason -- --- Shawn Landden +1 360 389 3001 (SMS preferred) #include stdio.h #include sys/types.h #include sys/socket.h #include netinet/in.h #include stdlib.h #include string.h #include sys/epoll.h int main( int argc, char *argv[]) { int sockfd, epollfd, acceptfd, portno; struct epoll_event event = {EPOLLIN, NULL}, gotevent; char buffer[256]; struct sockaddr_in serv_addr; int n; volatile pid_t pid; printf(main PID %d\n, getpid()); memset((char *) serv_addr, 0, sizeof(serv_addr)); portno = ; serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = INADDR_ANY; serv_addr.sin_port = htons(portno); if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) 0) return printf(socket() failed: %m\n); int optval = 1; if ((setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, optval, sizeof(optval))) 0) return printf(setsockopt() failed: %m); if (bind(sockfd, (struct sockaddr *) serv_addr, sizeof(serv_addr)) 0) return printf(bind() failed: %m\n); if (listen(sockfd, SOMAXCONN) 0) return printf(listen() failed: %m\n); if ((epollfd = epoll_create1(0)) 0) return printf(epoll_create1() failed: %m\n); if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, event) 0) return printf(epoll_ctl() failed: %m\n); while (1) { if (epoll_wait(epollfd, gotevent, 1, -1) != 1) return printf(epoll_wait() failed: %m\n); pid = fork(); if (pid == 0) { printf(PID %d started\n, getpid()); while(1) { struct sockaddr_in cli_addr; socklen_t cli_size = sizeof cli_addr; if ((acceptfd = accept(sockfd, (struct sockaddr *)cli_addr, cli_size)) 0) { printf(PID %d accept() failed: %m\n, getpid()); sleep(60); return 1; } printf(PID %d accept()ed connection\n, getpid()); if (close(acceptfd) 0) return printf(close() failed: %m\n); } } else if (pid 0) close(sockfd); } }
Re: disappearing listen()ed SO_REUSEPORT sockets across fork() when using epoll
On Mon, Nov 25, 2013 at 12:05 PM, Mateusz Guzik mgu...@redhat.com wrote: On Mon, Nov 25, 2013 at 11:53:24AM -0800, Shawn Landden wrote: On Mon, Nov 25, 2013 at 10:05 AM, Jason Baron jba...@akamai.com wrote: On 11/22/2013 12:53 PM, Shawn Landden wrote: Hello, when running the attached program on 3.12 child processes are missing a socket fd opened, set with SO_REUSEPORT, listen()ed to, and added to epoll_ctl(). This is the output I get when pointing wget http://localhost:/; at the attached program: main PID 31591 PID 31634 started PID 31634 accept()ed connection PID 31635 started PID 31636 started PID 31635 accept() failed: Bad file descriptor PID 31636 accept() failed: Bad file descriptor PID 31634 accept()ed connection PID 31634 accept()ed connection PID 31634 accept()ed connection PID 31634 accept()ed connection While I would expect something like: main PID 31591 PID 31634 started PID 31634 accept()ed connection PID 31635 started PID 31636 started PID 31635 accept()ed connection PID 31636 accept()ed connection -more new processes, but inversely proportional to number of listening processes -accept() always returns successfully The 'close(sockfd);' looks to be racing with the accept() calls. Removing seems to get the result you are looking for. Interesting. That works, but it shouldn't. The close() is operating in the parent, so it shouldn't affect the child, there is a leak here of process separation. You fork, then close sockfd in the parent. Thus, the very first child can accept connectins just fine. Subsequent forks give you children without sockfd, thus accept fails. The first child continues to work just fine. Now I feel like an idiot. -- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] update consumers of MSG_MORE to recognize MSG_SENDPAGE_NOTLAST
On Mon, Nov 25, 2013 at 7:36 AM, Shawn Landden sh...@churchofgit.com wrote: Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST, similar to MSG_MORE. algif_hash, algif_skcipher, and udp used MSG_MORE from tcp_sendpages() and need to see the new flag as identical to MSG_MORE. This fixes sendfile() on AF_ALG. v3: also fix udp Cc: Tom Herbert therb...@google.com Cc: Eric Dumazet eric.duma...@gmail.com Cc: David S. Miller da...@davemloft.net Cc: sta...@vger.kernel.org # 3.4.x + 3.2.x Reported-and-tested-by: Shawn Landden shawnland...@gmail.com Original-patch: Richard Weinberger rich...@nod.at Signed-off-by: Shawn Landden sh...@churchofgit.com May I ask why you took over the my patch without even CC'in me nor replying to the original thread [PATCH] pipe_to_sendpage: Ensure that MSG_MORE is set if we set MSG_SENDPAGE_NOTLAST? You are acting very rude. Not CCing you was an oversight. The discussion at the original thread is not done. Does skcipher_sendpage() really also need fixing? or UDP? UDP needs it or it will send out packets mid-sendfile. skcipher needs it or it will produce incorrect output like hash. I didn't send another patch because I'm waiting for Eric's answer first. Eric forgot to update internal consumers of MSG_MORE when he created this distinction in sendpage. That became clear when he first responded. Changing this where the consumers are makes sense. I just want this bug fixed, feel free to send your next version. Thanks, //richard -Shawn -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] update consumers of MSG_MORE to recognize MSG_SENDPAGE_NOTLAST
Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST, similar to MSG_MORE. algif_hash, algif_skcipher, and udp used MSG_MORE from tcp_sendpages() and need to see the new flag as identical to MSG_MORE. This fixes sendfile() on AF_ALG. v3: also fix udp Cc: Tom Herbert Cc: Eric Dumazet Cc: David S. Miller Cc: # 3.4.x + 3.2.x Reported-and-tested-by: Shawn Landden Original-patch: Richard Weinberger Signed-off-by: Shawn Landden --- crypto/algif_hash.c | 3 +++ crypto/algif_skcipher.c | 3 +++ net/ipv4/udp.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index ef5356c..8502462 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page, struct hash_ctx *ctx = ask->private; int err; + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); sg_init_table(ctx->sgl.sg, 1); sg_set_page(ctx->sgl.sg, page, size, offset); diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 6a6dfc0..a19c027 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -378,6 +378,9 @@ static ssize_t skcipher_sendpage(struct socket *sock, struct page *page, struct skcipher_sg_list *sgl; int err = -EINVAL; + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); if (!ctx->more && ctx->used) goto unlock; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5944d7d..8bd04df 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1098,6 +1098,9 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, struct udp_sock *up = udp_sk(sk); int ret; + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + if (!up->pending) { struct msghdr msg = { .msg_flags = flags|MSG_MORE }; -- 1.8.4.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] update consumers of MSG_MORE to recognize MSG_SENDPAGE_NOTLAST
Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST, similar to MSG_MORE. algif_hash and algif_skcipher used MSG_MORE from tcp_sendpages() and need to see the new flag as identical to MSG_MORE. This fixes sendfile() on AF_ALG. Cc: Tom Herbert Cc: Eric Dumazet Cc: David S. Miller Cc: # 3.4.x + 3.2.x Reported-and-tested-by: Shawn Landden Original-patch: Richard Weinberger Signed-off-by: Shawn Landden --- crypto/algif_hash.c | 3 +++ crypto/algif_skcipher.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index ef5356c..8502462 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page, struct hash_ctx *ctx = ask->private; int err; + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); sg_init_table(ctx->sgl.sg, 1); sg_set_page(ctx->sgl.sg, page, size, offset); diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 6a6dfc0..a19c027 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -378,6 +378,9 @@ static ssize_t skcipher_sendpage(struct socket *sock, struct page *page, struct skcipher_sg_list *sgl; int err = -EINVAL; + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); if (!ctx->more && ctx->used) goto unlock; -- 1.8.4.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST, similar to MSG_MORE.
algif_hash and algif_skcipher used MSG_MORE from tcp_sendpages() and need to see the new flag as identical to MSG_MORE. This fixes sendfile() on AF_ALG. Cc: Tom Herbert Cc: Eric Dumazet Cc: David S. Miller Cc: # 3.4.x + 3.2.x Reported-and-tested-by: Shawn Landden Original-patch: Richard Weinberger Signed-off-by: Shawn Landden --- crypto/algif_hash.c | 3 +++ crypto/algif_skcipher.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index ef5356c..8502462 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page, struct hash_ctx *ctx = ask->private; int err; + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); sg_init_table(ctx->sgl.sg, 1); sg_set_page(ctx->sgl.sg, page, size, offset); diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 6a6dfc0..a19c027 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -378,6 +378,9 @@ static ssize_t skcipher_sendpage(struct socket *sock, struct page *page, struct skcipher_sg_list *sgl; int err = -EINVAL; + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); if (!ctx->more && ctx->used) goto unlock; -- 1.8.4.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] pipe_to_sendpage: Ensure that MSG_MORE is set if we set MSG_SENDPAGE_NOTLAST
On Sun, Nov 24, 2013 at 5:25 PM, Eric Dumazet wrote: > On Mon, 2013-11-25 at 00:42 +0100, Richard Weinberger wrote: >> Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) >> added an internal flag MSG_SENDPAGE_NOTLAST. >> We have to ensure that MSG_MORE is also set if we set MSG_SENDPAGE_NOTLAST. >> Otherwise users that check against MSG_MORE will not see it. >> >> This fixes sendfile() on AF_ALG. >> >> Cc: Tom Herbert >> Cc: Eric Dumazet >> Cc: David S. Miller >> Cc: # 3.4.x >> Reported-and-tested-by: Shawn Landden >> Signed-off-by: Richard Weinberger >> --- >> fs/splice.c | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/fs/splice.c b/fs/splice.c >> index 3b7ee65..b93f1b8 100644 >> --- a/fs/splice.c >> +++ b/fs/splice.c >> @@ -701,7 +701,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, >> more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; >> >> if (sd->len < sd->total_len && pipe->nrbufs > 1) >> - more |= MSG_SENDPAGE_NOTLAST; >> + more |= MSG_SENDPAGE_NOTLAST | MSG_MORE; >> >> return file->f_op->sendpage(file, buf->page, buf->offset, >> sd->len, , more); > > I do not think this patch is right. It looks like a revert of a useful > patch for TCP zero copy. Given the time it took to discover this > regression, I bet tcp zero copy has more users than AF_ALG, by 5 or 6 > order of magnitude ;) > > Here we want to make the difference between the two flags, not merge > them. 
> > If AF_ALG do not care of the difference, try instead : > > diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c > index ef5356cd280a..850246206b12 100644 > --- a/crypto/algif_hash.c > +++ b/crypto/algif_hash.c > @@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct > page *page, > struct hash_ctx *ctx = ask->private; > int err; > > + if (flags & MSG_SENDPAGE_NOTLAST) > + flags |= MSG_MORE; > + > lock_sock(sk); > sg_init_table(ctx->sgl.sg, 1); > sg_set_page(ctx->sgl.sg, page, size, offset); > >From my testing this works. -- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] pipe_to_sendpage: Ensure that MSG_MORE is set if we set MSG_SENDPAGE_NOTLAST
On Sun, Nov 24, 2013 at 3:42 PM, Richard Weinberger wrote: > Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) > added an internal flag MSG_SENDPAGE_NOTLAST. > We have to ensure that MSG_MORE is also set if we set MSG_SENDPAGE_NOTLAST. > Otherwise users that check against MSG_MORE will not see it. > > This fixes sendfile() on AF_ALG. > > Cc: Tom Herbert > Cc: Eric Dumazet > Cc: David S. Miller > Cc: # 3.4.x The offending commit also got backported to the 3.2 stable kernel, so we need this fix there as well. --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: AF_ALG buggy with sendfile
If you build https://kernel.googlesource.com/pub/scm/network/connman/connman/+/0.80/tools/alg-test.c from the connman source code and compare the output to coreutils sha1sum you can see the problem. shawn@debian-T61:~/git/test$ make connman_afalg cc connman_afalg.c -o connman_afalg shawn@debian-T61:~/git/test$ ./connman_afalg /bin/true send 27080 bytes recv 20 bytes 45384483cf9cd0d82eba164131795b4807c6d39d /bin/true shawn@debian-T61:~/git/test$ sha1sum /bin/true 82667ba2ec681d8e55b0ee3b6db2970f9911680d /bin/true On Sun, Nov 24, 2013 at 2:00 PM, Shawn Landden wrote: > heres a version of the test case that builds. > > Sorry about that. > > On Sun, Nov 24, 2013 at 9:21 AM, Shawn Landden wrote: >> If I use sendfile() to send to a accept()ed AF_ALG socket set up for >> "hash", I get the wrong >> answer, if I read() and then write() I get the right answer. None of >> the system calls return an error. >> >> test case attached. >> >> -- >> >> --- >> Shawn Landden >> +1 360 389 3001 (SMS preferred) > > > > -- > > --- > Shawn Landden > +1 360 389 3001 (SMS preferred) -- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: AF_ALG buggy with sendfile
heres a version of the test case that builds. Sorry about that. On Sun, Nov 24, 2013 at 9:21 AM, Shawn Landden wrote: > If I use sendfile() to send to a accept()ed AF_ALG socket set up for > "hash", I get the wrong > answer, if I read() and then write() I get the right answer. None of > the system calls return an error. > > test case attached. > > -- > > --- > Shawn Landden > +1 360 389 3001 (SMS preferred) -- --- Shawn Landden +1 360 389 3001 (SMS preferred) #include #include #include #include #include #include #include #include int main(void) { int opfd; int tfmfd; struct sockaddr_alg sa = { .salg_family = AF_ALG, .salg_type = "hash", .salg_name = "sha1" }; char *buf2; char buf[20]; int i; struct stat st; ssize_t size; tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(tfmfd, (struct sockaddr *), sizeof(sa)); opfd = accept(tfmfd, NULL, 0); int t = open("/bin/true", O_RDONLY); fstat(t, ); size = sendfile(opfd, t, NULL, st.st_size); if (size != st.st_size) exit(1); read(opfd, , 20); for (i = 0; i < 20; i++) { printf("%02x", (unsigned char)buf[i]); } printf("\n"); lseek(t, 0, SEEK_SET); buf2 = malloc(st.st_size + 1); read(t, buf2, st.st_size); write(opfd, buf2, st.st_size); read(opfd, , 20); for (i = 0; i < 20; i++) { printf("%02x", (unsigned char)buf[i]); } printf("\n"); close(opfd); close(tfmfd); return 0; }
AF_ALG buggy with sendfile
If I use sendfile() to send to a accept()ed AF_ALG socket set up for "hash", I get the wrong answer, if I read() and then write() I get the right answer. None of the system calls return an error. test case attached. -- --- Shawn Landden +1 360 389 3001 (SMS preferred) #include #include #include #include #include #include int main(void) { int opfd; int tfmfd; struct sockaddr_alg sa = { .salg_family = AF_ALG, .salg_type = "hash", .salg_name = "sha1" }; char buf2[1000]; char buf[20]; int i; struct stat st; tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(tfmfd, (struct sockaddr *), sizeof(sa)); opfd = accept(tfmfd, NULL, 0); int true = open("/bin/true", O_RDONLY); fstat(true, ); sendfile(opfd, true, NULL, st.st_size); read(opfd, , 20); for (i = 0; i < 20; i++) { printf("%02x", (unsigned char)buf[i]); } printf("\n"); lseek(true, 0, SEEK_SET); read(true, , st.st_size); write(opfd, , st.st_size); read(opfd, , 20); for (i = 0; i < 20; i++) { printf("%02x", (unsigned char)buf[i]); } printf("\n"); close(opfd); close(tfmfd); return 0; }
AF_ALG does not work with sendfile()
If I use sendfile() to send to a accept()ed AF_ALG socket set up for "hash", I get the wrong answer, if I read() and then write() I get the right answer. None of the system calls return an error. --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
AF_ALG does not work with sendfile()
If I use sendfile() to send to a accept()ed AF_ALG socket set up for hash, I get the wrong answer, if I read() and then write() I get the right answer. None of the system calls return an error. --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
AF_ALG buggy with sendfile
If I use sendfile() to send to a accept()ed AF_ALG socket set up for hash, I get the wrong answer, if I read() and then write() I get the right answer. None of the system calls return an error. test case attached. -- --- Shawn Landden +1 360 389 3001 (SMS preferred) #include sys/sendfile.h #include sys/socket.h #include linux/if_alg.h #include stdio.h #include sys/stat.h #include fcntl.h int main(void) { int opfd; int tfmfd; struct sockaddr_alg sa = { .salg_family = AF_ALG, .salg_type = hash, .salg_name = sha1 }; char buf2[1000]; char buf[20]; int i; struct stat st; tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(tfmfd, (struct sockaddr *)sa, sizeof(sa)); opfd = accept(tfmfd, NULL, 0); int true = open(/bin/true, O_RDONLY); fstat(true, st); sendfile(opfd, true, NULL, st.st_size); read(opfd, buf, 20); for (i = 0; i 20; i++) { printf(%02x, (unsigned char)buf[i]); } printf(\n); lseek(true, 0, SEEK_SET); read(true, buf2, st.st_size); write(opfd, buf2, st.st_size); read(opfd, buf, 20); for (i = 0; i 20; i++) { printf(%02x, (unsigned char)buf[i]); } printf(\n); close(opfd); close(tfmfd); return 0; }
Re: AF_ALG buggy with sendfile
heres a version of the test case that builds. Sorry about that. On Sun, Nov 24, 2013 at 9:21 AM, Shawn Landden shawnland...@gmail.com wrote: If I use sendfile() to send to a accept()ed AF_ALG socket set up for hash, I get the wrong answer, if I read() and then write() I get the right answer. None of the system calls return an error. test case attached. -- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- --- Shawn Landden +1 360 389 3001 (SMS preferred) #include sys/sendfile.h #include sys/socket.h #include linux/if_alg.h #include stdio.h #include sys/stat.h #include fcntl.h #include stdlib.h #include unistd.h int main(void) { int opfd; int tfmfd; struct sockaddr_alg sa = { .salg_family = AF_ALG, .salg_type = hash, .salg_name = sha1 }; char *buf2; char buf[20]; int i; struct stat st; ssize_t size; tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(tfmfd, (struct sockaddr *)sa, sizeof(sa)); opfd = accept(tfmfd, NULL, 0); int t = open(/bin/true, O_RDONLY); fstat(t, st); size = sendfile(opfd, t, NULL, st.st_size); if (size != st.st_size) exit(1); read(opfd, buf, 20); for (i = 0; i 20; i++) { printf(%02x, (unsigned char)buf[i]); } printf(\n); lseek(t, 0, SEEK_SET); buf2 = malloc(st.st_size + 1); read(t, buf2, st.st_size); write(opfd, buf2, st.st_size); read(opfd, buf, 20); for (i = 0; i 20; i++) { printf(%02x, (unsigned char)buf[i]); } printf(\n); close(opfd); close(tfmfd); return 0; }
Re: AF_ALG buggy with sendfile
If you build https://kernel.googlesource.com/pub/scm/network/connman/connman/+/0.80/tools/alg-test.c from the connman source code and compare the output to coreutils sha1sum you can see the problem. shawn@debian-T61:~/git/test$ make connman_afalg cc connman_afalg.c -o connman_afalg shawn@debian-T61:~/git/test$ ./connman_afalg /bin/true send 27080 bytes recv 20 bytes 45384483cf9cd0d82eba164131795b4807c6d39d /bin/true shawn@debian-T61:~/git/test$ sha1sum /bin/true 82667ba2ec681d8e55b0ee3b6db2970f9911680d /bin/true On Sun, Nov 24, 2013 at 2:00 PM, Shawn Landden shawnland...@gmail.com wrote: heres a version of the test case that builds. Sorry about that. On Sun, Nov 24, 2013 at 9:21 AM, Shawn Landden shawnland...@gmail.com wrote: If I use sendfile() to send to a accept()ed AF_ALG socket set up for hash, I get the wrong answer, if I read() and then write() I get the right answer. None of the system calls return an error. test case attached. -- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] pipe_to_sendpage: Ensure that MSG_MORE is set if we set MSG_SENDPAGE_NOTLAST
On Sun, Nov 24, 2013 at 3:42 PM, Richard Weinberger rich...@nod.at wrote: Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST. We have to ensure that MSG_MORE is also set if we set MSG_SENDPAGE_NOTLAST. Otherwise users that check against MSG_MORE will not see it. This fixes sendfile() on AF_ALG. Cc: Tom Herbert therb...@google.com Cc: Eric Dumazet eric.duma...@gmail.com Cc: David S. Miller da...@davemloft.net Cc: sta...@vger.kernel.org # 3.4.x The offending commit also got backported to the 3.2 stable kernel, so we need this fix there as well. --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] pipe_to_sendpage: Ensure that MSG_MORE is set if we set MSG_SENDPAGE_NOTLAST
On Sun, Nov 24, 2013 at 5:25 PM, Eric Dumazet eric.duma...@gmail.com wrote: On Mon, 2013-11-25 at 00:42 +0100, Richard Weinberger wrote: Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST. We have to ensure that MSG_MORE is also set if we set MSG_SENDPAGE_NOTLAST. Otherwise users that check against MSG_MORE will not see it. This fixes sendfile() on AF_ALG. Cc: Tom Herbert therb...@google.com Cc: Eric Dumazet eric.duma...@gmail.com Cc: David S. Miller da...@davemloft.net Cc: sta...@vger.kernel.org # 3.4.x Reported-and-tested-by: Shawn Landden shawnland...@gmail.com Signed-off-by: Richard Weinberger rich...@nod.at --- fs/splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index 3b7ee65..b93f1b8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -701,7 +701,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, more = (sd-flags SPLICE_F_MORE) ? MSG_MORE : 0; if (sd-len sd-total_len pipe-nrbufs 1) - more |= MSG_SENDPAGE_NOTLAST; + more |= MSG_SENDPAGE_NOTLAST | MSG_MORE; return file-f_op-sendpage(file, buf-page, buf-offset, sd-len, pos, more); I do not think this patch is right. It looks like a revert of a useful patch for TCP zero copy. Given the time it took to discover this regression, I bet tcp zero copy has more users than AF_ALG, by 5 or 6 order of magnitude ;) Here we want to make the difference between the two flags, not merge them. If AF_ALG do not care of the difference, try instead : diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index ef5356cd280a..850246206b12 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page, struct hash_ctx *ctx = ask-private; int err; + if (flags MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); sg_init_table(ctx-sgl.sg, 1); sg_set_page(ctx-sgl.sg, page, size, offset); From my testing this works. 
-- --- Shawn Landden +1 360 389 3001 (SMS preferred) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST, similar to MSG_MORE.
algif_hash and algif_skcipher used MSG_MORE from tcp_sendpages() and need to see the new flag as identical to MSG_MORE. This fixes sendfile() on AF_ALG. Cc: Tom Herbert therb...@google.com Cc: Eric Dumazet eric.duma...@gmail.com Cc: David S. Miller da...@davemloft.net Cc: sta...@vger.kernel.org # 3.4.x + 3.2.x Reported-and-tested-by: Shawn Landden shawnland...@gmail.com Original-patch: Richard Weinberger rich...@nod.at Signed-off-by: Shawn Landden sh...@churchofgit.com --- crypto/algif_hash.c | 3 +++ crypto/algif_skcipher.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index ef5356c..8502462 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page, struct hash_ctx *ctx = ask-private; int err; + if (flags MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); sg_init_table(ctx-sgl.sg, 1); sg_set_page(ctx-sgl.sg, page, size, offset); diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 6a6dfc0..a19c027 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -378,6 +378,9 @@ static ssize_t skcipher_sendpage(struct socket *sock, struct page *page, struct skcipher_sg_list *sgl; int err = -EINVAL; + if (flags MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); if (!ctx-more ctx-used) goto unlock; -- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] update consumers of MSG_MORE to recognize MSG_SENDPAGE_NOTLAST
Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST, similar to MSG_MORE. algif_hash and algif_skcipher used MSG_MORE from tcp_sendpages() and need to see the new flag as identical to MSG_MORE. This fixes sendfile() on AF_ALG. Cc: Tom Herbert therb...@google.com Cc: Eric Dumazet eric.duma...@gmail.com Cc: David S. Miller da...@davemloft.net Cc: sta...@vger.kernel.org # 3.4.x + 3.2.x Reported-and-tested-by: Shawn Landden shawnland...@gmail.com Original-patch: Richard Weinberger rich...@nod.at Signed-off-by: Shawn Landden sh...@churchofgit.com --- crypto/algif_hash.c | 3 +++ crypto/algif_skcipher.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index ef5356c..8502462 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page, struct hash_ctx *ctx = ask-private; int err; + if (flags MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); sg_init_table(ctx-sgl.sg, 1); sg_set_page(ctx-sgl.sg, page, size, offset); diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 6a6dfc0..a19c027 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -378,6 +378,9 @@ static ssize_t skcipher_sendpage(struct socket *sock, struct page *page, struct skcipher_sg_list *sgl; int err = -EINVAL; + if (flags MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); if (!ctx-more ctx-used) goto unlock; -- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] update consumers of MSG_MORE to recognize MSG_SENDPAGE_NOTLAST
Commit 35f9c09fe (tcp: tcp_sendpages() should call tcp_push() once) added an internal flag MSG_SENDPAGE_NOTLAST, similar to MSG_MORE. algif_hash, algif_skcipher, and udp used MSG_MORE from tcp_sendpages() and need to see the new flag as identical to MSG_MORE. This fixes sendfile() on AF_ALG. v3: also fix udp Cc: Tom Herbert therb...@google.com Cc: Eric Dumazet eric.duma...@gmail.com Cc: David S. Miller da...@davemloft.net Cc: sta...@vger.kernel.org # 3.4.x + 3.2.x Reported-and-tested-by: Shawn Landden shawnland...@gmail.com Original-patch: Richard Weinberger rich...@nod.at Signed-off-by: Shawn Landden sh...@churchofgit.com --- crypto/algif_hash.c | 3 +++ crypto/algif_skcipher.c | 3 +++ net/ipv4/udp.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index ef5356c..8502462 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -114,6 +114,9 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page, struct hash_ctx *ctx = ask-private; int err; + if (flags MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); sg_init_table(ctx-sgl.sg, 1); sg_set_page(ctx-sgl.sg, page, size, offset); diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 6a6dfc0..a19c027 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -378,6 +378,9 @@ static ssize_t skcipher_sendpage(struct socket *sock, struct page *page, struct skcipher_sg_list *sgl; int err = -EINVAL; + if (flags MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + lock_sock(sk); if (!ctx-more ctx-used) goto unlock; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5944d7d..8bd04df 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1098,6 +1098,9 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, struct udp_sock *up = udp_sk(sk); int ret; + if (flags MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + if (!up-pending) { struct msghdr msg = { .msg_flags = flags|MSG_MORE }; -- 1.8.4.4 -- To unsubscribe from this list: send 
the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
disappearing listen()ed SO_REUSEPORT sockets across fork() when using epoll
Hello, when running the attached program on 3.12 child processes are missing a socket fd opened, set with SO_REUSEPORT, listen()ed to, and added to epoll_ctl(). This is the output I get when pointing "wget http://localhost:/; at the attached program: main PID 31591 PID 31634 started PID 31634 accept()ed connection PID 31635 started PID 31636 started PID 31635 accept() failed: Bad file descriptor PID 31636 accept() failed: Bad file descriptor PID 31634 accept()ed connection PID 31634 accept()ed connection PID 31634 accept()ed connection PID 31634 accept()ed connection While I would expect something like: main PID 31591 PID 31634 started PID 31634 accept()ed connection PID 31635 started PID 31636 started PID 31635 accept()ed connection PID 31636 accept()ed connection -more new processes, but inversely proportional to number of listening processes -accept() always returns successfully -- --- Shawn Landden +1 360 389 3001 (SMS preferred) #include #include #include #include #include #include #include int main( int argc, char *argv[]) { int sockfd, epollfd, acceptfd, portno; struct epoll_event event = {EPOLLIN, NULL}, gotevent; char buffer[256]; struct sockaddr_in serv_addr; int n; pid_t pid; printf("main PID %d\n", getpid()); memset((char *) _addr, 0, sizeof(serv_addr)); portno = ; serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = INADDR_ANY; serv_addr.sin_port = htons(portno); if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) return printf("socket() failed: %m\n"); int optval = 1; if ((setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, , sizeof(optval))) < 0) return printf("setsockopt() failed: %m"); if (bind(sockfd, (struct sockaddr *) _addr, sizeof(serv_addr)) < 0) return printf("bind() failed: %m\n"); if (listen(sockfd, SOMAXCONN) < 0) return printf("listen() failed: %m\n"); if ((epollfd = epoll_create1(0)) < 0) return printf("epoll_create1() failed: %m\n"); if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, ) < 0) return printf("epoll_ctl() failed: %m\n"); while 
(1) { if (epoll_wait(epollfd, , 1, -1) != 1) return printf("epoll_wait() failed: %m\n"); pid = fork(); if (pid == 0) { printf("PID %d started\n", getpid()); while(1) { struct sockaddr_in cli_addr; socklen_t cli_size = sizeof cli_addr; if ((acceptfd = accept(sockfd, (struct sockaddr *)_addr, _size)) < 0) { printf("PID %d accept() failed: %m\n", getpid()); sleep(60); return 1; } printf("PID %d accept()ed connection\n", getpid()); if (close(acceptfd) < 0) return printf("close() failed: %m\n"); } } close(sockfd); } }
disappearing listen()ed SO_REUSEPORT sockets across fork() when using epoll
Hello, when running the attached program on 3.12 child processes are missing a socket fd opened, set with SO_REUSEPORT, listen()ed to, and added to epoll_ctl(). This is the output I get when pointing wget http://localhost:/; at the attached program: main PID 31591 PID 31634 started PID 31634 accept()ed connection PID 31635 started PID 31636 started PID 31635 accept() failed: Bad file descriptor PID 31636 accept() failed: Bad file descriptor PID 31634 accept()ed connection PID 31634 accept()ed connection PID 31634 accept()ed connection PID 31634 accept()ed connection While I would expect something like: main PID 31591 PID 31634 started PID 31634 accept()ed connection PID 31635 started PID 31636 started PID 31635 accept()ed connection PID 31636 accept()ed connection -more new processes, but inversely proportional to number of listening processes -accept() always returns successfully -- --- Shawn Landden +1 360 389 3001 (SMS preferred) #include stdio.h #include sys/types.h #include sys/socket.h #include netinet/in.h #include stdlib.h #include string.h #include sys/epoll.h int main( int argc, char *argv[]) { int sockfd, epollfd, acceptfd, portno; struct epoll_event event = {EPOLLIN, NULL}, gotevent; char buffer[256]; struct sockaddr_in serv_addr; int n; pid_t pid; printf(main PID %d\n, getpid()); memset((char *) serv_addr, 0, sizeof(serv_addr)); portno = ; serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = INADDR_ANY; serv_addr.sin_port = htons(portno); if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) 0) return printf(socket() failed: %m\n); int optval = 1; if ((setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, optval, sizeof(optval))) 0) return printf(setsockopt() failed: %m); if (bind(sockfd, (struct sockaddr *) serv_addr, sizeof(serv_addr)) 0) return printf(bind() failed: %m\n); if (listen(sockfd, SOMAXCONN) 0) return printf(listen() failed: %m\n); if ((epollfd = epoll_create1(0)) 0) return printf(epoll_create1() failed: %m\n); if (epoll_ctl(epollfd, 
EPOLL_CTL_ADD, sockfd, event) 0) return printf(epoll_ctl() failed: %m\n); while (1) { if (epoll_wait(epollfd, gotevent, 1, -1) != 1) return printf(epoll_wait() failed: %m\n); pid = fork(); if (pid == 0) { printf(PID %d started\n, getpid()); while(1) { struct sockaddr_in cli_addr; socklen_t cli_size = sizeof cli_addr; if ((acceptfd = accept(sockfd, (struct sockaddr *)cli_addr, cli_size)) 0) { printf(PID %d accept() failed: %m\n, getpid()); sleep(60); return 1; } printf(PID %d accept()ed connection\n, getpid()); if (close(acceptfd) 0) return printf(close() failed: %m\n); } } close(sockfd); } }