cpuset cleanup race
I came across a problem with code which uses a cpuset CG and tries to be responsible and clean up after itself. The code attached at the bottom illustrates the problem. It's only long because it has no dependencies aside from the basic runtime and should work on all machines. You need to run it with privileges high enough to create a CG. The code is really simple: - a (new) CG in cpuset is created - one of the cores of the root cpuset is selected - the thread (and therefore entire process) is switched to the cpuset - a thread is created which does nothing but terminate immediately - the parent waits for the thread - then the parent removes itself from the cpuset - finally the parent tries to remove the created cpuset The last part is where things go wrong. Usually* the rmdir() call made to remove the cpuset fails because the cpuset is still busy. The program prints the members of the cpuset CG: it's the child thread. * I wrote "usually" because slowing down the parent code will help. I.e., there is a race. Successful slowdowns I found: - compile with -fsanitize=address (seems already enough) - very short wait, e.g., 1ns (you can see this by starting the program with the parameter "wait") You might want to compile the code with optimization. It is a race, after all. The pthread_join() call made by the parent won't return until the kernel signals through the futex set up at clone() time that the thread has terminated. From the perspective of the userlevel code the thread is gone. But not all bookkeeping related to the terminated thread seems to have been finished. I didn't look at the code but I can imagine that the futex notification happens as soon as all observable aspects of the thread are gone. This is of course good to not delay the waiter. Hopefully the cgroup bookkeeping can also be moved before the notification. I tested it with a recent kernel (4.5.0-0.rc7) but I doubt it's a recent issue.
~ #include #include #include #include #include #include #include #include #include #include static void *tf(void *p) { return NULL; } int main(int argc, char *argv[]) { const char *csname = argc == 1 ? "test" : argv[1]; struct mntent *me; FILE *fp = setmntent(_PATH_MOUNTED, "r"); if (fp == NULL) error(1, errno, "cannot read mounted filesystem information"); while ((me = getmntent(fp)) != NULL) { if (strcmp(me->mnt_type, "cgroup") == 0 && hasmntopt(me, "cpuset") != NULL) break; } if (me == NULL) error(1, 0, "cpuset filesystem not mounted"); endmntent(fp); char *cshier = NULL; asprintf(&cshier, "%s/%s", me->mnt_dir, csname); if (mkdir(cshier, 0777) == 0) printf("new cpuset control group: %s\n", cshier); else if (errno != EEXIST) error(1, errno, "cannot create cpuset group %s", cshier); char *csrootmems; asprintf(&csrootmems, "%s/cpuset.mems", me->mnt_dir); fp = fopen(csrootmems, "r"); if (fp == NULL) error(1, errno, "cannot read /cpuset.mems"); char *val = NULL; size_t vallen = 0; ssize_t n = getline(&val, &vallen, fp); fclose(fp); free(csrootmems); char *testmems; asprintf(&testmems, "%s/cpuset.mems", cshier); fp = fopen(testmems, "w"); if (fp == NULL) error(1, errno, "cannot read /%s/cpuset.mems", csname); fwrite(val, n, 1, fp); fclose(fp); free(testmems); free(val); cpu_set_t cs; int first = 0; sched_getaffinity(0, sizeof(cs), &cs); while (! 
CPU_ISSET(first, &cs)) ++first; char *testcpus; asprintf(&testcpus, "%s/cpuset.cpus", cshier); fp = fopen(testcpus, "w"); if (fp == NULL) error(1, errno, "cannot write /%s/cpuset.cpus", csname); fprintf(fp, "%d", first); fclose(fp); free(testcpus); char *testtasks; asprintf(&testtasks, "%s/tasks", cshier); fp = fopen(testtasks, "w"); if (fp == NULL) error(1, errno, "cannot write /%s/tasks", csname); fprintf(fp, "%d", (int) getpid()); fclose(fp); pthread_t th; pthread_create(&th, NULL, tf, NULL); pthread_join(th, NULL); char *roottasks; asprintf(&roottasks, "%s/tasks", me->mnt_dir); fp = fopen(roottasks, "w"); if (fp == NULL) error(1, errno, "cannot write /tasks"); fprintf(fp, "%d", (int) getpid()); fclose(fp); free(roottasks); if (strcmp(csname, "wait") == 0) { struct timespec s = { 0, 1 }; nanosleep(&s, NULL); } if (rmdir(cshier) != 0) { printf("PID = %ld\nremaining = ", (long) getpid()); fp = fopen(testtasks, "r"); char *line = NULL; size_t linelen = 0; while ((n = getline(&line, &linelen, fp)) > 0) fputs(line, stdout); fclose(fp); free(line); error(1, errno, "couldn't remove cpuset %s", cshier); } free(cshier); free(testtasks); return 0; }
cpuset cleanup race
I came across a problem with code which uses a cpuset CG and tries to be responsible and clean up after itself. The code attached at the bottom illustrates the problem. It's only long because it has no dependencies aside from the basic runtime and should work on all machines. You need to run it with privileges high enough to create a CG. The code is really simple: - a (new) CG in cpuset is created - one of the cores of the root cpuset is selected - the thread (and therefore entire process) is switched to the cpuset - a thread is created which does nothing but terminate immediately - the parent waits for the thread - then the parent removes itself from the cpuset - finally the parent tries to remove the created cpuset The last part is where things go wrong. Usually* the rmdir() call made to remove the cpuset fails because the cpuset is still busy. The program prints the members of the cpuset CG: it's the child thread. * I wrote "usually" because slowing down the parent code will help. I.e., there is a race. Successful slowdowns I found: - compile with -fsanitize=address (seems already enough) - very short wait, e.g., 1ns (you can see this by starting the program with the parameter "wait") You might want to compile the code with optimization. It is a race, after all. The pthread_join() call made by the parent won't return until the kernel signals through the futex set up at clone() time that the thread has terminated. From the perspective of the userlevel code the thread is gone. But not all bookkeeping related to the terminated thread seems to have been finished. I didn't look at the code but I can imagine that the futex notification happens as soon as all observable aspects of the thread are gone. This is of course good to not delay the waiter. Hopefully the cgroup bookkeeping can also be moved before the notification. I tested it with a recent kernel (4.5.0-0.rc7) but I doubt it's a recent issue.
~ #include #include #include #include #include #include #include #include #include #include static void *tf(void *p) { return NULL; } int main(int argc, char *argv[]) { const char *csname = argc == 1 ? "test" : argv[1]; struct mntent *me; FILE *fp = setmntent(_PATH_MOUNTED, "r"); if (fp == NULL) error(1, errno, "cannot read mounted filesystem information"); while ((me = getmntent(fp)) != NULL) { if (strcmp(me->mnt_type, "cgroup") == 0 && hasmntopt(me, "cpuset") != NULL) break; } if (me == NULL) error(1, 0, "cpuset filesystem not mounted"); endmntent(fp); char *cshier = NULL; asprintf(&cshier, "%s/%s", me->mnt_dir, csname); if (mkdir(cshier, 0777) == 0) printf("new cpuset control group: %s\n", cshier); else if (errno != EEXIST) error(1, errno, "cannot create cpuset group %s", cshier); char *csrootmems; asprintf(&csrootmems, "%s/cpuset.mems", me->mnt_dir); fp = fopen(csrootmems, "r"); if (fp == NULL) error(1, errno, "cannot read /cpuset.mems"); char *val = NULL; size_t vallen = 0; ssize_t n = getline(&val, &vallen, fp); fclose(fp); free(csrootmems); char *testmems; asprintf(&testmems, "%s/cpuset.mems", cshier); fp = fopen(testmems, "w"); if (fp == NULL) error(1, errno, "cannot read /%s/cpuset.mems", csname); fwrite(val, n, 1, fp); fclose(fp); free(testmems); free(val); cpu_set_t cs; int first = 0; sched_getaffinity(0, sizeof(cs), &cs); while (! 
CPU_ISSET(first, &cs)) ++first; char *testcpus; asprintf(&testcpus, "%s/cpuset.cpus", cshier); fp = fopen(testcpus, "w"); if (fp == NULL) error(1, errno, "cannot write /%s/cpuset.cpus", csname); fprintf(fp, "%d", first); fclose(fp); free(testcpus); char *testtasks; asprintf(&testtasks, "%s/tasks", cshier); fp = fopen(testtasks, "w"); if (fp == NULL) error(1, errno, "cannot write /%s/tasks", csname); fprintf(fp, "%d", (int) getpid()); fclose(fp); pthread_t th; pthread_create(&th, NULL, tf, NULL); pthread_join(th, NULL); char *roottasks; asprintf(&roottasks, "%s/tasks", me->mnt_dir); fp = fopen(roottasks, "w"); if (fp == NULL) error(1, errno, "cannot write /tasks"); fprintf(fp, "%d", (int) getpid()); fclose(fp); free(roottasks); if (strcmp(csname, "wait") == 0) { struct timespec s = { 0, 1 }; nanosleep(&s, NULL); } if (rmdir(cshier) != 0) { printf("PID = %ld\nremaining = ", (long) getpid()); fp = fopen(testtasks, "r"); char *line = NULL; size_t linelen = 0; while ((n = getline(&line, &linelen, fp)) > 0) fputs(line, stdout); fclose(fp); free(line); error(1, errno, "couldn't remove cpuset %s", cshier); } free(cshier); free(testtasks); return 0; }
Re: NUMA node information for pages
On Mon, Mar 31, 2014 at 9:24 PM, Naoya Horiguchi wrote: > The information about "pfn-node" mapping seldom (or never) changes after boot, > so it seems better to me that adding a new interface somewhere under > /sys/devices/system/node/nodeN which shows pfn range of a given node. > If this doesn't work for your usecase, could you explain more about how you > use this information? I have no problem with that type of interface. It'll be more work figuring out the details since the interface I proposed is trivial and mimics that of kpageflags etc but that's manageable. I'll see whether I can figure out the necessary details. I imagine that if the PFN are indeed always clustered for each node then, as David proposes, text output like PFNSTART PFNSTOP in a file below /sys/devices/system/node/nodeN should be sufficient. How does memory hot plug work in this situation? If the PFNs are allocated dense at startup then there might potentially be many ranges for each node. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: NUMA node information for pages
On Mon, Mar 31, 2014 at 9:24 PM, Naoya Horiguchi n-horigu...@ah.jp.nec.com wrote: The information about pfn-node mapping seldom (or never) changes after boot, so it seems better to me that adding a new interface somewhere under /sys/devices/system/node/nodeN which shows pfn range of a given node. If this doesn't work for your usecase, could you explain more about how you use this information? I have no problem with that type of interface. It'll be more work figuring out the details since the interface I proposed is trivial and mimics that of kpageflags etc but that's manageable. I'll see whether I can figure out the necessary details. I imagine that if the PFN are indeed always clustered for each node then, as David proposes, text output like PFNSTART PFNSTOP in a file below /sys/devices/system/node/nodeN should be sufficient. How does memory hot plug work in this situation? If the PFNs are allocated dense at startup then there might potentially be many ranges for each node. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
NUMA node information for pages
I might be missing something but I couldn't find a way to use the pagemap information to then look up the NUMA node the respective page is located on. Especially when analyzing anomalies this is really useful. The /proc/kpageflags and /proc/kpagecount files don't have that information. If this is correct, could the attached patch be considered? It's really simple and follows the same line as the kpageflags file. Signed-off-by: Ulrich Drepper Documentation/vm/pagemap.txt |3 ++ fs/proc/page.c | 50 +++ 2 files changed, 53 insertions(+) diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 5948e45..413b34c 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -34,6 +34,9 @@ There are three components to pagemap: * /proc/kpagecount. This file contains a 64-bit count of the number of times each page is mapped, indexed by PFN. + * /proc/kpagenode. This file contains a 32-bit number of the NUMA node + each page is mapped on. + * /proc/kpageflags. This file contains a 64-bit set of flags for each page, indexed by PFN. diff --git a/fs/proc/page.c b/fs/proc/page.c index e647c55..65bea9f 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -15,6 +15,9 @@ #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) +#define KNIDSIZE sizeof(s32) +#define KNIDMASK (KNIDSIZE - 1) + /* /proc/kpagecount - an array exposing page counts * * Each entry is a u64 representing the corresponding @@ -212,10 +215,57 @@ static const struct file_operations proc_kpageflags_operations = { .read = kpageflags_read, }; +/* /proc/kpagenode - an array exposing node information for pages + * + * Each entry is a s32 representing the corresponding + * physical page flags. 
+ */ + +static ssize_t kpagenode_read(struct file *file, char __user *buf, +size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + unsigned long src = *ppos; + unsigned long pfn = src / KNIDSIZE; + ssize_t ret = 0; + + count = min_t(unsigned long, count, (max_pfn * KNIDSIZE) - src); + if (src & KNIDSIZE || count & KNIDMASK) + return -EINVAL; + + while (count > 0) { + int nid; + if (pfn_valid(pfn)) + nid = pfn_to_nid(pfn); + else + nid = -1; + + if (put_user(nid, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KNIDSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagenode_operations = { + .llseek = mem_lseek, + .read = kpagenode_read, +}; + static int __init proc_page_init(void) { proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + proc_create("kpagenode", S_IRUSR, NULL, &proc_kpagenode_operations); return 0; } fs_initcall(proc_page_init); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
NUMA node information for pages
I might be missing something but I couldn't find a way to use the pagemap information to then look up the NUMA node the respective page is located on. Especially when analyzing anomalies this is really useful. The /proc/kpageflags and /proc/kpagecount files don't have that information. If this is correct, could the attached patch be considered? It's really simple and follows the same line as the kpageflags file. Signed-off-by: Ulrich Drepper drep...@gmail.com Documentation/vm/pagemap.txt |3 ++ fs/proc/page.c | 50 +++ 2 files changed, 53 insertions(+) diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 5948e45..413b34c 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -34,6 +34,9 @@ There are three components to pagemap: * /proc/kpagecount. This file contains a 64-bit count of the number of times each page is mapped, indexed by PFN. + * /proc/kpagenode. This file contains a 32-bit number of the NUMA node + each page is mapped on. + * /proc/kpageflags. This file contains a 64-bit set of flags for each page, indexed by PFN. diff --git a/fs/proc/page.c b/fs/proc/page.c index e647c55..65bea9f 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -15,6 +15,9 @@ #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) +#define KNIDSIZE sizeof(s32) +#define KNIDMASK (KNIDSIZE - 1) + /* /proc/kpagecount - an array exposing page counts * * Each entry is a u64 representing the corresponding @@ -212,10 +215,57 @@ static const struct file_operations proc_kpageflags_operations = { .read = kpageflags_read, }; +/* /proc/kpagenode - an array exposing node information for pages + * + * Each entry is a s32 representing the corresponding + * physical page flags. 
+ */ + +static ssize_t kpagenode_read(struct file *file, char __user *buf, +size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + unsigned long src = *ppos; + unsigned long pfn = src / KNIDSIZE; + ssize_t ret = 0; + + count = min_t(unsigned long, count, (max_pfn * KNIDSIZE) - src); + if (src & KNIDSIZE || count & KNIDMASK) + return -EINVAL; + + while (count > 0) { + int nid; + if (pfn_valid(pfn)) + nid = pfn_to_nid(pfn); + else + nid = -1; + + if (put_user(nid, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KNIDSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagenode_operations = { + .llseek = mem_lseek, + .read = kpagenode_read, +}; + static int __init proc_page_init(void) { proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + proc_create("kpagenode", S_IRUSR, NULL, &proc_kpagenode_operations); return 0; } fs_initcall(proc_page_init); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC] apparently broken RLIMIT_CORE
On Sun, Oct 6, 2013 at 4:42 PM, Linus Torvalds wrote: > I doubt it is intentional, but I also cannot really feel that we care > deeply. Afaik we don't really honor the size limit exactly anyway, ie > we tend to check only at page boundaries etc. So do we really care? I could imagine in the case Al brought up (a pipe as core file filter) we might want to have some assurance the limits are not breached. If it doesn't cost that much I'd say implement it precisely. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC] apparently broken RLIMIT_CORE
On Sun, Oct 6, 2013 at 4:42 PM, Linus Torvalds torva...@linux-foundation.org wrote: I doubt it is intentional, but I also cannot really feel that we care deeply. Afaik we don't really honor the size limit exactly anyway, ie we tend to check only at page boundaries etc. So do we really care? I could imagine in the case Al brought up (a pipe as core file filter) we might want to have some assurance the limits are not breached. If it doesn't cost that much I'd say implement it precisely. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] perf: remove duplicate block from Makefile
This looks like a merge error, the code is duplicated with the first copy doing something else as well. Just remove the second block. Signed-off-by: Ulrich Drepper Makefile |8 1 file changed, 8 deletions(-) Index: perf/config/Makefile === --- perf.orig/config/Makefile +++ perf/config/Makefile @@ -200,14 +200,6 @@ endif # NO_DWARF endif # NO_LIBELF -ifndef NO_LIBELF -CFLAGS += -DLIBELF_SUPPORT -FLAGS_LIBELF=$(CFLAGS) $(LDFLAGS) $(EXTLIBS) -ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y) - CFLAGS += -DLIBELF_MMAP -endif # try-cc -endif # NO_LIBELF - # There's only x86 (both 32 and 64) support for CFI unwind so far ifneq ($(ARCH),x86) NO_LIBUNWIND := 1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] perf: remove duplicate block from Makefile
This looks like a merge error, the code is duplicated with the first copy doing something else as well. Just remove the second block. Signed-off-by: Ulrich Drepper drep...@gmail.com Makefile |8 1 file changed, 8 deletions(-) Index: perf/config/Makefile === --- perf.orig/config/Makefile +++ perf/config/Makefile @@ -200,14 +200,6 @@ endif # NO_DWARF endif # NO_LIBELF -ifndef NO_LIBELF -CFLAGS += -DLIBELF_SUPPORT -FLAGS_LIBELF=$(CFLAGS) $(LDFLAGS) $(EXTLIBS) -ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y) - CFLAGS += -DLIBELF_MMAP -endif # try-cc -endif # NO_LIBELF - # There's only x86 (both 32 and 64) support for CFI unwind so far ifneq ($(ARCH),x86) NO_LIBUNWIND := 1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: sendfile and EAGAIN
On Sat, Mar 2, 2013 at 10:09 PM, Eric Dumazet wrote: > > Using non blocking IO means the sender (and the receiver) must be able > to perform several operations, as long as the whole transfert is not > finished. Certainly, and this is implemented. But the receiver never gets the rest of the data while the sender (most of the time) gets notified that everything is sent. I don't have a reduced test case yet. Hopefully I'll get to it sometime soon. For now I worked around it by not using sendfile. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: sendfile and EAGAIN
On Mon, Feb 25, 2013 at 2:22 PM, Eric Dumazet wrote: > I don't understand the issue. > > sendfile() returns -EAGAIN only if no bytes were copied to the socket. There is something wrong/unexpected/... I have a program which can use either sendfile or send. When using sendfile to transmit a large block (I've seen it with 900k) the sendfile call does not transmit everything. The receiver gets only about 600k. This is the situation when I think I've seen EAGAIN errors from sendfile but I cannot just now reproduce it. This is with sockets of AF_UNIX type. Are there any limits to take into account? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: sendfile and EAGAIN
On Mon, Feb 25, 2013 at 2:22 PM, Eric Dumazet eric.duma...@gmail.com wrote: I don't understand the issue. sendfile() returns -EAGAIN only if no bytes were copied to the socket. There is something wrong/unexpected/... I have a program which can use either sendfile or send. When using sendfile to transmit a large block (I've seen it with 900k) the sendfile call does not transmit everything. The receiver gets only about 600k. This is the situation when I think I've seen EAGAIN errors from sendfile but I cannot just now reproduce it. This is with sockets of AF_UNIX type. Are there any limits to take into account? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: sendfile and EAGAIN
On Sat, Mar 2, 2013 at 10:09 PM, Eric Dumazet eric.duma...@gmail.com wrote: Using non blocking IO means the sender (and the receiver) must be able to perform several operations, as long as the whole transfert is not finished. Certainly, and this is implemented. But the receiver never gets the rest of the data while the sender (most of the time) gets notified that everything is sent. I don't have a reduced test case yet. Hopefully I'll get to it sometime soon. For now I worked around it by not using sendfile. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] perf tool: Adding ratios support
On Wed, Jan 16, 2013 at 9:25 AM, Jiri Olsa wrote: > I was thinking having config files (global and arch specific) > comming with perf having predefined formulas. All the more reason to not mention the file name or really any source for the definition of the formula in the name, > 1) -e 'ratio/branch-rate/' # special event class > 2) -e 'ratio-branch-rate' # 'ratio-' prefix > 3) -e cpu/branch-rate/ # handled like aliases, ratio name would need to > be unique > ... ? I think 3 is the most extensible. Perhaps use the syntax used in other places. We have these :u suffixes etc. Perhaps have :r or :R or whatever. Given the other comments, we might want to avoid right away "ratio". If the mechanism is generalized it could be used to express "counter1 - counter2" for events which cannot be expressed with a single counter but are not really ratios. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] perf tool: Adding ratios support
On Tue, Jan 15, 2013 at 8:39 AM, Jiri Olsa wrote: > $ perf stat -f formula.conf:cpi kill > usage: kill [ -s signal | -p ] [ -a ] pid ... > kill -l [ signal ] I do like this proposal. The only comment I have is that perhaps the command line syntax isn't ideal. What you use above is tied to the ratios being defined in the config file. I would imagine that at least over time (for some ratios probably right away) they become available by default and don't require a config file. Also, users might want to put individualized ratio definitions in a config file which is read by default. How about the formulas becoming available whenever the config file is read. Maybe this means a few more keywords in the config file (ratio, ratio-set, ...). E.g.: ratio-set branch { events = {instructions,branch-instructions,branch-misses}:u ratio branch-rate { formula = branch-instructions / instructions desc = branch rate } ratio branch-miss-rate { formula = branch-misses / instructions desc = branch misprediction rate } ratio branch-miss-ratio{ formula = branch-misses / branch-instructions desc = branch misprediction ratio } } You get the idea. Maybe substitute "ratio" with "formula". Then allow such a ratio/formula to be used just like a normal event, perhaps with a special suffix/prefix to designate it. This should then also mark the events as part of a group so that the underlying counters are scheduled in together. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] perf tool: Adding ratios support
On Tue, Jan 15, 2013 at 8:39 AM, Jiri Olsa jo...@redhat.com wrote: $ perf stat -f formula.conf:cpi kill usage: kill [ -s signal | -p ] [ -a ] pid ... kill -l [ signal ] I do like this proposal. The only comment I have is that perhaps the command line syntax isn't ideal. What you use above is tied to the ratios being defined in the config file. I would imagine that at least over time (for some ratios probably right away) they become available by default and don't require a config file. Also, users might want to put individualized ratio definitions in a config file which is read by default. How about the formulas becoming available whenever the config file is read. Maybe this means a few more keywords in the config file (ratio, ratio-set, ...). E.g.: ratio-set branch { events = {instructions,branch-instructions,branch-misses}:u ratio branch-rate { formula = branch-instructions / instructions desc = branch rate } ratio branch-miss-rate { formula = branch-misses / instructions desc = branch misprediction rate } ratio branch-miss-ratio{ formula = branch-misses / branch-instructions desc = branch misprediction ratio } } You get the idea. Maybe substitute ratio with formula. Then allow such a ratio/formula to be used just like a normal event, perhaps with a special suffix/prefix to designate it. This should then also mark the events as part of a group so that the underlying counters are scheduled in together. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] perf tool: Adding ratios support
On Wed, Jan 16, 2013 at 9:25 AM, Jiri Olsa jo...@redhat.com wrote: I was thinking having config files (global and arch specific) comming with perf having predefined formulas. All the more reason to not mention the file name or really any source for the definition of the formula in the name, 1) -e 'ratio/branch-rate/' # special event class 2) -e 'ratio-branch-rate' # 'ratio-' prefix 3) -e cpu/branch-rate/ # handled like aliases, ratio name would need to be unique ... ? I think 3 is the most extensible. Perhaps use the syntax used in other places. We have these :u suffixes etc. Perhaps have :r or :R or whatever. Given the other comments, we might want to avoid right away ratio. If the mechanism is generalized it could be used to express counter1 - counter2 for events which cannot be expressed with a single counter but are not really ratios. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific
On Mon, Jul 23, 2012 at 5:06 PM, Kirill A. Shutemov wrote: > They are bugs. > > Let's fix strerror_r() usage. > > Signed-off-by: Kirill A. Shutemov Acked-by: Ulrich Drepper -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific
On Mon, Jul 23, 2012 at 4:31 PM, Kirill A. Shutemov wrote: > + const char *err = strerror_r(errnum, buf, buflen); > + > + if (err != buf && buflen > 0) { > + size_t len = strlen(err); > + char *c = mempcpy(buf, err, min(buflen - 1, len)); > + *c = '\0'; > + } No need to check for err == NULL. buflen == 0 is a possibility given the interface but I'd say this is an error and should be tested for at the beginning of the function and the call should fail or even abort the program. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific
On Mon, Jul 23, 2012 at 11:00 AM, Kirill A. Shutemov wrote: > The right way to fix it is to switch to XSI-compliant version. And why exactly would this be "the right way"? Just fix the use of strerror_r or use strerror_l. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific
On Mon, Jul 23, 2012 at 11:00 AM, Kirill A. Shutemov kir...@shutemov.name wrote: The right way to fix it is to switch to XSI-compliant version. And why exactly would this be the right way? Just fix the use of strerror_r or use strerror_l. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] perf: use XSI-compliant version of strerror_r() instead of GNU-specific
On Mon, Jul 23, 2012 at 4:31 PM, Kirill A. Shutemov kir...@shutemov.name wrote: + const char *err = strerror_r(errnum, buf, buflen); + + if (err != buf && buflen > 0) { + size_t len = strlen(err); + char *c = mempcpy(buf, err, min(buflen - 1, len)); + *c = '\0'; + } No need to check for err == NULL. buflen == 0 is a possibility given the interface but I'd say this is an error and should be tested for at the beginning of the function and the call should fail or even abort the program. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] perf: use XSI-compliant version of strerror_r() instead of GNU-specific
On Mon, Jul 23, 2012 at 5:06 PM, Kirill A. Shutemov kir...@shutemov.name wrote: They are bugs. Let's fix strerror_r() usage. Signed-off-by: Kirill A. Shutemov kir...@shutemov.name Acked-by: Ulrich Drepper drep...@gmail.com -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/3] perf tool: Add new event group management
On Wed, Jul 18, 2012 at 6:21 AM, Jiri Olsa wrote: > Well, I personally like the '{}' syntax more than '--group-events or > --group-reads > option in front', it feels more user friendly.. anyway, we can easily have > both ways. I like the actual visual grouping better, too. Also, it doesn't require us to define what -e E1,E2 --group-events -e E3,E4 means. Does --group-events also apply to the first parameter? > As for the group attributes and group leader sampling, I don't mind omitting > them at this point and get back to that if we find it useful in future. Just define the first event the leader. What reason is there which prevents this? I can only second what Andi wrote: just get it done quickly. This is functionality that is desperately needed. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/3] perf tool: Add new event group management
On Wed, Jul 18, 2012 at 6:21 AM, Jiri Olsa jo...@redhat.com wrote: Well, I personally like the '{}' syntax more than '--group-events or --group-reads option in front', it feels more user friendly.. anyway, we can easily have both ways. I like the actual visual grouping better, too. Also, it doesn't require us to define what -e E1,E2 --group-events -e E3,E4 means. Does --group-events also apply to the first parameter? As for the group attributes and group leader sampling, I don't mind omitting them at this point and get back to that if we find it useful in future. Just define the first event the leader. What reason is there which prevents this? I can only second what Andi wrote: just get it done quickly. This is functionality that is desperately needed. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] RUSAGE_THREAD
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Roland McGrath wrote: > +#define RUSAGE_LWP RUSAGE_THREAD /* Solaris name for same */ No need to clutter the kernel header with this, it'll be in the libc header. Aside from that: Acked-by: Ulrich Drepper <[EMAIL PROTECTED]> - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHkZbk2ijCOnn/RHQRAtohAKCyWgJsm20LSqxTznvff3LI8zplvgCgwttu 16eJFNgQXWNEk76b141uZvo= =DzhA -END PGP SIGNATURE- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] RUSAGE_THREAD
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Roland McGrath wrote: +#define RUSAGE_LWP RUSAGE_THREAD /* Solaris name for same */ No need to clutter the kernel header with this, it'll be in the libc header. Aside from that: Acked-by: Ulrich Drepper [EMAIL PROTECTED] - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHkZbk2ijCOnn/RHQRAtohAKCyWgJsm20LSqxTznvff3LI8zplvgCgwttu 16eJFNgQXWNEk76b141uZvo= =DzhA -END PGP SIGNATURE- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC] Per-thread getrusage
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Vinay Sridhar wrote: > There are two ways to implement this in the kernel: > 1) Introduce an additional parameter 'tid' to sys_getrusage() and put > code in glibc to handle getrusage() and pthread_getrusage() calls > correctly. > 2) Introduce a new system call to handle pthread_getrusage() and leave > sys_getrusage() untouched. You're doing two things at once: a) provide a way to get a thread's usage b) provide a way to get another process's/thread's usage The former is a trivial extension and I completely agree. RUSAGE_THREAD is trivial to implement and should go in ASAP. The second part isn't that easy. The first question is: do we really need this? It is a new type of interface. We have the /proc filesystem etc for programs which want to look at other process' data. Second, more importantly right now, your patch seems not to include any security support. Correct me if I'm wrong, but find_task_by_pid will always succeed, regardless of whether the calling thread belongs to another UID or not. I.e., your patch enables any process to read any other process' usage. That's a no-no. I suggest that you split the patch in two. The first should implement RUSAGE_THREAD. You'll immediately get an ACK from me for that. The second part then should introduce a way to get another process' usage. This patch should only be used initially as a starting point for discussions. You'll have to argue why it is necessary in the first place. The argument might have to do with why you want a pthread_getrusage() interface (which, btw, is a bad name since the interface is nothing like getrusage, getrusage doesn't allow requesting any other process' data). Yes, for intra-process lookups relying on /proc is no good idea. But then, I have not seen any reason so far why such an API is needed and why a thread cannot just be responsible for reading its own usage data. 
Anyway, if pthread_getrusage (or whatever it'll be called) is the only usage then the syscall should require that the TID parameter is from a thread in the same process which would solve the security problem. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHj3do2ijCOnn/RHQRAiKdAKCSooiEWcxr780hJGenElyDiWPWKgCdE+6Y j6ibmGsPT4aYxhSfpimSdiw= =jOC9 -END PGP SIGNATURE- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 H. Peter Anvin wrote: > No. > > I already said I'm not looking at changing the calling convention for > existing syscalls. I did not suggest or ask for that at all. I was asking you to consider the real implementation details for a new syscall mechanism. We do not want to abandon the use of syscall/sysenter and go back to int (on x86/x86-64). This means that you have to come up with a mechanism which hooks into the current syscall/sysenter path while preserving full backward compatibility. Now it's your turn. How do you do this without additional costs? > Hardly so, as evidenced by the fact that we have successfully done so > for 15 years already; a number of Linux architectures require this > information for the existing system calls. Nothing at this scale is there at the moment, as far as I can see. And nothing so critical for getting right. Talk is cheap. You still haven't shown one bit of design how you want to achieve your grand goal. The time for hand-waving is over. Do some work or step out of the way. Nothing you have said so far in the least convinces me and your arguments like "sys_indirect adds parameters" are not really contested. Yes, that's what sys_indirect does. So what? It does this with almost no cost which outweighs the ugliness factor in my book. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHS2gQ2ijCOnn/RHQRAlN5AKCWZQL97sROWBv33//Uj/MN+CNi3gCdFgCU uLVEOfclERpakp1kdYzy2oI= =stVB -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 H. Peter Anvin wrote: > The 6-word limit is a red herring. There is at least two ways to deal > with it (and this doesn't mean wiping the legacy stuff we already have): > > - Let each architecture pick a calling convention and redefine the > architecture-independent bits to take an arbitrary number of arguments. > This is a one-time panarchitectural change. > [...] Just think beyond wishful thinking for a moment. What does it take to come up with something completely new and grand? Let's start at the basic: you need to signal that the new syscall calling convention is used. Since the syscall entry code is limited (at least the likes of syscall/sysenter, it would be easy enough to use int $0x81 in addition to int $0x80) you would have to extend the use of the syscall number while keeping binary compatibility. This means additional costs for every single syscall. Once you're past that, how do you implement the expandable syscall parameter count? There are two ways: - - pass to the real sys_* implementations the number of provided syscall parameters and have each function figure out what this means - - dynamically construct a call to the sys_* functions where the syscall magic adds an appropriate number of parameters filled with zeros. This is quite complicated and, more importantly, it requires that you have code/data somewhere which specifies how many parameters each of the sys_* function actually requires. The actual sys_* code and the data has to be kept in sync at all times. A maintenance nightmare. The handling of syscalls with many parameters should not at all be a driver of this design at all. Syscalls shouldn't be that complicated, I completely agree with ingo. I'm perfectly willing to give you the benefit of doubt, show us a design for what you're proposing which is not slower than the current code, doesn't impact existing code, and solves the problem in a nice and clean way. 
I cannot really see it now but I might miss something. The sys_indirect approach ain't pretty but it does its job, doesn't impact performance, and is expandable in direction we *know* we will want to go very soon. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHS1X12ijCOnn/RHQRAihRAJwLNJ9fT8GTv6MAoO6RZGOub07sGgCdGBLR frXyQVB8Oh5VgWY5YJhpitg= =FuBx -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 H. Peter Anvin wrote: The 6-word limit is a red herring. There is at least two ways to deal with it (and this doesn't mean wiping the legacy stuff we already have): - Let each architecture pick a calling convention and redefine the architecture-independent bits to take an arbitrary number of arguments. This is a one-time panarchitectural change. [...] Just think beyond wishful thinking for a moment. What does it take to come up with something completely new and grand? Let's start at the basic: you need to signal that the new syscall calling convention is used. Since the syscall entry code is limited (at least the likes of syscall/sysenter, it would be easy enough to use int $0x81 in addition to int $0x80) you would have to extend the use of the syscall number while keeping binary compatibility. This means additional costs for every single syscall. Once you're past that, how do you implement the expandable syscall parameter count? There are two ways: - - pass to the real sys_* implementations the number of provided syscall parameters and have each function figure out what this means - - dynamically construct a call to the sys_* functions where the syscall magic adds an appropriate number of parameters filled with zeros. This is quite complicated and, more importantly, it requires that you have code/data somewhere which specifies how many parameters each of the sys_* function actually requires. The actual sys_* code and the data has to be kept in sync at all times. A maintenance nightmare. The handling of syscalls with many parameters should not at all be a driver of this design at all. Syscalls shouldn't be that complicated, I completely agree with ingo. I'm perfectly willing to give you the benefit of doubt, show us a design for what you're proposing which is not slower than the current code, doesn't impact existing code, and solves the problem in a nice and clean way. I cannot really see it now but I might miss something. 
The sys_indirect approach ain't pretty but it does its job, doesn't impact performance, and is expandable in direction we *know* we will want to go very soon. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHS1X12ijCOnn/RHQRAihRAJwLNJ9fT8GTv6MAoO6RZGOub07sGgCdGBLR frXyQVB8Oh5VgWY5YJhpitg= =FuBx -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 H. Peter Anvin wrote: No. I already said I'm not looking at changing the calling convention for existing syscalls. I did not suggest or ask for that at all. I was asking you to consider the real implementation details for a new syscall mechanism. We do not want to abandon the use of syscall/sysenter and go back to int (on x86/x86-64). This means that you have to come up with a mechanism which hooks into the current syscall/sysenter path while preserving full backward compatibility. Now it's your turn. How do you do this without additional costs? Hardly so, as evidenced by the fact that we have successfully done so for 15 years already; a number of Linux architectures require this information for the existing system calls. Nothing at this scale is there at the moment, as far as I can see. And nothing so critical for getting right. Talk is cheap. You still haven't shown one bit of design how you want to achieve your grand goal. The time for hand-waving is over. Do some work or step out of the way. Nothing you have said so far in the least convinces me and your arguments like sys_indirect adds parameters are not really contested. Yes, that's what sys_indirect does. So what? It does this with almost no cost which outweighs the ugliness factor in my book. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHS2gQ2ijCOnn/RHQRAlN5AKCWZQL97sROWBv33//Uj/MN+CNi3gCdFgCU uLVEOfclERpakp1kdYzy2oI= =stVB -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets
On Nov 24, 2007 12:28 AM, Eric Dumazet <[EMAIL PROTECTED]> wrote: > OK, but maybe for consistency, we might accept the two mechanisms. It's not a question of the kernel interface. The issue with all these extensions is the userlevel interface. Ideally no new userlevel interface is needed. This is the case for open() and incidentally also for this case (through the flags parameter for recvmsg). For socket(), accept(), the situation is unfortunately different and we need a new interface. With your proposed patch, we would have to introduce another recvmsg() interface to take advantage of the additional functionality. This just doesn't make any sense. This is no contest in aesthetics. You first have to think about the interface presented to the programmer at userlevel and then design the syscall interface. This is how MSG_CMSG_CLOEXEC came about. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets
On Nov 24, 2007 12:28 AM, Eric Dumazet [EMAIL PROTECTED] wrote: OK, but maybe for consistency, we might accept the two mechanisms. It's not a question of the kernel interface. The issue with all these extensions is the userlevel interface. Ideally no new userlevel interface is needed. This is the case for open() and incidentally also for this case (through the flags parameter for recvmsg). For socket(), accept(), the situation is unfortunately different and we need a new interface. With your proposed patch, we would have to introduce another recvmsg() interface to take advantage of the additional functionality. This just doesn't make any sense. This is no contest in aesthetics. You first have to think about the interface presented to the programmer at userlevel and then design the syscall interface. This is how MSG_CMSG_CLOEXEC came about. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Eric Dumazet wrote: > 1) Can the fd passing with recvmsg() on AF_UNIX also gets O_CLOEXEC > support ? Already there, see MSG_CMSG_CLOEXEC. > 2) Why this O_NONBLOCK ability is needed for sockets ? Is it a security > issue, and if yes could you remind it to me ? No security issue. But look at any correct network program, all need to set the mode to non-blocking. Adding this support to the syscall comes at almost no cost and it cuts the cost for every program down by one or two syscalls. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHR9YQ2ijCOnn/RHQRArbyAJ0d25FPg/BWmJ4YIzJKhO9iaBJNXwCgmpuX PAA6u3Dc56AlBegTRqtqJPc= =j5vi -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Where is the new timerfd?
On Nov 23, 2007 9:29 AM, Davide Libenzi <[EMAIL PROTECTED]> wrote: > Yes, it's disabled, and yes, I'll repost today ... I haven't seen the patch and don't feel like searching. So I say it here: please make sure you add a flags parameter to the system call itself (instead of adding it on as for eventfd and signalfd). We need to be able to use O_CLOEXEC some way or another. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Where is the new timerfd?
On Nov 23, 2007 9:29 AM, Davide Libenzi [EMAIL PROTECTED] wrote: Yes, it's disabled, and yes, I'll repost today ... I haven't seen the patch and don't feel like searching. So I say it here: please make sure you add a flags parameter to the system call itself (instead of adding it on as for eventfd and signalfd). We need to be able to use O_CLOEXEC some way or another. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Eric Dumazet wrote: 1) Can the fd passing with recvmsg() on AF_UNIX also gets O_CLOEXEC support ? Already there, see MSG_CMSG_CLOEXEC. 2) Why this O_NONBLOCK ability is needed for sockets ? Is it a security issue, and if yes could you remind it to me ? No security issue. But look at any correct network program, all need to set the mode to non-blocking. Adding this support to the syscall comes at almost no cost and it cuts the cost for every program down by one or two syscalls. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHR9YQ2ijCOnn/RHQRArbyAJ0d25FPg/BWmJ4YIzJKhO9iaBJNXwCgmpuX PAA6u3Dc56AlBegTRqtqJPc= =j5vi -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 1/5] actual sys_indirect code
This is the actual architecture-independent part of the system call implementation. include/linux/indirect.h | 17 + include/linux/sched.h|4 include/linux/syscalls.h |4 kernel/Makefile |3 +++ kernel/indirect.c| 40 5 files changed, 68 insertions(+) diff -u linux/include/linux/indirect.h linux/include/linux/indirect.h --- linux/include/linux/indirect.h +++ linux/include/linux/indirect.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_INDIRECT_H +#define _LINUX_INDIRECT_H + +#include + + +/* IMPORTANT: + All the elements of this union must be neutral to the word size + and must not require reworking when used in compat syscalls. Used + fixed-size types or types which are known to not vary in size across + architectures. */ +union indirect_params { +}; + +#define INDIRECT_PARAM(set, name) current->indirect_params.set.name + +#endif diff -u linux/kernel/Makefile linux/kernel/Makefile --- linux/kernel/Makefile +++ linux/kernel/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_ARCH_HAS_INDIRECT_SYSCALLS) += indirect.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <[EMAIL PROTECTED]>, the -fno-omit-frame-pointer is @@ -67,6 +68,8 @@ CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer endif +CFLAGS_indirect.o = -Wno-undef + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. 
diff -u linux/kernel/indirect.c linux/kernel/indirect.c --- linux/kernel/indirect.c +++ linux/kernel/indirect.c @@ -0,0 +1,40 @@ +#include +#include +#include +#include + + +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags) +{ + struct indirect_registers regs; + long result; + + if (unlikely(flags != 0)) + return -EINVAL; + + if (copy_from_user(&regs, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL (&regs)) + { +#define INDSYSCALL(name) __NR_##name +#include + break; + + default: + return -EINVAL; + } + + if (paramslen > sizeof(union indirect_params)) + return -EINVAL; + + result = -EFAULT; + if (!copy_from_user(&current->indirect_params, userparams, paramslen)) + result = call_indirect(&regs); + + memset(&current->indirect_params, '\0', paramslen); + + return result; +} diff -u linux/include/linux/syscalls.h linux/include/linux/syscalls.h --- linux/include/linux/syscalls.h +++ linux/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct indirect_registers; #include #include @@ -611,6 +612,9 @@ const struct itimerspec __user *utmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); --- linux/include/linux/sched.h +++ linux/include/linux/sched.h @@ -80,6 +80,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -1174,6 +1175,9 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + + /* Additional system call parameters. 
*/ + union indirect_params indirect_params; }; /* - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 0/5] sys_indirect system call
The following patches provide an alternative implementation of the sys_indirect system call which has been discussed a few times. This is a system call that allows us to extend existing system call interfaces by adding more system call parameters. Davide's previous implementation is IMO far more complex than warranted. This code here is trivial, as you can see. I've discussed this approach with Linus recently and for a brief moment we actually agreed on something. We pass an additional block of data to the kernel, it is copied into the task_struct, and then it is up to the function implementing the system call to interpret the data. Each system call, which is meant to be extended this way, has to be white-listed in sys_indirect. The alternative is to filter out those system calls which absolutely cannot be handled using sys_indirect (like clone, execve) since they require the stack layout of an ordinary system call. This is more dangerous since it is too easy to miss a call. Note that the sys_indirect system call takes an additional parameter which is for now forced to be zero. This parameter is meant to enable the use of sys_indirect to create syslets, asynchronously executed system calls. This syslet approach is also the main reason for the interface in the form proposed here. The code for x86 and x86-64 gets by without a single line of assembly code. This is likely to be true for many other archs as well. There is architecture-dependent code, though. The last three patches show the first application of the functionality. They also show a complication: we need the test for valid sub-syscalls in the main implementation and in the compatibility code. And more: the actual sources and generated binary for the test are very different (the numbers differ). Duplicating the information is a big problem, though. I've used some macro tricks to avoid this. All the information about the flags and the system calls using them is concentrated in one header. 
This should keep maintenance bearable. This patch to use sys_indirect is just the beginning. More will follow, but I want to see how these patches are received before I spend more time on it. This code is enough to test the implementation with the following test program. Adjust it for architectures other than x86 and x86-64. What is not addressed are differences in opinion about the whole approach. Maybe Linus can chime in and defend what is basically his design. #include #include #include #include #include #include #include #include typedef uint32_t __u32; typedef uint64_t __u64; union indirect_params { struct { int flags; } file_flags; }; #ifdef __x86_64__ # define __NR_indirect 286 struct indirect_registers { __u64 rax; __u64 rdi; __u64 rsi; __u64 rdx; __u64 r10; __u64 r8; __u64 r9; }; #elif defined __i386__ # define __NR_indirect 325 struct indirect_registers { __u32 eax; __u32 ebx; __u32 ecx; __u32 edx; __u32 esi; __u32 edi; __u32 ebp; }; #else # error "need to define __NR_indirect and struct indirect_params" #endif #define FILL_IN(var, values...) \ var = (struct indirect_registers) { values } int main (void) { int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP); int s1 = fcntl (fd, F_GETFD); int t1 = fcntl (fd, F_GETFL); printf ("old: FD_CLOEXEC %s set, NONBLOCK %s set\n", s1 == 0 ? "not" : "is", (t1 & O_NONBLOCK) ? "is" : "not"); close (fd); union indirect_params i; memset(&i, '\0', sizeof(i)); i.file_flags.flags = O_CLOEXEC|O_NONBLOCK; struct indirect_registers r; #ifdef __NR_socketcall # define SOCKOP_socket 1 long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP }; FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args); #else FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP); #endif fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0); int s2 = fcntl (fd, F_GETFD); int t2 = fcntl (fd, F_GETFL); printf ("new: FD_CLOEXEC %s set, NONBLOCK %s set\n", s2 == 0 ? "not" : "is", (t2 & O_NONBLOCK) ? 
"is" : "not"); close (fd); i.file_flags.flags = O_CLOEXEC; sigset_t ss; sigemptyset(&ss); FILL_IN(r, __NR_signalfd, -1, (long) &ss, 8); fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0); int s3 = fcntl (fd, F_GETFD); printf ("signalfd: FD_CLOEXEC %s set\n", s3 == 0 ? "not" : "is"); close (fd); FILL_IN(r, __NR_eventfd, 8); fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0); int s4 = fcntl (fd, F_GETFD); printf ("eventfd: FD_CLOEXEC %s set\n", s4 == 0 ? "not" : "is"); close (fd); return s1 != 0 || s2 == 0 || t1 != 0 || t2 == 0 || s3 == 0 || s4 == 0; } Signed-off-by: Ulrich Drepper <
[PATCHv5 2/5] x86 support for sys_indirect
This part adds support for sys_indirect on x86 and x86-64. arch/x86/Kconfig |3 ++ arch/x86/ia32/Makefile |1 arch/x86/ia32/ia32entry.S |2 + arch/x86/ia32/sys_ia32.c | 38 + arch/x86/kernel/syscall_table_32.S |1 include/asm-x86/indirect.h |5 include/asm-x86/indirect_32.h | 25 include/asm-x86/indirect_64.h | 36 +++ include/asm-x86/unistd_32.h|3 +- include/asm-x86/unistd_64.h|2 + 10 files changed, 115 insertions(+), 1 deletion(-) --- linux/arch/x86/Kconfig +++ linux/arch/x86/Kconfig @@ -112,6 +112,9 @@ config GENERIC_TIME_VSYSCALL bool default X86_64 +config ARCH_HAS_INDIRECT_SYSCALLS + def_bool y + diff -u linux/include/asm-x86/indirect_32.h linux/include/asm-x86/indirect_32.h --- linux/include/asm-x86/indirect_32.h +++ linux/include/asm-x86/indirect_32.h @@ -0,0 +1,25 @@ +#ifndef _ASM_X86_INDIRECT_32_H +#define _ASM_X86_INDIRECT_32_H + +struct indirect_registers { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)->eax + +static inline long call_indirect(struct indirect_registers *regs) +{ + extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, __u32); + + return sys_call_table[INDIRECT_SYSCALL(regs)](regs->ebx, regs->ecx, + regs->edx, regs->esi, + regs->edi, regs->ebp); +} + +#endif diff -u linux/include/asm-x86/indirect_64.h linux/include/asm-x86/indirect_64.h --- linux/include/asm-x86/indirect_64.h +++ linux/include/asm-x86/indirect_64.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_INDIRECT_64_H +#define _ASM_X86_INDIRECT_64_H + +struct indirect_registers { + __u64 rax; + __u64 rdi; + __u64 rsi; + __u64 rdx; + __u64 r10; + __u64 r8; + __u64 r9; +}; + +struct indirect_registers32 { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)->rax +#define INDIRECT_SYSCALL32(regs) (regs)->eax + +static inline long call_indirect(struct indirect_registers *regs) +{ + extern long (*sys_call_table[]) (__u64, 
__u64, __u64, __u64, __u64, __u64); + + return sys_call_table[INDIRECT_SYSCALL(regs)](regs->rdi, regs->rsi, + regs->rdx, regs->r10, + regs->r8, regs->r9); +} + +#endif diff -u linux/arch/x86/ia32/sys_ia32.c linux/arch/x86/ia32/sys_ia32.c --- linux/arch/x86/ia32/sys_ia32.c +++ linux/arch/x86/ia32/sys_ia32.c @@ -889,0 +890,38 @@ + +asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs, + void __user *userparams, size_t paramslen, + int flags) +{ + extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32); + + struct indirect_registers32 regs; + long result; + + if (flags != 0) + return -EINVAL; + + if (copy_from_user(&regs, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL32(&regs)) + { +#define INDSYSCALL(name) __NR_ia32_##name +#include <linux/indirect.h> + break; + + default: + return -EINVAL; + } + + if (paramslen > sizeof(union indirect_params)) + return -EINVAL; + result = -EFAULT; + if (!copy_from_user(&current->indirect_params, userparams, paramslen)) + result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx, + regs.edx, regs.esi, + regs.edi, regs.ebp); + + memset(&current->indirect_params, '\0', paramslen); + + return result; +} --- linux/arch/x86/ia32/Makefile +++ linux/arch/x86/ia32/Makefile @@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) +CFLAGS_sys_ia32.o = -Wno-undef AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 --- linux/arch/x86/ia32/ia32entry.S +++ linux/arch/x86/ia32/ia32entry.S @@ -400,6 +400,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 + .globl ia32_sys_call_table ia32_sys_call_table: .quad sys_restart_syscall .quad sys_exit @@ -726,4 +727,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd .quad sys_eventfd .quad sys32_fallocate + .quad sys32_indirect/* 325 */ ia32_syscall_end: --- linux/arch/x86/kernel/syscall_table_32.S +++
[PATCHv5 3/5] Allow setting FD_CLOEXEC flag for new sockets
This is a first user of sys_indirect. Several of the socket-related system calls which produce a file handle now can be passed an additional parameter to set the FD_CLOEXEC flag. include/asm-x86/ia32_unistd.h |1 + include/linux/indirect.h | 27 +++ net/socket.c | 21 + 3 files changed, 41 insertions(+), 8 deletions(-) diff -u linux/include/linux/indirect.h linux/include/linux/indirect.h --- linux/include/linux/indirect.h +++ linux/include/linux/indirect.h @@ -1,3 +1,4 @@ +#ifndef INDSYSCALL #ifndef _LINUX_INDIRECT_H #define _LINUX_INDIRECT_H @@ -13,5 +14,31 @@ + struct { +int flags; + } file_flags; }; #define INDIRECT_PARAM(set, name) current->indirect_params.set.name #endif +#else + +/* Here comes the list of system calls which can be called through + sys_indirect. When the list of supported system calls is needed the + file including this header is supposed to define a macro "INDSYSCALL" + which adds a prefix fitting to the use. If the resulting macro is + defined we generate a line + case MACRO: + */ +#if INDSYSCALL(accept) + case INDSYSCALL(accept): +#endif +#if INDSYSCALL(socket) + case INDSYSCALL(socket): +#endif +#if INDSYSCALL(socketcall) + case INDSYSCALL(socketcall): +#endif +#if INDSYSCALL(socketpair) + case INDSYSCALL(socketpair): +#endif + +#endif --- linux/include/asm-x86/ia32_unistd.h +++ linux/include/asm-x86/ia32_unistd.h @@ -12,6 +12,7 @@ #define __NR_ia32_exit 1 #define __NR_ia32_read 3 #define __NR_ia32_write 4 +#define __NR_ia32_socketcall 102 #define __NR_ia32_sigreturn 119 #define __NR_ia32_rt_sigreturn 173 diff -u linux/net/socket.c linux/net/socket.c --- linux/net/socket.c +++ linux/net/socket.c @@ -344,11 +344,11 @@ * but we take care of internal coherence yet.
*/ -static int sock_alloc_fd(struct file **filep) +static int sock_alloc_fd(struct file **filep, int flags) { int fd; - fd = get_unused_fd(); + fd = get_unused_fd_flags(flags); if (likely(fd >= 0)) { struct file *file = get_empty_filp(); @@ -391,10 +391,10 @@ return 0; } -int sock_map_fd(struct socket *sock) +static int sock_map_fd_flags(struct socket *sock, int flags) { struct file *newfile; - int fd = sock_alloc_fd(&newfile); + int fd = sock_alloc_fd(&newfile, flags); if (likely(fd >= 0)) { int err = sock_attach_fd(sock, newfile); @@ -409,6 +409,11 @@ return fd; } +int sock_map_fd(struct socket *sock) +{ + return sock_map_fd_flags(sock, 0); +} + static struct socket *sock_from_file(struct file *file, int *err) { if (file->f_op == &socket_file_ops) @@ -1208,7 +1213,7 @@ if (retval < 0) goto out; - retval = sock_map_fd(sock); + retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags)); if (retval < 0) goto out_release; @@ -1249,13 +1254,13 @@ if (err < 0) goto out_release_both; - fd1 = sock_alloc_fd(&newfile1); + fd1 = sock_alloc_fd(&newfile1, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd1 < 0)) { err = fd1; goto out_release_both; } - fd2 = sock_alloc_fd(&newfile2); + fd2 = sock_alloc_fd(&newfile2, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd2 < 0)) { err = fd2; put_filp(newfile1); @@ -1411,7 +1416,7 @@ */ __module_get(newsock->ops->owner); - newfd = sock_alloc_fd(&newfile); + newfd = sock_alloc_fd(&newfile, INDIRECT_PARAM(file_flags, flags)); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 5/5] FD_CLOEXEC support for eventfd, signalfd, timerfd
This patch adds support to set the FD_CLOEXEC flag for the file descriptors returned by eventfd, signalfd, timerfd. fs/anon_inodes.c | 15 +++ fs/eventfd.c |5 +++-- fs/signalfd.c |6 -- fs/timerfd.c |6 -- include/asm-x86/ia32_unistd.h |3 +++ include/linux/anon_inodes.h |3 +++ include/linux/indirect.h |3 +++ 7 files changed, 31 insertions(+), 10 deletions(-) --- linux/include/linux/indirect.h +++ linux/include/linux/indirect.h @@ -40,5 +40,8 @@ union indirect_params { #if INDSYSCALL(socketpair) case INDSYSCALL(socketpair): #endif + case INDSYSCALL(eventfd): + case INDSYSCALL(signalfd): + case INDSYSCALL(timerfd): #endif --- linux/fs/anon_inodes.c +++ linux/fs/anon_inodes.c @@ -70,9 +70,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. */ -int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, -const char *name, const struct file_operations *fops, -void *priv) +int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file **pfile, + const char *name, const struct file_operations *fops, + void *priv, int flags) { struct qstr this; struct dentry *dentry; @@ -85,7 +85,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, if (!file) return -ENFILE; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) goto err_put_filp; fd = error; @@ -138,6 +138,13 @@ err_put_filp: put_filp(file); return error; } + +int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, +const char *name, const struct file_operations *fops, +void *priv) +{ + return anon_inode_getfd_flags(pfd, pinode, pfile, name, fops, priv, 0); +} EXPORT_SYMBOL_GPL(anon_inode_getfd); /* --- linux/include/linux/anon_inodes.h +++ linux/include/linux/anon_inodes.h @@ -8,6 +8,9 @@ #ifndef _LINUX_ANON_INODES_H #define _LINUX_ANON_INODES_H +int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file **pfile, 
+ const char *name, const struct file_operations *fops, + void *priv, int flags); int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, const char *name, const struct file_operations *fops, void *priv); --- linux/fs/eventfd.c +++ linux/fs/eventfd.c @@ -215,8 +215,9 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]", - &eventfd_fops, ctx); + error = anon_inode_getfd_flags(&fd, &inode, &file, "[eventfd]", + &eventfd_fops, ctx, + INDIRECT_PARAM(file_flags, flags)); if (!error) return fd; --- linux/fs/signalfd.c +++ linux/fs/signalfd.c @@ -224,8 +224,10 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(&ufd, &inode, &file, "[signalfd]", - &signalfd_fops, ctx); + error = anon_inode_getfd_flags(&ufd, &inode, &file, + "[signalfd]", &signalfd_fops, + ctx, INDIRECT_PARAM(file_flags, + flags)); if (error) goto err_fdalloc; } else { --- linux/fs/timerfd.c +++ linux/fs/timerfd.c @@ -182,8 +182,10 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags, * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]", - &timerfd_fops, ctx); + error = anon_inode_getfd_flags(&ufd, &inode, &file, "[timerfd]", + &timerfd_fops, ctx, + INDIRECT_PARAM(file_flags, + flags)); if (error) goto err_tmrcancel; } else { --- linux/include/asm-x86/ia32_unistd.h +++ linux/include/asm-x86/ia32_unistd.h @@ -15,5 +15,8 @@ #define __NR_ia32_socketcall 102 #define
[PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets
This patch adds support for setting the O_NONBLOCK flag of the file descriptors returned by socket, socketpair, and accept. socket.c | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) --- linux/net/socket.c +++ linux/net/socket.c @@ -362,7 +362,7 @@ static int sock_alloc_fd(struct file **filep, int flags) return fd; } -static int sock_attach_fd(struct socket *sock, struct file *file) +static int sock_attach_fd(struct socket *sock, struct file *file, int flags) { struct dentry *dentry; struct qstr name = { .name = "" }; @@ -384,7 +384,7 @@ static int sock_attach_fd(struct socket *sock, struct file *file) init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE, _file_ops); SOCK_INODE(sock)->i_fop = _file_ops; - file->f_flags = O_RDWR; + file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->f_pos = 0; file->private_data = sock; @@ -397,7 +397,7 @@ static int sock_map_fd_flags(struct socket *sock, int flags) int fd = sock_alloc_fd(, flags); if (likely(fd >= 0)) { - int err = sock_attach_fd(sock, newfile); + int err = sock_attach_fd(sock, newfile, flags); if (unlikely(err < 0)) { put_filp(newfile); @@ -1268,12 +1268,14 @@ asmlinkage long sys_socketpair(int family, int type, int protocol, goto out_release_both; } - err = sock_attach_fd(sock1, newfile1); + err = sock_attach_fd(sock1, newfile1, +INDIRECT_PARAM(file_flags, flags)); if (unlikely(err < 0)) { goto out_fd2; } - err = sock_attach_fd(sock2, newfile2); + err = sock_attach_fd(sock2, newfile2, +INDIRECT_PARAM(file_flags, flags)); if (unlikely(err < 0)) { fput(newfile1); goto out_fd1; @@ -1423,7 +1425,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, goto out_put; } - err = sock_attach_fd(newsock, newfile); + err = sock_attach_fd(newsock, newfile, +INDIRECT_PARAM(file_flags, flags)); if (err < 0) goto out_fd_simple; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.24-rc3: find complains about /proc/net
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Roland McGrath wrote: > Oh, it seems it has indeed been that way for a very long time, so I was > mistaken. It still seems a little odd to me. Ulrich can say definitively > whether the kind of concern I mentioned really matters one way or the other > for glibc. glibc cannot survive (at least NPTL) if somebody uses funny CLONE_* flags to separate various pieces of information, e.g., file descriptors. So, all the information in each thread's /proc/self should be identical. When the information is not the same, the current semantics seems to be more useful. So I guess, no change is the way to go here. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHQ25/2ijCOnn/RHQRAmhhAJsHRF7FqO8DWwZ97gHxIO/i4Z1AAQCffCGa Q2J8kjthKbbNQf1USWMAw3Y= =xl/a -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 0/6] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Zach Brown wrote: > I'm sure the additional parameter will be needed, and it might be pretty > involved. I think the current notion of syslets needs, at the very least: All correct. I just want to point out that the proposed interface is sufficiently prepared for this and that there is no need to wait adding this initial, synchronous syscall stuff before the syslet stuff is ready. These interface changes are security-relevant and should be added ASAP. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHQySu2ijCOnn/RHQRAnQqAKCz0JzvmAeEcL8m77jbEYAZ4ZFWXwCgpfvE do7pJGn9XBu9jfQhfLkxQSc= =eX6m -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 4/6] Allow setting FD_CLOEXEC flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Zach Brown wrote: > Have you given thought to having to perform compat translation on this? > Today it's only copied directly from the user pointer into the union > in the task_struct. Since there is no legacy interface to worry about all members added to the structure can and should be neutral of the word size. We've done this with some syscalls already (like pread64) where we always use the wide form in the parameter list. It's just more simple here since it does not have to split into two 32-bit registers. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHQyJn2ijCOnn/RHQRAmWeAJ0Q6qBDtZDvsZYlfBnPFL6n11Z+lwCghiVp NklFHsSnVyQYMD5rinDFQPo= =Yo5E -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 dean gaudet wrote: > as an application writer how do i access accept(2) with FD_CLOEXEC > functionality? will glibc expose an accept2() with a flags param? Not yet decided. There is the alternative to extend the accept() interface to have both interfaces: int accept(int, struct sockaddr *, socklen_t *); and int accept(int, struct sockaddr *, socklen_t *, int); We can do this with type safety even in C nowadays. > if so... why don't we just have an accept2() syscall? If you read the mails of my first submission you'll find that I explained this. I talked to Andrew and he favored new syscalls. But then I talked to Linus and he favored this approach. Probably especially because it can be used for syslets as well. And it is less code and data than introducing new syscalls. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQwhx2ijCOnn/RHQRAnezAKCkFmGwlwDZjpfKTRSUN4yLIeGTkACgtMK/ OcHdIaR8wbp848D3GU2iNYQ= =nTu9 -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 2/6] x86 support for sys_indirect
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Heiko Carstens wrote: > All these macros could be functions, or? Would give us some type checking > and avoids the capital letters. Should be possible now. I didn't do it initially since the macro used the macro for the largest syscall number. That macro wasn't always available. I'll test it. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQwdg2ijCOnn/RHQRAmh9AJ9EuthsaoupSHn3kR/x0cWxqR3FoQCfSbmE 8RIDWzPKZ6cv+QVGNl0fawM= =ScgY -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 0/6] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Eric Dumazet wrote: > I am wondering if some parts are missing from your ChangeLog > > You apparently added in v3 a new 'flags' parameter to indirect syscall > but no trace of this change in Changelog, and why it was added. This > seems to imply a future multiplexor. This was mentioned in one of my mails. I added the parameter to accommodate Linus's and Zach's idea to use the functionality for syslets as well. Not really a multiplexer; it is meant to be an "execute synchronously or asynchronously" flag. In the latter case an additional parameter might be needed to indicate the notification mechanism. > And no change in the test program reflecting this 'flags' new param, so > it fails. Yep, sorry, I didn't update the text by including the most recent test program. I'll do that for the next version. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQwca2ijCOnn/RHQRAgQJAKDH+N3+FSJ0kD5VbzbAFN4918wREwCePHbc nSY/t9x1FuYstYDaaT6Kut0= =c95e -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 David Miller wrote: > FWIW, I think this indirect syscall stuff is the most ugly interface > I've ever seen proposed for the kernel. Well, the alternative is to introduce dozens of new interfaces. It was Linus who suggested this alternative. Plus, it seems that for syslets we need basically the same interface anyway. > And I agree with all of the objections raised by both H. Peter Anvin > and Eric Dumazet. Eric had no arguments and HP's comments lack a viable alternative proposal. > Where does this INDIRECT_PARAM() macro get defined? I do not > see it being defined anywhere in these patches. Defined in <linux/indirect.h>: +#define INDIRECT_PARAM(set, name) current->indirect_params.set.name Not my idea, I was following one review comment. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQwWl2ijCOnn/RHQRAhEbAJ9/bkrb/phOMRl16Fb0N1TDYglSsgCeNhHQ 3huhdKCAVTu4CJnktf/ufy4= =Jj6h -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 David Miller wrote: > FWIW, I think this indirect syscall stuff is the most ugly interface > I've ever seen proposed for the kernel. Well, the alternative is to introduce dozens of new interfaces. It was Linus who suggested this alternative. Plus, it seems that for syslets we need basically the same interface anyway. > And I agree with all of the objections raised by both H. Peter Anvin > and Eric Dumazet. Eric had no arguments and HP's comments lack a viable alternative proposal. > Where does this INDIRECT_PARAM() macro get defined? I do not > see it being defined anywhere in these patches. Defined in <linux/indirect.h>: +#define INDIRECT_PARAM(set, name) current->indirect_params.set.name Not my idea, I was following one review comment. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQwWl2ijCOnn/RHQRAhEbAJ9/bkrb/phOMRl16Fb0N1TDYglSsgCeNhHQ 3huhdKCAVTu4CJnktf/ufy4= =Jj6h -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 0/6] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Eric Dumazet wrote: > I am wondering if some parts are missing from your ChangeLog > > You apparently added in v3 a new 'flags' parameter to indirect syscall > but no trace of this change in Changelog, and why it was added. This > seems to imply a future multiplexor. This was mentioned in one of my mails. I added the parameter to accommodate Linus's and Zach's idea to use the functionality for syslets as well. Not really a multiplexer; it is meant to be an "execute synchronously or asynchronously" flag. In the latter case an additional parameter might be needed to indicate the notification mechanism. > And no change in the test program reflecting this 'flags' new param, so > it fails. Yep, sorry, I didn't update the text by including the most recent test program. I'll do that for the next version. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQwca2ijCOnn/RHQRAgQJAKDH+N3+FSJ0kD5VbzbAFN4918wREwCePHbc nSY/t9x1FuYstYDaaT6Kut0= =c95e -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 dean gaudet wrote: as an application writer how do i access accept(2) with FD_CLOEXEC functionality? will glibc expose an accept2() with a flags param? Not yet decided. There is the alternative to extend the accept() interface to have both interfaces: int accept(int, struct sockaddr *, socklen_t *); and int accept(int, struct sockaddr *, socklen_t *, int); We can do this with type safety even in C nowadays. if so... why don't we just have an accept2() syscall? If you read the mails of my first submission you'll find that I explained this. I talked to Andrew and he favored new syscalls. But then I talked to Linus and he favored this approach. Probably especially because it can be used for syslets as well. And it is less code and data than introducing new syscalls. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQwhx2ijCOnn/RHQRAnezAKCkFmGwlwDZjpfKTRSUN4yLIeGTkACgtMK/ OcHdIaR8wbp848D3GU2iNYQ= =nTu9 -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 2/6] x86 & x86-64 support for sys_indirect
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Heiko Carstens wrote: All these macros could be functions, or? Would give us some type checking and avoids the capital letters. Should be possible now. I didn't do it initially since the macro used the macro for the largest syscall number. That macro wasn't always available. I'll test it. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQwdg2ijCOnn/RHQRAmh9AJ9EuthsaoupSHn3kR/x0cWxqR3FoQCfSbmE 8RIDWzPKZ6cv+QVGNl0fawM= =ScgY -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 0/6] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Zach Brown wrote: I'm sure the additional parameter will be needed, and it might be pretty involved. I think the current notion of syslets needs, at the very least: All correct. I just want to point out that the proposed interface is sufficiently prepared for this and that there is no need to wait adding this initial, synchronous syscall stuff before the syslet stuff is ready. These interface changes are security-relevant and should be added ASAP. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHQySu2ijCOnn/RHQRAnQqAKCz0JzvmAeEcL8m77jbEYAZ4ZFWXwCgpfvE do7pJGn9XBu9jfQhfLkxQSc= =eX6m -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv4 4/6] Allow setting FD_CLOEXEC flag for new sockets
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Zach Brown wrote: Have you given thought to having to perform compat translation on this? Today it's only copied directly from the user pointer into the union in the task_struct. Since there is no legacy interface to worry about all members added to the structure can and should be neutral of the word size. We've done this with some syscalls already (like pread64) where we always use the wide form in the parameter list. It's just more simple here since it does not have to split into two 32-bit registers. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHQyJn2ijCOnn/RHQRAmWeAJ0Q6qBDtZDvsZYlfBnPFL6n11Z+lwCghiVp NklFHsSnVyQYMD5rinDFQPo= =Yo5E -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.24-rc3: find complains about /proc/net
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Roland McGrath wrote: Oh, it seems it has indeed been that way for a very long time, so I was mistaken. It still seems a little odd to me. Ulrich can say definitively whether the kind of concern I mentioned really matters one way or the other for glibc. glibc cannot survive (at least NPTL) if somebody uses funny CLONE_* flags to separate various pieces of information, e.g., file descriptors. So, all the information in each thread's /proc/self should be identical. When the information is not the same, the current semantics seems to be more useful. So I guess, no change is the way to go here. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHQ25/2ijCOnn/RHQRAmhhAJsHRF7FqO8DWwZ97gHxIO/i4Z1AAQCffCGa Q2J8kjthKbbNQf1USWMAw3Y= =xl/a -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 5/5] FD_CLOEXEC support for eventfd, signalfd, timerfd
This patch adds support to set the FD_CLOEXEC flag for the file descriptors returned by eventfd, signalfd, timerfd. fs/anon_inodes.c | 15 +++ fs/eventfd.c |5 +++-- fs/signalfd.c |6 -- fs/timerfd.c |6 -- include/asm-x86/ia32_unistd.h |3 +++ include/linux/anon_inodes.h |3 +++ include/linux/indirect.h |3 +++ 7 files changed, 31 insertions(+), 10 deletions(-) --- linux/include/linux/indirect.h +++ linux/include/linux/indirect.h @@ -40,5 +40,8 @@ union indirect_params { #if INDSYSCALL(socketpair) case INDSYSCALL(socketpair): #endif + case INDSYSCALL(eventfd): + case INDSYSCALL(signalfd): + case INDSYSCALL(timerfd): #endif --- linux/fs/anon_inodes.c +++ linux/fs/anon_inodes.c @@ -70,9 +70,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. */ -int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, -const char *name, const struct file_operations *fops, -void *priv) +int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file **pfile, + const char *name, const struct file_operations *fops, + void *priv, int flags) { struct qstr this; struct dentry *dentry; @@ -85,7 +85,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, if (!file) return -ENFILE; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error 0) goto err_put_filp; fd = error; @@ -138,6 +138,13 @@ err_put_filp: put_filp(file); return error; } + +int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, +const char *name, const struct file_operations *fops, +void *priv) +{ + return anon_inode_getfd_flags(pfd, pinode, pfile, name, fops, priv, 0); +} EXPORT_SYMBOL_GPL(anon_inode_getfd); /* --- linux/include/linux/anon_inodes.h +++ linux/include/linux/anon_inodes.h @@ -8,6 +8,9 @@ #ifndef _LINUX_ANON_INODES_H #define _LINUX_ANON_INODES_H +int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file **pfile, + 
const char *name, const struct file_operations *fops, + void *priv, int flags); int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, const char *name, const struct file_operations *fops, void *priv); --- linux/fs/eventfd.c +++ linux/fs/eventfd.c @@ -215,8 +215,9 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(fd, inode, file, [eventfd], -eventfd_fops, ctx); + error = anon_inode_getfd_flags(fd, inode, file, [eventfd], + eventfd_fops, ctx, + INDIRECT_PARAM(file_flags, flags)); if (!error) return fd; --- linux/fs/signalfd.c +++ linux/fs/signalfd.c @@ -224,8 +224,10 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(ufd, inode, file, [signalfd], -signalfd_fops, ctx); + error = anon_inode_getfd_flags(ufd, inode, file, + [signalfd], signalfd_fops, + ctx, INDIRECT_PARAM(file_flags, + flags)); if (error) goto err_fdalloc; } else { --- linux/fs/timerfd.c +++ linux/fs/timerfd.c @@ -182,8 +182,10 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags, * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(ufd, inode, file, [timerfd], -timerfd_fops, ctx); + error = anon_inode_getfd_flags(ufd, inode, file, [timerfd], + timerfd_fops, ctx, + INDIRECT_PARAM(file_flags, + flags)); if (error) goto err_tmrcancel; } else { --- linux/include/asm-x86/ia32_unistd.h +++
[PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets
This patch adds support for setting the O_NONBLOCK flag of the file descriptors returned by socket, socketpair, and accept. socket.c | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) --- linux/net/socket.c +++ linux/net/socket.c @@ -362,7 +362,7 @@ static int sock_alloc_fd(struct file **filep, int flags) return fd; } -static int sock_attach_fd(struct socket *sock, struct file *file) +static int sock_attach_fd(struct socket *sock, struct file *file, int flags) { struct dentry *dentry; struct qstr name = { .name = }; @@ -384,7 +384,7 @@ static int sock_attach_fd(struct socket *sock, struct file *file) init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE, socket_file_ops); SOCK_INODE(sock)-i_fop = socket_file_ops; - file-f_flags = O_RDWR; + file-f_flags = O_RDWR | (flags O_NONBLOCK); file-f_pos = 0; file-private_data = sock; @@ -397,7 +397,7 @@ static int sock_map_fd_flags(struct socket *sock, int flags) int fd = sock_alloc_fd(newfile, flags); if (likely(fd = 0)) { - int err = sock_attach_fd(sock, newfile); + int err = sock_attach_fd(sock, newfile, flags); if (unlikely(err 0)) { put_filp(newfile); @@ -1268,12 +1268,14 @@ asmlinkage long sys_socketpair(int family, int type, int protocol, goto out_release_both; } - err = sock_attach_fd(sock1, newfile1); + err = sock_attach_fd(sock1, newfile1, +INDIRECT_PARAM(file_flags, flags)); if (unlikely(err 0)) { goto out_fd2; } - err = sock_attach_fd(sock2, newfile2); + err = sock_attach_fd(sock2, newfile2, +INDIRECT_PARAM(file_flags, flags)); if (unlikely(err 0)) { fput(newfile1); goto out_fd1; @@ -1423,7 +1425,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, goto out_put; } - err = sock_attach_fd(newsock, newfile); + err = sock_attach_fd(newsock, newfile, +INDIRECT_PARAM(file_flags, flags)); if (err 0) goto out_fd_simple; - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 3/5] Allow setting FD_CLOEXEC flag for new sockets
This is a first user of sys_indirect. Several of the socket-related system calls which produce a file handle now can be passed an additional parameter to set the FD_CLOEXEC flag. include/asm-x86/ia32_unistd.h |1 + include/linux/indirect.h | 27 +++ net/socket.c | 21 + 3 files changed, 41 insertions(+), 8 deletions(-) diff -u linux/include/linux/indirect.h linux/include/linux/indirect.h --- linux/include/linux/indirect.h +++ linux/include/linux/indirect.h @@ -1,3 +1,4 @@ +#ifndef INDSYSCALL #ifndef _LINUX_INDIRECT_H #define _LINUX_INDIRECT_H @@ -13,5 +14,31 @@ + struct { +int flags; + } file_flags; }; #define INDIRECT_PARAM(set, name) current-indirect_params.set.name #endif +#else + +/* Here comes the list of system calls which can be called through + sys_indirect. When the list if support system calls is needed the + file including this header is supposed to define a macro INDSYSCALL + which adds a prefix fitting to the use. If the resulting macro is + defined we generate a line + case MACRO: + */ +#if INDSYSCALL(accept) + case INDSYSCALL(accept): +#endif +#if INDSYSCALL(socket) + case INDSYSCALL(socket): +#endif +#if INDSYSCALL(socketcall) + case INDSYSCALL(socketcall): +#endif +#if INDSYSCALL(socketpair) + case INDSYSCALL(socketpair): +#endif + +#endif --- linux/include/asm-x86/ia32_unistd.h +++ linux/include/asm-x86/ia32_unistd.h @@ -12,6 +12,7 @@ #define __NR_ia32_exit 1 #define __NR_ia32_read 3 #define __NR_ia32_write 4 +#define __NR_ia32_socketcall 102 #define __NR_ia32_sigreturn119 #define __NR_ia32_rt_sigreturn 173 diff -u linux/net/socket.c linux/net/socket.c --- linux/net/socket.c +++ linux/net/socket.c @@ -344,11 +344,11 @@ * but we take care of internal coherence yet. 
*/ -static int sock_alloc_fd(struct file **filep) +static int sock_alloc_fd(struct file **filep, int flags) { int fd; - fd = get_unused_fd(); + fd = get_unused_fd_flags(flags); if (likely(fd = 0)) { struct file *file = get_empty_filp(); @@ -391,10 +391,10 @@ return 0; } -int sock_map_fd(struct socket *sock) +static int sock_map_fd_flags(struct socket *sock, int flags) { struct file *newfile; - int fd = sock_alloc_fd(newfile); + int fd = sock_alloc_fd(newfile, flags); if (likely(fd = 0)) { int err = sock_attach_fd(sock, newfile); @@ -409,6 +409,11 @@ return fd; } +int sock_map_fd(struct socket *sock) +{ + return sock_map_fd_flags(sock, 0); +} + static struct socket *sock_from_file(struct file *file, int *err) { if (file-f_op == socket_file_ops) @@ -1208,7 +1213,7 @@ if (retval 0) goto out; - retval = sock_map_fd(sock); + retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags)); if (retval 0) goto out_release; @@ -1249,13 +1254,13 @@ if (err 0) goto out_release_both; - fd1 = sock_alloc_fd(newfile1); + fd1 = sock_alloc_fd(newfile1, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd1 0)) { err = fd1; goto out_release_both; } - fd2 = sock_alloc_fd(newfile2); + fd2 = sock_alloc_fd(newfile2, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd2 0)) { err = fd2; put_filp(newfile1); @@ -1411,7 +1416,7 @@ */ __module_get(newsock-ops-owner); - newfd = sock_alloc_fd(newfile); + newfd = sock_alloc_fd(newfile, INDIRECT_PARAM(file_flags, flags)); if (unlikely(newfd 0)) { err = newfd; sock_release(newsock); - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 1/5] actual sys_indirect code
This is the actual architecture-independent part of the system call implementation. include/linux/indirect.h | 17 + include/linux/sched.h|4 include/linux/syscalls.h |4 kernel/Makefile |3 +++ kernel/indirect.c| 40 5 files changed, 68 insertions(+) diff -u linux/include/linux/indirect.h linux/include/linux/indirect.h --- linux/include/linux/indirect.h +++ linux/include/linux/indirect.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_INDIRECT_H +#define _LINUX_INDIRECT_H + +#include asm/indirect.h + + +/* IMPORTANT: + All the elements of this union must be neutral to the word size + and must not require reworking when used in compat syscalls. Used + fixed-size types or types which are known to not vary in size across + architectures. */ +union indirect_params { +}; + +#define INDIRECT_PARAM(set, name) current-indirect_params.set.name + +#endif diff -u linux/kernel/Makefile linux/kernel/Makefile --- linux/kernel/Makefile +++ linux/kernel/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_ARCH_HAS_INDIRECT_SYSCALLS) += indirect.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra [EMAIL PROTECTED], the -fno-omit-frame-pointer is @@ -67,6 +68,8 @@ CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer endif +CFLAGS_indirect.o = -Wno-undef + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. 
diff -u linux/kernel/indirect.c linux/kernel/indirect.c --- linux/kernel/indirect.c +++ linux/kernel/indirect.c @@ -0,0 +1,40 @@ +#include linux/sched.h +#include linux/uaccess.h +#include linux/unistd.h +#include asm/asm-offsets.h + + +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags) +{ + struct indirect_registers regs; + long result; + + if (unlikely(flags != 0)) + return -EINVAL; + + if (copy_from_user(regs, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL (regs)) + { +#define INDSYSCALL(name) __NR_##name +#include linux/indirect.h + break; + + default: + return -EINVAL; + } + + if (paramslen sizeof(union indirect_params)) + return -EINVAL; + + result = -EFAULT; + if (!copy_from_user(current-indirect_params, userparams, paramslen)) + result = call_indirect(regs); + + memset(current-indirect_params, '\0', paramslen); + + return result; +} diff -u linux/include/linux/syscalls.h linux/include/linux/syscalls.h --- linux/include/linux/syscalls.h +++ linux/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct indirect_registers; #include linux/types.h #include linux/aio_abi.h @@ -611,6 +612,9 @@ const struct itimerspec __user *utmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); --- linux/include/linux/sched.h +++ linux/include/linux/sched.h @@ -80,6 +80,7 @@ struct sched_param { #include linux/rcupdate.h #include linux/futex.h #include linux/rtmutex.h +#include linux/indirect.h #include linux/time.h #include linux/param.h @@ -1174,6 +1175,9 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single 
dirties; + + /* Additional system call parameters. */ + union indirect_params indirect_params; }; /* - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 0/5] sys_indirect system call
The following patches provide an alternative implementation of the sys_indirect system call which has been discussed a few times. This is a system call that allows us to extend existing system call interfaces by adding more system call parameters. Davide's previous implementation is IMO far more complex than warranted. This code here is trivial, as you can see. I've discussed this approach with Linus recently and for a brief moment we actually agreed on something. We pass an additional block of data to the kernel, it is copied into the task_struct, and then it is up to the function implementing the system call to interpret the data. Each system call, which is meant to be extended this way, has to be white-listed in sys_indirect. The alternative is to filter out those system calls which absolutely cannot be handled using sys_indirect (like clone, execve) since they require the stack layout of an ordinary system call. This is more dangerous since it is too easy to miss a call. Note that the sys_indirect system call takes an additional parameter which is for now forced to be zero. This parameter is meant to enable the use of sys_indirect to create syslets, asynchronously executed system calls. This syslet approach is also the main reason for the interface in the form proposed here. The code for x86 and x86-64 gets by without a single line of assembly code. This is likely to be true for many other archs as well. There is architecture-dependent code, though. The last three patches show the first application of the functionality. They also show a complication: we need the test for valid sub-syscalls in the main implementation and in the compatibility code. And more: the actual sources and generated binary for the test are very different (the numbers differ). Duplicating the information is a big problem, though. I've used some macro tricks to avoid this. All the information about the flags and the system calls using them is concentrated in one header. 
This should keep maintenance bearable. This patch to use sys_indirect is just the beginning. More will follow, but I want to see how these patches are received before I spend more time on it. This code is enough to test the implementation with the following test program. Adjust it for architectures other than x86 and x86-64. What is not addressed are differences in opinion about the whole approach. Maybe Linus can chime in and defend what is basically his design. #include fcntl.h #include signal.h #include stdint.h #include stdio.h #include unistd.h #include netinet/in.h #include sys/socket.h #include sys/syscall.h typedef uint32_t __u32; typedef uint64_t __u64; union indirect_params { struct { int flags; } file_flags; }; #ifdef __x86_64__ # define __NR_indirect 286 struct indirect_registers { __u64 rax; __u64 rdi; __u64 rsi; __u64 rdx; __u64 r10; __u64 r8; __u64 r9; }; #elif defined __i386__ # define __NR_indirect 325 struct indirect_registers { __u32 eax; __u32 ebx; __u32 ecx; __u32 edx; __u32 esi; __u32 edi; __u32 ebp; }; #else # error need to define __NR_indirect and struct indirect_params #endif #define FILL_IN(var, values...) \ var = (struct indirect_registers) { values } int main (void) { int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP); int s1 = fcntl (fd, F_GETFD); int t1 = fcntl (fd, F_GETFL); printf (old: FD_CLOEXEC %s set, NONBLOCK %s set\n, s1 == 0 ? not : is, (t1 O_NONBLOCK) ? is : not); close (fd); union indirect_params i; memset(i, '\0', sizeof(i)); i.file_flags.flags = O_CLOEXEC|O_NONBLOCK; struct indirect_registers r; #ifdef __NR_socketcall # define SOCKOP_socket 1 long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP }; FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args); #else FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP); #endif fd = syscall (__NR_indirect, r, i, sizeof (i), 0); int s2 = fcntl (fd, F_GETFD); int t2 = fcntl (fd, F_GETFL); printf (new: FD_CLOEXEC %s set, NONBLOCK %s set\n, s2 == 0 ? not : is, (t2 O_NONBLOCK) ? 
is : not); close (fd); i.file_flags.flags = O_CLOEXEC; sigset_t ss; sigemptyset(ss); FILL_IN(r, __NR_signalfd, -1, (long) ss, 8); fd = syscall (__NR_indirect, r, i, sizeof (i), 0); int s3 = fcntl (fd, F_GETFD); printf (signalfd: FD_CLOEXEC %s set\n, s3 == 0 ? not : is); close (fd); FILL_IN(r, __NR_eventfd, 8); fd = syscall (__NR_indirect, r, i, sizeof (i), 0); int s4 = fcntl (fd, F_GETFD); printf (eventfd: FD_CLOEXEC %s set\n, s4 == 0 ? not : is); close (fd); return s1 != 0 || s2 == 0 || t1 != 0 || t2 == 0 || s3 == 0 || s4 == 0; } Signed-off-by: Ulrich Drepper [EMAIL PROTECTED] arch/x86/Kconfig |3 ++ arch/x86/ia32/Makefile |1 arch/x86/ia32/ia32entry.S
[PATCHv5 2/5] x86 & x86-64 support for sys_indirect
This part adds support for sys_indirect on x86 and x86-64. arch/x86/Kconfig |3 ++ arch/x86/ia32/Makefile |1 arch/x86/ia32/ia32entry.S |2 + arch/x86/ia32/sys_ia32.c | 38 + arch/x86/kernel/syscall_table_32.S |1 include/asm-x86/indirect.h |5 include/asm-x86/indirect_32.h | 25 include/asm-x86/indirect_64.h | 36 +++ include/asm-x86/unistd_32.h|3 +- include/asm-x86/unistd_64.h|2 + 10 files changed, 115 insertions(+), 1 deletion(-) --- linux/arch/x86/Kconfig +++ linux/arch/x86/Kconfig @@ -112,6 +112,9 @@ config GENERIC_TIME_VSYSCALL bool default X86_64 +config ARCH_HAS_INDIRECT_SYSCALLS + def_bool y + diff -u linux/include/asm-x86/indirect_32.h linux/include/asm-x86/indirect_32.h --- linux/include/asm-x86/indirect_32.h +++ linux/include/asm-x86/indirect_32.h @@ -0,0 +1,25 @@ +#ifndef _ASM_X86_INDIRECT_32_H +#define _ASM_X86_INDIRECT_32_H + +struct indirect_registers { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)-eax + +static inline long call_indirect(struct indirect_registers *regs) +{ + extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, __u32); + + return sys_call_table[INDIRECT_SYSCALL(regs)](regs-ebx, regs-ecx, + regs-edx, regs-esi, + regs-edi, regs-ebp); +} + +#endif diff -u linux/include/asm-x86/indirect_64.h linux/include/asm-x86/indirect_64.h --- linux/include/asm-x86/indirect_64.h +++ linux/include/asm-x86/indirect_64.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_INDIRECT_64_H +#define _ASM_X86_INDIRECT_64_H + +struct indirect_registers { + __u64 rax; + __u64 rdi; + __u64 rsi; + __u64 rdx; + __u64 r10; + __u64 r8; + __u64 r9; +}; + +struct indirect_registers32 { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)-rax +#define INDIRECT_SYSCALL32(regs) (regs)-eax + +static inline long call_indirect(struct indirect_registers *regs) +{ + extern long (*sys_call_table[]) (__u64, __u64, 
__u64, __u64, __u64, __u64); + + return sys_call_table[INDIRECT_SYSCALL(regs)](regs-rdi, regs-rsi, + regs-rdx, regs-r10, + regs-r8, regs-r9); +} + +#endif diff -u linux/arch/x86/ia32/sys_ia32.c linux/arch/x86/ia32/sys_ia32.c --- linux/arch/x86/ia32/sys_ia32.c +++ linux/arch/x86/ia32/sys_ia32.c @@ -889,0 +890,38 @@ + +asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs, + void __user *userparams, size_t paramslen, + int flags) +{ + extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32); + + struct indirect_registers32 regs; + long result; + + if (flags != 0) + return -EINVAL; + + if (copy_from_user(regs, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL32(regs)) + { +#define INDSYSCALL(name) __NR_ia32_##name +#include linux/indirect.h + break; + + default: + return -EINVAL; + } + + if (paramslen sizeof(union indirect_params)) + return -EINVAL; + result = -EFAULT; + if (!copy_from_user(current-indirect_params, userparams, paramslen)) + result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx, + regs.edx, regs.esi, + regs.edi, regs.ebp); + + memset(current-indirect_params, '\0', paramslen); + + return result; +} --- linux/arch/x86/ia32/Makefile +++ linux/arch/x86/ia32/Makefile @@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) +CFLAGS_sys_ia32.o = -Wno-undef AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 --- linux/arch/x86/ia32/ia32entry.S +++ linux/arch/x86/ia32/ia32entry.S @@ -400,6 +400,7 @@ END(ia32_ptregs_common) .section .rodata,a .align 8 + .globl ia32_sys_call_table ia32_sys_call_table: .quad sys_restart_syscall .quad sys_exit @@ -726,4 +727,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd .quad sys_eventfd .quad sys32_fallocate + .quad sys32_indirect/* 325 */ ia32_syscall_end: --- linux/arch/x86/kernel/syscall_table_32.S +++
[PATCHv4 4/6] Allow setting FD_CLOEXEC flag for new sockets
This is a first user of sys_indirect. Several of the socket-related system calls which produce a file handle now can be passed an additional parameter to set the FD_CLOEXEC flag. arch/x86/ia32/Makefile|1 + arch/x86/ia32/sys_ia32.c |4 include/asm-x86/ia32_unistd.h |1 + include/linux/indirect.h | 33 + kernel/Makefile |2 ++ kernel/indirect.c |4 net/socket.c | 21 + 7 files changed, 58 insertions(+), 8 deletions(-) --- arch/x86/ia32/Makefile +++ arch/x86/ia32/Makefile @@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) +CFLAGS_sys_ia32.o = -Wno-undef AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 --- kernel/Makefile +++ kernel/Makefile @@ -67,6 +67,8 @@ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer endif +CFLAGS_indirect.o = -Wno-undef + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. diff -u net/socket.c net/socket.c --- net/socket.c +++ net/socket.c @@ -344,11 +344,11 @@ * but we take care of internal coherence yet. 
*/ -static int sock_alloc_fd(struct file **filep) +static int sock_alloc_fd(struct file **filep, int flags) { int fd; - fd = get_unused_fd(); + fd = get_unused_fd_flags(flags); if (likely(fd >= 0)) { struct file *file = get_empty_filp(); @@ -391,10 +391,10 @@ return 0; } -int sock_map_fd(struct socket *sock) +static int sock_map_fd_flags(struct socket *sock, int flags) { struct file *newfile; - int fd = sock_alloc_fd(); + int fd = sock_alloc_fd(, flags); if (likely(fd >= 0)) { int err = sock_attach_fd(sock, newfile); @@ -409,6 +409,11 @@ return fd; } +int sock_map_fd(struct socket *sock) +{ + return sock_map_fd_flags(sock, 0); +} + static struct socket *sock_from_file(struct file *file, int *err) { if (file->f_op == _file_ops) @@ -1208,7 +1213,7 @@ if (retval < 0) goto out; - retval = sock_map_fd(sock); + retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags)); if (retval < 0) goto out_release; @@ -1249,13 +1254,13 @@ if (err < 0) goto out_release_both; - fd1 = sock_alloc_fd(); + fd1 = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd1 < 0)) { err = fd1; goto out_release_both; } - fd2 = sock_alloc_fd(); + fd2 = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd2 < 0)) { err = fd2; put_filp(newfile1); @@ -1411,7 +1416,7 @@ */ __module_get(newsock->ops->owner); - newfd = sock_alloc_fd(); + newfd = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags)); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); diff -u arch/x86/ia32/sys_ia32.c arch/x86/ia32/sys_ia32.c --- arch/x86/ia32/sys_ia32.c +++ arch/x86/ia32/sys_ia32.c @@ -902,6 +902,10 @@ switch (INDIRECT_SYSCALL32()) { +#define INDSYSCALL(name) __NR_ia32_##name +#include + break; + default: return -EINVAL; } diff -u include/linux/indirect.h include/linux/indirect.h --- include/linux/indirect.h +++ include/linux/indirect.h @@ -1,6 +1,39 @@ +#ifndef INDSYSCALL #ifndef _LINUX_INDIRECT_H #define _LINUX_INDIRECT_H #include + +union indirect_params { + struct { +int 
flags; + } file_flags; +}; + +#define INDIRECT_PARAM(set, name) current->indirect_params.set.name + +#endif +#else + +/* Here comes the list of system calls which can be called through + sys_indirect. When the list if support system calls is needed the + file including this header is supposed to define a macro "INDSYSCALL" + which adds a prefix fitting to the use. If the resulting macro is + defined we generate a line + case MACRO: + */ +#if INDSYSCALL(accept) + case INDSYSCALL(accept): +#endif +#if INDSYSCALL(socket) + case INDSYSCALL(socket): +#endif +#if INDSYSCALL(socketcall) + case INDSYSCALL(socketcall): +#endif +#if INDSYSCALL(socketpair) + case INDSYSCALL(socketpair): +#endif + #endif diff -u kernel/indirect.c kernel/indirect.c --- kernel/indirect.c +++ kernel/indirect.c @@ -19,6 +19,10 @@ switch (INDIRECT_SYSCALL ()) { +#define INDSYSCALL(name) __NR_##name +#include + break; + default: return -EINVAL; } --- include/asm-x86/ia32_unistd.h +++ include/asm-x86/ia32_unistd.h @@ -12,6 +12,7 @@
[PATCHv4 1/6] actual sys_indirect code
This is the actual architecture-independent part of the system call implementation. include/linux/indirect.h |6 ++ include/linux/sched.h|4 include/linux/syscalls.h |4 kernel/Makefile |2 +- kernel/indirect.c| 36 5 files changed, 51 insertions(+), 1 deletion(-) --- /dev/null +++ include/linux/indirect.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_INDIRECT_H +#define _LINUX_INDIRECT_H + +#include + +#endif --- include/linux/sched.h +++ include/linux/sched.h @@ -80,6 +80,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -1174,6 +1175,9 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + + /* Additional system call parameters. */ + union indirect_params indirect_params; }; /* --- include/linux/syscalls.h +++ include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct indirect_registers; #include #include @@ -611,6 +612,9 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags, const struct itimerspec __user *utmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); --- /dev/null +++ kernel/indirect.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + + +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags) +{ + struct indirect_registers regs; + long result; + + if (unlikely(flags != 0)) + return -EINVAL; + + if (copy_from_user(, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL ()) + { + default: + return -EINVAL; + } + + if (paramslen > sizeof(union indirect_params)) + return -EINVAL; + + result = -EFAULT; + if (!copy_from_user(>indirect_params, 
userparams, paramslen)) + result = CALL_INDIRECT(); + + memset(>indirect_params, '\0', paramslen); + + return result; +} --- kernel/Makefile +++ kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ - utsname.o notifier.o + utsname.o notifier.o indirect.o obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets
This patch adds support for setting the O_NONBLOCK flag of the file descriptors returned by socket, socketpair, and accept. socket.c | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) --- net/socket.c +++ net/socket.c @@ -362,7 +362,7 @@ static int sock_alloc_fd(struct file **filep, int flags) return fd; } -static int sock_attach_fd(struct socket *sock, struct file *file) +static int sock_attach_fd(struct socket *sock, struct file *file, int flags) { struct dentry *dentry; struct qstr name = { .name = "" }; @@ -384,7 +384,7 @@ static int sock_attach_fd(struct socket *sock, struct file *file) init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE, _file_ops); SOCK_INODE(sock)->i_fop = _file_ops; - file->f_flags = O_RDWR; + file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->f_pos = 0; file->private_data = sock; @@ -397,7 +397,7 @@ static int sock_map_fd_flags(struct socket *sock, int flags) int fd = sock_alloc_fd(, flags); if (likely(fd >= 0)) { - int err = sock_attach_fd(sock, newfile); + int err = sock_attach_fd(sock, newfile, flags); if (unlikely(err < 0)) { put_filp(newfile); @@ -1268,12 +1268,14 @@ asmlinkage long sys_socketpair(int family, int type, int protocol, goto out_release_both; } - err = sock_attach_fd(sock1, newfile1); + err = sock_attach_fd(sock1, newfile1, +INDIRECT_PARAM(file_flags, flags)); if (unlikely(err < 0)) { goto out_fd2; } - err = sock_attach_fd(sock2, newfile2); + err = sock_attach_fd(sock2, newfile2, +INDIRECT_PARAM(file_flags, flags)); if (unlikely(err < 0)) { fput(newfile1); goto out_fd1; @@ -1423,7 +1425,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, goto out_put; } - err = sock_attach_fd(newsock, newfile); + err = sock_attach_fd(newsock, newfile, +INDIRECT_PARAM(file_flags, flags)); if (err < 0) goto out_fd_simple; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv4 2/6] x86 support for sys_indirect
This part adds support for sys_indirect on x86 and x86-64. arch/x86/ia32/ia32entry.S |2 ++ arch/x86/ia32/sys_ia32.c | 31 +++ arch/x86/kernel/syscall_table_32.S |1 + include/asm-x86/indirect.h |5 + include/asm-x86/indirect_32.h | 23 +++ include/asm-x86/indirect_64.h | 34 ++ include/asm-x86/unistd_32.h|3 ++- include/asm-x86/unistd_64.h|2 ++ 8 files changed, 100 insertions(+), 1 deletion(-) --- arch/x86/ia32/ia32entry.S +++ arch/x86/ia32/ia32entry.S @@ -400,6 +400,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 + .globl ia32_sys_call_table ia32_sys_call_table: .quad sys_restart_syscall .quad sys_exit @@ -726,4 +727,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd .quad sys_eventfd .quad sys32_fallocate + .quad sys32_indirect/* 325 */ ia32_syscall_end: --- arch/x86/ia32/sys_ia32.c +++ arch/x86/ia32/sys_ia32.c @@ -887,3 +887,37 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, ((u64)len_hi << 32) | len_lo); } + +asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs, + void __user *userparams, size_t paramslen, + int flags) +{ + extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32); + + struct indirect_registers32 regs; + long result; + + if (flags != 0) + return -EINVAL; + + if (copy_from_user(, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL32()) + { + default: + return -EINVAL; + } + + if (paramslen > sizeof(union indirect_params)) + return -EINVAL; + result = -EFAULT; + if (!copy_from_user(>indirect_params, userparams, paramslen)) + result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx, + regs.edx, regs.esi, + regs.edi, regs.ebp); + + memset(>indirect_params, '\0', paramslen); + + return result; +} --- arch/x86/kernel/syscall_table_32.S +++ arch/x86/kernel/syscall_table_32.S @@ -324,3 +324,4 @@ ENTRY(sys_call_table) .long sys_timerfd .long sys_eventfd .long sys_fallocate + .long sys_indirect /* 325 */ 
--- /dev/null +++ include/asm-x86/indirect_32.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_INDIRECT_32_H +#define _ASM_X86_INDIRECT_32_H + +struct indirect_registers { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)->eax + +#define CALL_INDIRECT(regs) \ + ({ extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, __u32); \ + sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->ebx, (regs)->ecx, \ +(regs)->edx, (regs)->esi, \ +(regs)->edi, (regs)->ebp); \ + }) + +#endif --- /dev/null +++ include/asm-x86/indirect_64.h @@ -0,0 +1,34 @@ +#ifndef _ASM_X86_INDIRECT_64_H +#define _ASM_X86_INDIRECT_64_H + +struct indirect_registers { + __u64 rax; + __u64 rdi; + __u64 rsi; + __u64 rdx; + __u64 r10; + __u64 r8; + __u64 r9; +}; + +struct indirect_registers32 { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)->rax +#define INDIRECT_SYSCALL32(regs) (regs)->eax + +#define CALL_INDIRECT(regs) \ + ({ extern long (*sys_call_table[]) (__u64, __u64, __u64, __u64, __u64, __u64); \ + sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->rdi, (regs)->rsi, \ +(regs)->rdx, (regs)->r10, \ +(regs)->r8, (regs)->r9); \ + }) + +#endif --- /dev/null +++ include/asm-x86/indirect.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "indirect_32.h" +#else +# include "indirect_64.h" +#endif --- include/asm-x86/unistd_32.h +++ include/asm-x86/unistd_32.h @@ -330,10 +330,11 @@ #define __NR_timerfd 322 #define __NR_eventfd 323 #define __NR_fallocate 324 +#define __NR_indirect 325 #ifdef __KERNEL__ -#define NR_syscalls 325 +#define NR_syscalls 326 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR --- include/asm-x86/unistd_64.h +++ include/asm-x86/unistd_64.h @@ -635,6 +635,8 @@ __SYSCALL(__NR_timerfd, sys_timerfd) __SYSCALL(__NR_eventfd, sys_eventfd) #define __NR_fallocate
[PATCHv4 3/6] UML support for sys_indirect
This part adds support for sys_indirect for UML. indirect.h |6 ++ 1 file changed, 6 insertions(+) --- /dev/null +++ include/asm-um/indirect.h @@ -0,0 +1,6 @@ +#ifndef __UM_INDIRECT_H +#define __UM_INDIRECT_H + +#include "asm/arch/indirect.h" + +#endif - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv4 0/6] sys_indirect system call
The following patches provide an alternative implementation of the sys_indirect system call which has been discussed a few times. This new system call allows us to extend existing system call interfaces without adding more system calls. Davide's previous implementation is IMO far more complex than warranted. This code here is trivial, as you can see. I've discussed this approach with Linus last week and for a brief moment we actually agreed on something. We pass an additional block of data to the kernel, it is copied into the task_struct, and then it is up to the function implementing the system call to interpret the data. Each system call, which is meant to be extended this way, has to be white-listed in sys_indirect. The alternative is to filter out those system calls which absolutely cannot be handled using sys_indirect (like clone, execve) since they require the stack layout of an ordinary system call. This is more dangerous since it is too easy to miss a call. The code for x86 and x86-64 gets by without a single line of assembly code. This is likely to be true for most/all the other archs as well. There is architecture-dependent code, though. For x86 and x86-64 I've also fixed up UML (although only x86-64 is tested, that's my setup). The last three patches show the first application of the functionality. They also show a complication: we need the test for valid sub-syscalls in the main implementation and in the compatibility code. And more: the actual sources and generated binary for the test are very different (the numbers differ). Duplicating the information is a big problem, though. I've used some macro tricks to avoid this. All the information about the flags and the system calls using them is concentrated in one header. This should make maintenance bearable. This patch to use sys_indirect is just the beginning. More will follow, but I want to see how these patches are received before I spend more time on it. 
This code is enough to test the implementation with the following test program. Adjust it for architectures other than x86 and x86-64. #include #include #include #include #include #include #include #include typedef uint32_t __u32; typedef uint64_t __u64; union indirect_params { struct { int flags; } file_flags; }; #ifdef __x86_64__ # define __NR_indirect 286 struct indirect_registers { __u64 rax; __u64 rdi; __u64 rsi; __u64 rdx; __u64 r10; __u64 r8; __u64 r9; }; #elif defined __i386__ # define __NR_indirect 325 struct indirect_registers { __u32 eax; __u32 ebx; __u32 ecx; __u32 edx; __u32 esi; __u32 edi; __u32 ebp; }; #else # error "need to define __NR_indirect and struct indirect_params" #endif #define FILL_IN(var, values...) \ var = (struct indirect_registers) { values } int main (void) { int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP); int s1 = fcntl (fd, F_GETFD); int t1 = fcntl (fd, F_GETFL); printf ("old: FD_CLOEXEC %s set, NONBLOCK %s set\n", s1 == 0 ? "not" : "is", (t1 & O_NONBLOCK) ? "is" : "not"); close (fd); union indirect_params i; i.file_flags.flags = O_CLOEXEC|O_NONBLOCK; struct indirect_registers r; #ifdef __NR_socketcall # define SOCKOP_socket 1 long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP }; FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args); #else FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP); #endif fd = syscall (__NR_indirect, , , sizeof (i)); int s2 = fcntl (fd, F_GETFD); int t2 = fcntl (fd, F_GETFL); printf ("new: FD_CLOEXEC %s set, NONBLOCK %s set\n", s2 == 0 ? "not" : "is", (t2 & O_NONBLOCK) ? "is" : "not"); close (fd); i.file_flags.flags = O_CLOEXEC; sigset_t ss; sigemptyset(); FILL_IN(r, __NR_signalfd, -1, (long) , 8); fd = syscall (__NR_indirect, , , sizeof (i)); int s3 = fcntl (fd, F_GETFD); printf ("signalfd: FD_CLOEXEC %s set\n", s3 == 0 ? 
"not" : "is"); close (fd); FILL_IN(r, __NR_eventfd, 8); fd = syscall (__NR_indirect, , , sizeof (i)); int s4 = fcntl (fd, F_GETFD); printf ("eventfd: FD_CLOEXEC %s set\n", s4 == 0 ? "not" : "is"); close (fd); return s1 != 0 || s2 == 0 || t1 != 0 || t2 == 0 || s3 == 0 || s4 == 0; } Signed-off-by: Ulrich Drepper <[EMAIL PROTECTED]> arch/x86/ia32/Makefile |1 arch/x86/ia32/ia32entry.S |2 + arch/x86/ia32/sys_ia32.c | 37 +- arch/x86/kernel/syscall_table_32.S |1 include/asm-um/indirect.h |6 + include/asm-x86/ia32_unistd.h |1 include/asm-x86/indirect.h |5 include/asm-x86/indirect_32.h | 23
[PATCHv4 6/6] FD_CLOEXEC support for eventfd, signalfd, timerfd
This patch adds support to set the FD_CLOEXEC flag for the file descriptors returned by eventfd, signalfd, timerfd. fs/anon_inodes.c | 15 +++ fs/eventfd.c |5 +++-- fs/signalfd.c |6 -- fs/timerfd.c |6 -- include/asm-x86/ia32_unistd.h |3 +++ include/linux/anon_inodes.h |3 +++ include/linux/indirect.h |3 +++ 7 files changed, 31 insertions(+), 10 deletions(-) --- fs/anon_inodes.c +++ fs/anon_inodes.c @@ -70,9 +70,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. */ -int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, -const char *name, const struct file_operations *fops, -void *priv) +int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file **pfile, + const char *name, const struct file_operations *fops, + void *priv, int flags) { struct qstr this; struct dentry *dentry; @@ -85,7 +85,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, if (!file) return -ENFILE; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) goto err_put_filp; fd = error; @@ -138,6 +138,13 @@ err_put_filp: put_filp(file); return error; } + +int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, +const char *name, const struct file_operations *fops, +void *priv) +{ + return anon_inode_getfd_flags(pfd, pinode, pfile, name, fops, priv, 0); +} EXPORT_SYMBOL_GPL(anon_inode_getfd); /* --- fs/eventfd.c +++ fs/eventfd.c @@ -215,8 +215,9 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. 
*/ - error = anon_inode_getfd(, , , "[eventfd]", -_fops, ctx); + error = anon_inode_getfd_flags(, , , "[eventfd]", + _fops, ctx, + INDIRECT_PARAM(file_flags, flags)); if (!error) return fd; --- fs/signalfd.c +++ fs/signalfd.c @@ -224,8 +224,10 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(, , , "[signalfd]", -_fops, ctx); + error = anon_inode_getfd_flags(, , , + "[signalfd]", _fops, + ctx, INDIRECT_PARAM(file_flags, + flags)); if (error) goto err_fdalloc; } else { --- fs/timerfd.c +++ fs/timerfd.c @@ -182,8 +182,10 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags, * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(, , , "[timerfd]", -_fops, ctx); + error = anon_inode_getfd_flags(, , , "[timerfd]", + _fops, ctx, + INDIRECT_PARAM(file_flags, + flags)); if (error) goto err_tmrcancel; } else { --- include/asm-x86/ia32_unistd.h +++ include/asm-x86/ia32_unistd.h @@ -15,5 +15,8 @@ #define __NR_ia32_socketcall 102 #define __NR_ia32_sigreturn119 #define __NR_ia32_rt_sigreturn 173 +#define __NR_ia32_signalfd 321 +#define __NR_ia32_timerfd 322 +#define __NR_ia32_eventfd 323 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */ --- include/linux/anon_inodes.h +++ include/linux/anon_inodes.h @@ -8,6 +8,9 @@ #ifndef _LINUX_ANON_INODES_H #define _LINUX_ANON_INODES_H +int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file **pfile, + const char *name, const struct file_operations *fops, + void *priv, int flags); int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, const char *name, const struct file_operations *fops, void *priv); --- include/linux/indirect.h +++ include/linux/indirect.h @@ -35,5 +35,8 @@ union indirect_params { #if INDSYSCALL(socketpair) case
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 H. Peter Anvin wrote: > What bothers me about the sys_indirect approach is that it will get > increasingly expensive as time goes on, and in doing so it does a > user-space memory reference, which are extra expensive. The extra table > can be colocated with the main table (a structure, in effect) so they'll > share the same cache line. You assume that using sys_indirect will be the norm. It won't. We mustn't design system calls deliberately wrong so that they require the indirection. Besides, if the number of syscalls which has to be handled this way grows we can use something more efficient for large numbers of tests than a switch statement. It could even be a word next to the system call table. But I still don't see that the magic encoding is a valid solution, it doesn't address the limited parameter number. Plus, using sys_indirect could in future be used to transport entire parameters (like a sigset_t) along with other information, thereby saving individual copy operations. I think the sys_indirect approach is the way forward. I'll submit a last version of the patch in a bit. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQlRw2ijCOnn/RHQRApifAKDE1nZqRbm4cJxbhobBb7jCx1T00QCgiSa0 EXKjL2Gwu3atSLSD+Rb4yO4= =6ZGt -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Eric Dumazet wrote: > So when you recompile your old program (as you post it and as I commented on), > it will pass a >= 12 bytes data to kernel, with only first 4 bytes set to > O_CLOEXEC. > > Other bytes will contain junk If you don't initialize the entire structure and you use it all, of course you get undefined behavior. That's nothing new. The program I attached is not an example, it's a test for the functionality in this patch. Like with every kernel interface, you have to use it correctly. The good news is that user programs should never use this syscall directly (just like they don't for existing ones). I see no problem at all here. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQbBH2ijCOnn/RHQRAkc3AKCxVTWQ3BiQnCBwdbAsT122QWWaiwCggKXN Z5Sz9/NFojMHZXXTzIMoxX4= =slte -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Eric Dumazet wrote: >> union indirect_params i; >> i.file_flags.flags = O_CLOEXEC; > > This setup forbids future addons to file_flags > > In three years, when we want to add a new indirect feature to socket() > call, do we need a new indirect2() syscall ? No, it doesn't. The setup is indefinitely expandable. All you need to do, if it becomes necessary to have more than an int, is to define a little structure for the system call and then use it. The only requirement is that the code has to assume a value of zero is what is used today. That's the whole point. union indirect_params { struct { int flags; } file_flags; struct { int flags; int new_syscall_data1; sigset_t and_a_sigmask; } new_data; }; Old programs will set only the 'flags' member of 'new_data' while new ones can also set the new elements. New programs on old kernels will either have failing calls since the structure is too big or the call will not have all the desired effects. The latter can be tested for. > Or better, you could avoid using 'union indirect_params' in user code, and > only use the substructs for each function. There is no overhead introduced through the union. The only reason the union is there in the first place is to allocate sufficient data in task_struct to cover all cases. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQafd2ijCOnn/RHQRAlSFAJ99lahwCDZGRSlIHCov5bWowrpoiQCgwvW4 LDSEusNUpMfIE1ywBCRDBfc= =ChVT -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Eric Dumazet wrote: >> union indirect_params i; >> i.file_flags.flags = O_CLOEXEC; > > This setup forbids future addons to file_flags > > In three years, when we want to add a new indirect feature to socket() > call, do we need a new indirect2() syscall ? No, it doesn't. The setup is indefinitely expandable. All you need to do, if it becomes necessary to have more than an int, is to define a little structure for the system call and then use it. The only requirement is that the code has to assume a value of zero is what is used today. That's the whole point. union indirect_params { struct { int flags; } file_flags; struct { int flags; int new_syscall_data1; sigset_t and_a_sigmask; } new_data; }; Old programs will set only the 'flags' member of 'new_data' while new ones can also set the new elements. New programs on old kernels will either have failing calls since the structure is too big or the call will not have all the desired effects. The latter can be tested for. > Or better, you could avoid using 'union indirect_params' in user code, and > only use the substructs for each function. There is no overhead introduced through the union. The only reason the union is there in the first place is to allocate sufficient data in task_struct to cover all cases. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQafd2ijCOnn/RHQRAlSFAJ99lahwCDZGRSlIHCov5bWowrpoiQCgwvW4 LDSEusNUpMfIE1ywBCRDBfc= =ChVT -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Eric Dumazet wrote: > So when you recompile your old program (as you post it and as I commented on), > it will pass a >= 12 bytes data to kernel, with only first 4 bytes set to > O_CLOEXEC. > > Other bytes will contain junk If you don't initialize the entire structure and you use it all, of course you get undefined behavior. That's nothing new. The program I attached is not an example, it's a test for the functionality in this patch. Like with every kernel interface, you have to use it correctly. The good news is that user programs should never use this syscall directly (just like they don't for existing ones). I see no problem at all here. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQbBH2ijCOnn/RHQRAkc3AKCxVTWQ3BiQnCBwdbAsT122QWWaiwCggKXN Z5Sz9/NFojMHZXXTzIMoxX4= =slte -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 H. Peter Anvin wrote: > What bothers me about the sys_indirect approach is that it will get > increasingly expensive as time goes on, and in doing so it does a > user-space memory reference, which are extra expensive. The extra table > can be colocated with the main table (a structure, in effect) so they'll > share the same cache line. You assume that using sys_indirect will be the norm. It won't. We mustn't design system calls deliberately wrong so that they require the indirection. Besides, if the number of syscalls which has to be handled this way grows we can use something more efficient for large numbers of tests than a switch statement. It could even be a word next to the system call table. But I still don't see that the magic encoding is a valid solution, it doesn't address the limited parameter number. Plus, using sys_indirect could in future be used to transport entire parameters (like a sigset_t) along with other information, thereby saving individual copy operations. I think the sys_indirect approach is the way forward. I'll submit a last version of the patch in a bit. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHQlRw2ijCOnn/RHQRApifAKDE1nZqRbm4cJxbhobBb7jCx1T00QCgiSa0 EXKjL2Gwu3atSLSD+Rb4yO4= =6ZGt -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv4 4/6] Allow setting FD_CLOEXEC flag for new sockets
This is a first user of sys_indirect. Several of the socket-related system calls which produce a file handle now can be passed an additional parameter to set the FD_CLOEXEC flag. arch/x86/ia32/Makefile|1 + arch/x86/ia32/sys_ia32.c |4 include/asm-x86/ia32_unistd.h |1 + include/linux/indirect.h | 33 + kernel/Makefile |2 ++ kernel/indirect.c |4 net/socket.c | 21 + 7 files changed, 58 insertions(+), 8 deletions(-) --- arch/x86/ia32/Makefile +++ arch/x86/ia32/Makefile @@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) +CFLAGS_sys_ia32.o = -Wno-undef AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 --- kernel/Makefile +++ kernel/Makefile @@ -67,6 +67,8 @@ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer endif +CFLAGS_indirect.o = -Wno-undef + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. diff -u net/socket.c net/socket.c --- net/socket.c +++ net/socket.c @@ -344,11 +344,11 @@ * but we take care of internal coherence yet. 
*/ -static int sock_alloc_fd(struct file **filep) +static int sock_alloc_fd(struct file **filep, int flags) { int fd; - fd = get_unused_fd(); + fd = get_unused_fd_flags(flags); if (likely(fd = 0)) { struct file *file = get_empty_filp(); @@ -391,10 +391,10 @@ return 0; } -int sock_map_fd(struct socket *sock) +static int sock_map_fd_flags(struct socket *sock, int flags) { struct file *newfile; - int fd = sock_alloc_fd(newfile); + int fd = sock_alloc_fd(newfile, flags); if (likely(fd = 0)) { int err = sock_attach_fd(sock, newfile); @@ -409,6 +409,11 @@ return fd; } +int sock_map_fd(struct socket *sock) +{ + return sock_map_fd_flags(sock, 0); +} + static struct socket *sock_from_file(struct file *file, int *err) { if (file-f_op == socket_file_ops) @@ -1208,7 +1213,7 @@ if (retval 0) goto out; - retval = sock_map_fd(sock); + retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags)); if (retval 0) goto out_release; @@ -1249,13 +1254,13 @@ if (err 0) goto out_release_both; - fd1 = sock_alloc_fd(newfile1); + fd1 = sock_alloc_fd(newfile1, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd1 0)) { err = fd1; goto out_release_both; } - fd2 = sock_alloc_fd(newfile2); + fd2 = sock_alloc_fd(newfile2, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd2 0)) { err = fd2; put_filp(newfile1); @@ -1411,7 +1416,7 @@ */ __module_get(newsock-ops-owner); - newfd = sock_alloc_fd(newfile); + newfd = sock_alloc_fd(newfile, INDIRECT_PARAM(file_flags, flags)); if (unlikely(newfd 0)) { err = newfd; sock_release(newsock); diff -u arch/x86/ia32/sys_ia32.c arch/x86/ia32/sys_ia32.c --- arch/x86/ia32/sys_ia32.c +++ arch/x86/ia32/sys_ia32.c @@ -902,6 +902,10 @@ switch (INDIRECT_SYSCALL32(regs)) { +#define INDSYSCALL(name) __NR_ia32_##name +#include linux/indirect.h + break; + default: return -EINVAL; } diff -u include/linux/indirect.h include/linux/indirect.h --- include/linux/indirect.h +++ include/linux/indirect.h @@ -1,6 +1,39 @@ +#ifndef INDSYSCALL #ifndef _LINUX_INDIRECT_H #define 
_LINUX_INDIRECT_H #include asm/indirect.h + +union indirect_params { + struct { +int flags; + } file_flags; +}; + +#define INDIRECT_PARAM(set, name) current-indirect_params.set.name + +#endif +#else + +/* Here comes the list of system calls which can be called through + sys_indirect. When the list if support system calls is needed the + file including this header is supposed to define a macro INDSYSCALL + which adds a prefix fitting to the use. If the resulting macro is + defined we generate a line + case MACRO: + */ +#if INDSYSCALL(accept) + case INDSYSCALL(accept): +#endif +#if INDSYSCALL(socket) + case INDSYSCALL(socket): +#endif +#if INDSYSCALL(socketcall) + case INDSYSCALL(socketcall): +#endif +#if INDSYSCALL(socketpair) + case INDSYSCALL(socketpair): +#endif + #endif diff -u kernel/indirect.c kernel/indirect.c --- kernel/indirect.c +++ kernel/indirect.c @@ -19,6 +19,10 @@ switch (INDIRECT_SYSCALL (regs)) { +#define INDSYSCALL(name) __NR_##name +#include linux/indirect.h + break; + default: return
[PATCHv4 1/6] actual sys_indirect code
This is the actual architecture-independent part of the system call implementation. include/linux/indirect.h |6 ++ include/linux/sched.h|4 include/linux/syscalls.h |4 kernel/Makefile |2 +- kernel/indirect.c| 36 5 files changed, 51 insertions(+), 1 deletion(-) --- /dev/null +++ include/linux/indirect.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_INDIRECT_H +#define _LINUX_INDIRECT_H + +#include asm/indirect.h + +#endif --- include/linux/sched.h +++ include/linux/sched.h @@ -80,6 +80,7 @@ struct sched_param { #include linux/rcupdate.h #include linux/futex.h #include linux/rtmutex.h +#include linux/indirect.h #include linux/time.h #include linux/param.h @@ -1174,6 +1175,9 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + + /* Additional system call parameters. */ + union indirect_params indirect_params; }; /* --- include/linux/syscalls.h +++ include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct indirect_registers; #include linux/types.h #include linux/aio_abi.h @@ -611,6 +612,9 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags, const struct itimerspec __user *utmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); --- /dev/null +++ kernel/indirect.c @@ -0,0 +1,36 @@ +#include linux/sched.h +#include linux/uaccess.h +#include linux/unistd.h +#include asm/asm-offsets.h + + +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags) +{ + struct indirect_registers regs; + long result; + + if (unlikely(flags != 0)) + return -EINVAL; + + if (copy_from_user(regs, userregs, sizeof(regs))) + return 
-EFAULT; + + switch (INDIRECT_SYSCALL (regs)) + { + default: + return -EINVAL; + } + + if (paramslen sizeof(union indirect_params)) + return -EINVAL; + + result = -EFAULT; + if (!copy_from_user(current-indirect_params, userparams, paramslen)) + result = CALL_INDIRECT(regs); + + memset(current-indirect_params, '\0', paramslen); + + return result; +} --- kernel/Makefile +++ kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ - utsname.o notifier.o + utsname.o notifier.o indirect.o obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv4 3/6] UML support for sys_indirect
This part adds support for sys_indirect for UML. indirect.h |6 ++ 1 file changed, 6 insertions(+) --- /dev/null +++ include/asm-um/indirect.h @@ -0,0 +1,6 @@ +#ifndef __UM_INDIRECT_H +#define __UM_INDIRECT_H + +#include asm/arch/indirect.h + +#endif - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv4 2/6] x86&x86-64 support for sys_indirect
This part adds support for sys_indirect on x86 and x86-64. arch/x86/ia32/ia32entry.S |2 ++ arch/x86/ia32/sys_ia32.c | 31 +++ arch/x86/kernel/syscall_table_32.S |1 + include/asm-x86/indirect.h |5 + include/asm-x86/indirect_32.h | 23 +++ include/asm-x86/indirect_64.h | 34 ++ include/asm-x86/unistd_32.h|3 ++- include/asm-x86/unistd_64.h|2 ++ 8 files changed, 100 insertions(+), 1 deletion(-) --- arch/x86/ia32/ia32entry.S +++ arch/x86/ia32/ia32entry.S @@ -400,6 +400,7 @@ END(ia32_ptregs_common) .section .rodata,a .align 8 + .globl ia32_sys_call_table ia32_sys_call_table: .quad sys_restart_syscall .quad sys_exit @@ -726,4 +727,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd .quad sys_eventfd .quad sys32_fallocate + .quad sys32_indirect/* 325 */ ia32_syscall_end: --- arch/x86/ia32/sys_ia32.c +++ arch/x86/ia32/sys_ia32.c @@ -887,3 +887,37 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, return sys_fallocate(fd, mode, ((u64)offset_hi 32) | offset_lo, ((u64)len_hi 32) | len_lo); } + +asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs, + void __user *userparams, size_t paramslen, + int flags) +{ + extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32); + + struct indirect_registers32 regs; + long result; + + if (flags != 0) + return -EINVAL; + + if (copy_from_user(regs, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL32(regs)) + { + default: + return -EINVAL; + } + + if (paramslen sizeof(union indirect_params)) + return -EINVAL; + result = -EFAULT; + if (!copy_from_user(current-indirect_params, userparams, paramslen)) + result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx, + regs.edx, regs.esi, + regs.edi, regs.ebp); + + memset(current-indirect_params, '\0', paramslen); + + return result; +} --- arch/x86/kernel/syscall_table_32.S +++ arch/x86/kernel/syscall_table_32.S @@ -324,3 +324,4 @@ ENTRY(sys_call_table) .long sys_timerfd .long sys_eventfd .long sys_fallocate + .long sys_indirect 
/* 325 */ --- /dev/null +++ include/asm-x86/indirect_32.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_INDIRECT_32_H +#define _ASM_X86_INDIRECT_32_H + +struct indirect_registers { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)-eax + +#define CALL_INDIRECT(regs) \ + ({ extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, __u32); \ + sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)-ebx, (regs)-ecx, \ +(regs)-edx, (regs)-esi, \ +(regs)-edi, (regs)-ebp); \ + }) + +#endif --- /dev/null +++ include/asm-x86/indirect_64.h @@ -0,0 +1,34 @@ +#ifndef _ASM_X86_INDIRECT_64_H +#define _ASM_X86_INDIRECT_64_H + +struct indirect_registers { + __u64 rax; + __u64 rdi; + __u64 rsi; + __u64 rdx; + __u64 r10; + __u64 r8; + __u64 r9; +}; + +struct indirect_registers32 { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)-rax +#define INDIRECT_SYSCALL32(regs) (regs)-eax + +#define CALL_INDIRECT(regs) \ + ({ extern long (*sys_call_table[]) (__u64, __u64, __u64, __u64, __u64, __u64); \ + sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)-rdi, (regs)-rsi, \ +(regs)-rdx, (regs)-r10, \ +(regs)-r8, (regs)-r9); \ + }) + +#endif --- /dev/null +++ include/asm-x86/indirect.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include indirect_32.h +#else +# include indirect_64.h +#endif --- include/asm-x86/unistd_32.h +++ include/asm-x86/unistd_32.h @@ -330,10 +330,11 @@ #define __NR_timerfd 322 #define __NR_eventfd 323 #define __NR_fallocate 324 +#define __NR_indirect 325 #ifdef __KERNEL__ -#define NR_syscalls 325 +#define NR_syscalls 326 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR --- include/asm-x86/unistd_64.h +++ include/asm-x86/unistd_64.h @@ -635,6 +635,8 @@ __SYSCALL(__NR_timerfd, sys_timerfd) __SYSCALL(__NR_eventfd, sys_eventfd) #define __NR_fallocate
[PATCHv4 0/6] sys_indirect system call
The following patches provide an alternative implementation of the sys_indirect system call which has been discussed a few times. This new system call allows us to extend existing system call interfaces without adding more system calls. Davide's previous implementation is IMO far more complex than warranted. This code here is trivial, as you can see. I've discussed this approach with Linus last week and for a brief moment we actually agreed on something. We pass an additional block of data to the kernel, it is copied into the task_struct, and then it is up to the function implementing the system call to interpret the data. Each system call, which is meant to be extended this way, has to be white-listed in sys_indirect. The alternative is to filter out those system calls which absolutely cannot be handled using sys_indirect (like clone, execve) since they require the stack layout of an ordinary system call. This is more dangerous since it is too easy to miss a call. The code for x86 and x86-64 gets by without a single line of assembly code. This is likely to be true for most/all the other archs as well. There is architecture-dependent code, though. For x86 and x86-64 I've also fixed up UML (although only x86-64 is tested, that's my setup). The last three patches show the first application of the functionality. They also show a complication: we need the test for valid sub-syscalls in the main implementation and in the compatibility code. And more: the actual sources and generated binary for the test are very different (the numbers differ). Duplicating the information is a big problem, though. I've used some macro tricks to avoid this. All the information about the flags and the system calls using them is concentrated in one header. This should make maintenance bearable. This patch to use sys_indirect is just the beginning. More will follow, but I want to see how these patches are received before I spend more time on it. 
This code is enough to test the implementation with the following test program. Adjust it for architectures other than x86 and x86-64. #include fcntl.h #include signal.h #include stdint.h #include stdio.h #include unistd.h #include netinet/in.h #include sys/socket.h #include sys/syscall.h typedef uint32_t __u32; typedef uint64_t __u64; union indirect_params { struct { int flags; } file_flags; }; #ifdef __x86_64__ # define __NR_indirect 286 struct indirect_registers { __u64 rax; __u64 rdi; __u64 rsi; __u64 rdx; __u64 r10; __u64 r8; __u64 r9; }; #elif defined __i386__ # define __NR_indirect 325 struct indirect_registers { __u32 eax; __u32 ebx; __u32 ecx; __u32 edx; __u32 esi; __u32 edi; __u32 ebp; }; #else # error need to define __NR_indirect and struct indirect_params #endif #define FILL_IN(var, values...) \ var = (struct indirect_registers) { values } int main (void) { int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP); int s1 = fcntl (fd, F_GETFD); int t1 = fcntl (fd, F_GETFL); printf (old: FD_CLOEXEC %s set, NONBLOCK %s set\n, s1 == 0 ? not : is, (t1 O_NONBLOCK) ? is : not); close (fd); union indirect_params i; i.file_flags.flags = O_CLOEXEC|O_NONBLOCK; struct indirect_registers r; #ifdef __NR_socketcall # define SOCKOP_socket 1 long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP }; FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args); #else FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP); #endif fd = syscall (__NR_indirect, r, i, sizeof (i)); int s2 = fcntl (fd, F_GETFD); int t2 = fcntl (fd, F_GETFL); printf (new: FD_CLOEXEC %s set, NONBLOCK %s set\n, s2 == 0 ? not : is, (t2 O_NONBLOCK) ? is : not); close (fd); i.file_flags.flags = O_CLOEXEC; sigset_t ss; sigemptyset(ss); FILL_IN(r, __NR_signalfd, -1, (long) ss, 8); fd = syscall (__NR_indirect, r, i, sizeof (i)); int s3 = fcntl (fd, F_GETFD); printf (signalfd: FD_CLOEXEC %s set\n, s3 == 0 ? 
not : is); close (fd); FILL_IN(r, __NR_eventfd, 8); fd = syscall (__NR_indirect, r, i, sizeof (i)); int s4 = fcntl (fd, F_GETFD); printf (eventfd: FD_CLOEXEC %s set\n, s4 == 0 ? not : is); close (fd); return s1 != 0 || s2 == 0 || t1 != 0 || t2 == 0 || s3 == 0 || s4 == 0; } Signed-off-by: Ulrich Drepper [EMAIL PROTECTED] arch/x86/ia32/Makefile |1 arch/x86/ia32/ia32entry.S |2 + arch/x86/ia32/sys_ia32.c | 37 +- arch/x86/kernel/syscall_table_32.S |1 include/asm-um/indirect.h |6 + include/asm-x86/ia32_unistd.h |1 include/asm-x86/indirect.h |5 include/asm-x86/indirect_32.h | 23 + include/asm-x86/indirect_64.h | 34 +++ include/asm-x86/unistd_32.h|3
[PATCHv4 6/6] FD_CLOEXEC support for eventfd, signalfd, timerfd
This patch adds support to set the FD_CLOEXEC flag for the file descriptors returned by eventfd, signalfd, timerfd. fs/anon_inodes.c | 15 +++ fs/eventfd.c |5 +++-- fs/signalfd.c |6 -- fs/timerfd.c |6 -- include/asm-x86/ia32_unistd.h |3 +++ include/linux/anon_inodes.h |3 +++ include/linux/indirect.h |3 +++ 7 files changed, 31 insertions(+), 10 deletions(-) --- fs/anon_inodes.c +++ fs/anon_inodes.c @@ -70,9 +70,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. */ -int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, -const char *name, const struct file_operations *fops, -void *priv) +int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file **pfile, + const char *name, const struct file_operations *fops, + void *priv, int flags) { struct qstr this; struct dentry *dentry; @@ -85,7 +85,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, if (!file) return -ENFILE; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error 0) goto err_put_filp; fd = error; @@ -138,6 +138,13 @@ err_put_filp: put_filp(file); return error; } + +int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, +const char *name, const struct file_operations *fops, +void *priv) +{ + return anon_inode_getfd_flags(pfd, pinode, pfile, name, fops, priv, 0); +} EXPORT_SYMBOL_GPL(anon_inode_getfd); /* --- fs/eventfd.c +++ fs/eventfd.c @@ -215,8 +215,9 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. 
*/ - error = anon_inode_getfd(fd, inode, file, [eventfd], -eventfd_fops, ctx); + error = anon_inode_getfd_flags(fd, inode, file, [eventfd], + eventfd_fops, ctx, + INDIRECT_PARAM(file_flags, flags)); if (!error) return fd; --- fs/signalfd.c +++ fs/signalfd.c @@ -224,8 +224,10 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(ufd, inode, file, [signalfd], -signalfd_fops, ctx); + error = anon_inode_getfd_flags(ufd, inode, file, + [signalfd], signalfd_fops, + ctx, INDIRECT_PARAM(file_flags, + flags)); if (error) goto err_fdalloc; } else { --- fs/timerfd.c +++ fs/timerfd.c @@ -182,8 +182,10 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags, * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(ufd, inode, file, [timerfd], -timerfd_fops, ctx); + error = anon_inode_getfd_flags(ufd, inode, file, [timerfd], + timerfd_fops, ctx, + INDIRECT_PARAM(file_flags, + flags)); if (error) goto err_tmrcancel; } else { --- include/asm-x86/ia32_unistd.h +++ include/asm-x86/ia32_unistd.h @@ -15,5 +15,8 @@ #define __NR_ia32_socketcall 102 #define __NR_ia32_sigreturn119 #define __NR_ia32_rt_sigreturn 173 +#define __NR_ia32_signalfd 321 +#define __NR_ia32_timerfd 322 +#define __NR_ia32_eventfd 323 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */ --- include/linux/anon_inodes.h +++ include/linux/anon_inodes.h @@ -8,6 +8,9 @@ #ifndef _LINUX_ANON_INODES_H #define _LINUX_ANON_INODES_H +int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file **pfile, + const char *name, const struct file_operations *fops, + void *priv, int flags); int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, const char *name, const struct file_operations *fops, void *priv); --- include/linux/indirect.h +++
[PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets
This patch adds support for setting the O_NONBLOCK flag of the file descriptors returned by socket, socketpair, and accept. socket.c | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) --- net/socket.c +++ net/socket.c @@ -362,7 +362,7 @@ static int sock_alloc_fd(struct file **filep, int flags) return fd; } -static int sock_attach_fd(struct socket *sock, struct file *file) +static int sock_attach_fd(struct socket *sock, struct file *file, int flags) { struct dentry *dentry; struct qstr name = { .name = }; @@ -384,7 +384,7 @@ static int sock_attach_fd(struct socket *sock, struct file *file) init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE, socket_file_ops); SOCK_INODE(sock)-i_fop = socket_file_ops; - file-f_flags = O_RDWR; + file-f_flags = O_RDWR | (flags O_NONBLOCK); file-f_pos = 0; file-private_data = sock; @@ -397,7 +397,7 @@ static int sock_map_fd_flags(struct socket *sock, int flags) int fd = sock_alloc_fd(newfile, flags); if (likely(fd = 0)) { - int err = sock_attach_fd(sock, newfile); + int err = sock_attach_fd(sock, newfile, flags); if (unlikely(err 0)) { put_filp(newfile); @@ -1268,12 +1268,14 @@ asmlinkage long sys_socketpair(int family, int type, int protocol, goto out_release_both; } - err = sock_attach_fd(sock1, newfile1); + err = sock_attach_fd(sock1, newfile1, +INDIRECT_PARAM(file_flags, flags)); if (unlikely(err 0)) { goto out_fd2; } - err = sock_attach_fd(sock2, newfile2); + err = sock_attach_fd(sock2, newfile2, +INDIRECT_PARAM(file_flags, flags)); if (unlikely(err 0)) { fput(newfile1); goto out_fd1; @@ -1423,7 +1425,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, goto out_put; } - err = sock_attach_fd(newsock, newfile); + err = sock_attach_fd(newsock, newfile, +INDIRECT_PARAM(file_flags, flags)); if (err 0) goto out_fd_simple; - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 H. Peter Anvin wrote: > It seems to me that we could accomplish the same thing by passing the > number of parameters in the upper bits of the system call number > register (%eax in the case of x86.) This isn't really a generic solution. The number of parameters is limited to six. There are syscalls with six parameters already. There are many more with five which could only handle one more parameter. Also, is it really simpler? You'd need to have another table which contains the default number of parameters a system call takes so that you can fill in the default value of zero. This extra memory access has to be performed for every system call. I think it is unlikely that this approach is faster. To the contrary, I'd guess. I don't have much invested into this but it seems the sys_indirect approach is so much simpler. Overhead is only paid if you really need it which is rarely the case. Plus, you might have heard Linus and Zack talk about syslets again. Starting syslets can be done using the same interface, I guess. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHQAez2ijCOnn/RHQRAjoHAJ4/Qq4ygaZ4uq6uCIVNq4hfN1m2pACgpJFi Z/vBsGFpUc/EUz+VW66jEIY= =B19x -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv3 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 H. Peter Anvin wrote: It seems to me that we could accomplish the same thing by passing the number of parameters in the upper bits of the system call number register (%eax in the case of x86.) This isn't really a generic solution. The number of parameters is limited to six. There are syscalls with six parameters already. There are many more with five which could only handle one more parameter. Also, is it really simpler? You'd need to have another table which contains the default number of parameters a system call takes so that you can fill in the default value of zero. This extra memory access has to be performed for every system call. I think it is unlikely that this approach is faster. To the contrary, I'd guess. I don't have much invested into this but it seems the sys_indirect approach is so much simpler. Overhead is only paid if you really need it which is rarely the case. Plus, you might have heard Linus and Zack talk about syslets again. Starting syslets can be done using the same interface, I guess. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHQAez2ijCOnn/RHQRAjoHAJ4/Qq4ygaZ4uq6uCIVNq4hfN1m2pACgpJFi Z/vBsGFpUc/EUz+VW66jEIY= =B19x -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Linus Torvalds wrote: > Uli doesn't care that much about async syscalls, but I think that from a > kernel standpoint, we'd want to use this same indirect call for async > scheduling, Note that I added a flags parameter to sys_indirect in the v3 patch. This should allow you to add additional functionality like syslets later. Currently a zero value is enforced. In future nonzero values could also imply that the function takes additional parameters. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHPrDk2ijCOnn/RHQRAks1AJ43zF42Vy2ru2D8X3W13YlzYpazUQCfci37 wTKr35RIViiwkQWNMMCeMdk= =Gmld -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/4] sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Linus Torvalds wrote: Uli doesn't care that much about async syscalls, but I think that from a kernel standpoint, we'd want to use this same indirect call for async scheduling, Note that I added a flags parameter to sys_indirect in the v3 patch. This should allow you to add additional functionality like syslets later. Currently a zero value is enforced. In future nonzero values could also imply that the function takes additional parameters. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHPrDk2ijCOnn/RHQRAks1AJ43zF42Vy2ru2D8X3W13YlzYpazUQCfci37 wTKr35RIViiwkQWNMMCeMdk= =Gmld -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv3 0/4] sys_indirect system call
The following patches provide an alternative implementation of the sys_indirect system call which has been discussed a few times. This new system call allows us to extend existing system call interfaces without adding more system calls. Davide's previous implementation is IMO far more complex than warranted. This code here is trivial, as you can see. I've discussed this approach with Linus last week and for a brief moment we actually agreed on something. We pass an additional block of data to the kernel, it is copied into the task_struct, and then it is up to the function implementing the system call to interpret the data. Each system call, which is meant to be extended this way, has to be white-listed in sys_indirect. The alternative is to filter out those system calls which absolutely cannot be handled using sys_indirect (like clone, execve) since they require the stack layout of an ordinary system call. This is more dangerous since it is too easy to miss a call. The code for x86 and x86-64 gets by without a single line of assembly code. This is likely to be true for most/all the other archs as well. There is architecture-dependent code, though. For x86 and x86-64 I've also fixed up UML (although only x86-64 is tested, that's my setup). The last patch shows the first application of the functionality. It also shows a complication: we need the test for valid sub-syscalls in the main implementation and in the compatibility code. And more: the actual sources and generated binary are very different (the numbers differ). Duplicating the information is a big problem, though. I've used some macro tricks to avoid this. All the information about the flags and the system calls using them is concentrated in one header. This should make maintenance bearable. This patch to use sys_indirect is just the beginning. More will follow, but I want to see how these patches are received before I spend more time on it. This code is enough to test the implementation with the following test program. 
Adjust it for architectures other than x86 and x86-64. #include #include #include #include #include #include #include typedef uint32_t __u32; typedef uint64_t __u64; union indirect_params { struct { int flags; } file_flags; }; #ifdef __x86_64__ # define __NR_indirect 286 struct indirect_registers { __u64 rax; __u64 rdi; __u64 rsi; __u64 rdx; __u64 r10; __u64 r8; __u64 r9; }; #elif defined __i386__ # define __NR_indirect 325 struct indirect_registers { __u32 eax; __u32 ebx; __u32 ecx; __u32 edx; __u32 esi; __u32 edi; __u32 ebp; }; #else # error "need to define __NR_indirect and struct indirect_params" #endif #define FILL_IN(var, values...) \ var = (struct indirect_registers) { values } int main (void) { int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP); int s1 = fcntl (fd, F_GETFD); printf ("old: FD_CLOEXEC %s set\n", s1 == 0 ? "not" : "is"); close (fd); union indirect_params i; i.file_flags.flags = O_CLOEXEC; struct indirect_registers r; #ifdef __NR_socketcall # define SOCKOP_socket 1 long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP }; FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args); #else FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP); #endif fd = syscall (__NR_indirect, , , sizeof (i)); int s2 = fcntl (fd, F_GETFD); printf ("new: FD_CLOEXEC %s set\n", s2 == 0 ? 
"not" : "is"); close (fd); return s1 != 0 || s2 == 0; } ~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper <[EMAIL PROTECTED]> arch/x86/ia32/Makefile |1 arch/x86/ia32/ia32entry.S |2 + arch/x86/ia32/sys_ia32.c | 37 +- arch/x86/kernel/syscall_table_32.S |1 include/asm-um/indirect.h |6 + include/asm-x86/ia32_unistd.h |1 include/asm-x86/indirect.h |5 include/asm-x86/indirect_32.h | 23 + include/asm-x86/indirect_64.h | 34 +++ include/asm-x86/unistd_32.h|3 +- include/asm-x86/unistd_64.h|2 + include/linux/indirect.h | 39 include/linux/sched.h |4 +++ include/linux/syscalls.h |6 - kernel/Makefile|4 ++- kernel/indirect.c | 40 + net/socket.c | 29 +++--- 17 files changed, 221 insertions(+), 16 deletions(-) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv3 3/4] UML support for sys_indirect
This part adds support for sys_indirect for UML. indirect.h |6 ++ 1 file changed, 6 insertions(+) --- /dev/null +++ include/asm-um/indirect.h @@ -0,0 +1,6 @@ +#ifndef __UM_INDIRECT_H +#define __UM_INDIRECT_H + +#include "asm/arch/indirect.h" + +#endif - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv3 2/4] x86 support for sys_indirect
This part adds support for sys_indirect on x86 and x86-64. arch/x86/ia32/ia32entry.S |2 ++ arch/x86/ia32/sys_ia32.c | 31 +++ arch/x86/kernel/syscall_table_32.S |1 + include/asm-x86/indirect.h |5 + include/asm-x86/indirect_32.h | 23 +++ include/asm-x86/indirect_64.h | 34 ++ include/asm-x86/unistd_32.h|3 ++- include/asm-x86/unistd_64.h|2 ++ 8 files changed, 100 insertions(+), 1 deletion(-) --- arch/x86/ia32/ia32entry.S +++ arch/x86/ia32/ia32entry.S @@ -400,6 +400,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 + .globl ia32_sys_call_table ia32_sys_call_table: .quad sys_restart_syscall .quad sys_exit @@ -726,4 +727,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd .quad sys_eventfd .quad sys32_fallocate + .quad sys32_indirect/* 325 */ ia32_syscall_end: --- arch/x86/ia32/sys_ia32.c +++ arch/x86/ia32/sys_ia32.c @@ -887,3 +887,34 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, ((u64)len_hi << 32) | len_lo); } + +asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs, + void __user *userparams, size_t paramslen, + int flags) +{ + extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32); + + struct indirect_registers32 regs; + long result; + + if (copy_from_user(, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL32()) + { + default: + return -EINVAL; + } + + if (paramslen > sizeof(union indirect_params)) + return -EINVAL; + result = -EFAULT; + if (!copy_from_user(>indirect_params, userparams, paramslen)) + result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx, + regs.edx, regs.esi, + regs.edi, regs.ebp); + + memset(>indirect_params, '\0', paramslen); + + return result; +} --- arch/x86/kernel/syscall_table_32.S +++ arch/x86/kernel/syscall_table_32.S @@ -324,3 +324,4 @@ ENTRY(sys_call_table) .long sys_timerfd .long sys_eventfd .long sys_fallocate + .long sys_indirect /* 325 */ --- /dev/null +++ 
include/asm-x86/indirect_32.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_INDIRECT_32_H +#define _ASM_X86_INDIRECT_32_H + +struct indirect_registers { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)->eax + +#define CALL_INDIRECT(regs) \ + ({ extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, __u32); \ + sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->ebx, (regs)->ecx, \ +(regs)->edx, (regs)->esi, \ +(regs)->edi, (regs)->ebp); \ + }) + +#endif --- /dev/null +++ include/asm-x86/indirect_64.h @@ -0,0 +1,34 @@ +#ifndef _ASM_X86_INDIRECT_64_H +#define _ASM_X86_INDIRECT_64_H + +struct indirect_registers { + __u64 rax; + __u64 rdi; + __u64 rsi; + __u64 rdx; + __u64 r10; + __u64 r8; + __u64 r9; +}; + +struct indirect_registers32 { + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 esi; + __u32 edi; + __u32 ebp; +}; + +#define INDIRECT_SYSCALL(regs) (regs)->rax +#define INDIRECT_SYSCALL32(regs) (regs)->eax + +#define CALL_INDIRECT(regs) \ + ({ extern long (*sys_call_table[]) (__u64, __u64, __u64, __u64, __u64, __u64); \ + sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->rdi, (regs)->rsi, \ +(regs)->rdx, (regs)->r10, \ +(regs)->r8, (regs)->r9); \ + }) + +#endif --- /dev/null +++ include/asm-x86/indirect.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "indirect_32.h" +#else +# include "indirect_64.h" +#endif --- include/asm-x86/unistd_32.h +++ include/asm-x86/unistd_32.h @@ -330,10 +330,11 @@ #define __NR_timerfd 322 #define __NR_eventfd 323 #define __NR_fallocate 324 +#define __NR_indirect 325 #ifdef __KERNEL__ -#define NR_syscalls 325 +#define NR_syscalls 326 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR --- include/asm-x86/unistd_64.h +++ include/asm-x86/unistd_64.h @@ -635,6 +635,8 @@ __SYSCALL(__NR_timerfd, sys_timerfd) __SYSCALL(__NR_eventfd, sys_eventfd) #define __NR_fallocate 285 __SYSCALL(__NR_fallocate, sys_fallocate)
[PATCHv3 4/4] first use of sys_indirect system call
This is a first user of sys_indirect. Several of the socket-related system calls which produce a file handle now can be passed an additional parameter to set the FD_CLOEXEC flag. arch/x86/ia32/Makefile|1 + arch/x86/ia32/sys_ia32.c |4 include/asm-x86/ia32_unistd.h |1 + include/linux/indirect.h | 33 + kernel/Makefile |2 ++ kernel/indirect.c |4 net/socket.c | 21 + 7 files changed, 58 insertions(+), 8 deletions(-) --- arch/x86/ia32/Makefile +++ arch/x86/ia32/Makefile @@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) +CFLAGS_sys_ia32.o = -Wno-undef AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 --- kernel/Makefile +++ kernel/Makefile @@ -67,6 +67,8 @@ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer endif +CFLAGS_indirect.o = -Wno-undef + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. diff -u net/socket.c net/socket.c --- net/socket.c +++ net/socket.c @@ -344,11 +344,11 @@ * but we take care of internal coherence yet. 
*/ -static int sock_alloc_fd(struct file **filep) +static int sock_alloc_fd(struct file **filep, int flags) { int fd; - fd = get_unused_fd(); + fd = get_unused_fd_flags(flags); if (likely(fd >= 0)) { struct file *file = get_empty_filp(); @@ -391,10 +391,10 @@ return 0; } -int sock_map_fd(struct socket *sock) +static int sock_map_fd_flags(struct socket *sock, int flags) { struct file *newfile; - int fd = sock_alloc_fd(); + int fd = sock_alloc_fd(, flags); if (likely(fd >= 0)) { int err = sock_attach_fd(sock, newfile); @@ -409,6 +409,11 @@ return fd; } +int sock_map_fd(struct socket *sock) +{ + return sock_map_fd_flags(sock, 0); +} + static struct socket *sock_from_file(struct file *file, int *err) { if (file->f_op == _file_ops) @@ -1208,7 +1213,7 @@ if (retval < 0) goto out; - retval = sock_map_fd(sock); + retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags)); if (retval < 0) goto out_release; @@ -1249,13 +1254,13 @@ if (err < 0) goto out_release_both; - fd1 = sock_alloc_fd(); + fd1 = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd1 < 0)) { err = fd1; goto out_release_both; } - fd2 = sock_alloc_fd(); + fd2 = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd2 < 0)) { err = fd2; put_filp(newfile1); @@ -1411,7 +1416,7 @@ */ __module_get(newsock->ops->owner); - newfd = sock_alloc_fd(); + newfd = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags)); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); diff -u arch/x86/ia32/sys_ia32.c arch/x86/ia32/sys_ia32.c --- arch/x86/ia32/sys_ia32.c +++ arch/x86/ia32/sys_ia32.c @@ -902,6 +902,10 @@ switch (INDIRECT_SYSCALL32()) { +#define INDSYSCALL(name) __NR_ia32_##name +#include + break; + default: return -EINVAL; } diff -u include/linux/indirect.h include/linux/indirect.h --- include/linux/indirect.h +++ include/linux/indirect.h @@ -1,6 +1,39 @@ +#ifndef INDSYSCALL #ifndef _LINUX_INDIRECT_H #define _LINUX_INDIRECT_H #include + +union indirect_params { + struct { +int 
flags; + } file_flags; +}; + +#define INDIRECT_PARAM(set, name) current->indirect_params.set.name + +#endif +#else + +/* Here comes the list of system calls which can be called through + sys_indirect. When the list if support system calls is needed the + file including this header is supposed to define a macro "INDSYSCALL" + which adds a prefix fitting to the use. If the resulting macro is + defined we generate a line + case MACRO: + */ +#if INDSYSCALL(accept) + case INDSYSCALL(accept): +#endif +#if INDSYSCALL(socket) + case INDSYSCALL(socket): +#endif +#if INDSYSCALL(socketcall) + case INDSYSCALL(socketcall): +#endif +#if INDSYSCALL(socketpair) + case INDSYSCALL(socketpair): +#endif + #endif diff -u kernel/indirect.c kernel/indirect.c --- kernel/indirect.c +++ kernel/indirect.c @@ -19,6 +19,10 @@ switch (INDIRECT_SYSCALL ()) { +#define INDSYSCALL(name) __NR_##name +#include + break; + default: return -EINVAL; } --- include/asm-x86/ia32_unistd.h +++ include/asm-x86/ia32_unistd.h @@ -12,6 +12,7 @@
[PATCHv3 1/4] actual sys_indirect code
This is the actual architecture-independent part of the system call implementation. include/linux/indirect.h |6 ++ include/linux/sched.h|4 include/linux/syscalls.h |4 kernel/Makefile |2 +- kernel/indirect.c| 36 5 files changed, 51 insertions(+), 1 deletion(-) --- /dev/null +++ include/linux/indirect.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_INDIRECT_H +#define _LINUX_INDIRECT_H + +#include + +#endif --- include/linux/sched.h +++ include/linux/sched.h @@ -80,6 +80,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -1174,6 +1175,9 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + + /* Additional system call parameters. */ + union indirect_params indirect_params; }; /* --- include/linux/syscalls.h +++ include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct indirect_registers; #include #include @@ -611,6 +612,9 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags, const struct itimerspec __user *utmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); --- /dev/null +++ kernel/indirect.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + + +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags) +{ + struct indirect_registers regs; + long result; + + if (unlikely(flags != 0)) + return -EINVAL; + + if (copy_from_user(, userregs, sizeof(regs))) + return -EFAULT; + + switch (INDIRECT_SYSCALL ()) + { + default: + return -EINVAL; + } + + if (paramslen > sizeof(union indirect_params)) + return -EINVAL; + + result = -EFAULT; + if (!copy_from_user(>indirect_params, 
userparams, paramslen)) + result = CALL_INDIRECT(); + + memset(>indirect_params, '\0', paramslen); + + return result; +} --- kernel/Makefile +++ kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ - utsname.o notifier.o + utsname.o notifier.o indirect.o obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv2 4/4] first use of sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 dean gaudet wrote: > i'm not suggesting the library set the global flag. i'm suggesting that > me as an app writer will do so. > > it seems like both methods are useful. No, the global flag is hardly ever useful. You almost never know the details of all the libraries you link to well enough to determine that they don't need FD_CLOEXEC disabled. Even more problematic, you cannot know whether they will need it in future. For applications the solution is simple: wrap the appropriate calls. Apache has all these apr_ wrappers. Put them to some good use after all. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHPeim2ijCOnn/RHQRAu8xAJsF/0Ir1PWMbHkVRaI5vKOGFS4tMACfVEs9 pMYAiCAU1E2B+7QR0EP+/F8= =btt9 -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv2 4/4] first use of sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 dean gaudet wrote: > honestly i think there should be a per-task flag which indicates whether > fds are by default F_CLOEXEC or not. my reason: third party libraries. Only somebody who thinks exclusively about applications as opposed to runtimes/libraries can say something like that. Library writers don't have the luxury of being able to modify any global state. This has all been discussed here before. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHPd8b2ijCOnn/RHQRAuPPAKCm5mcOl8dycDenxi7BNFdrf2IfWgCgmaXQ Fj7V13HU1vX6fM9bRumxRpk= =UIi1 -END PGP SIGNATURE- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv2 4/4] first use of sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 dean gaudet wrote: honestly i think there should be a per-task flag which indicates whether fds are by default F_CLOEXEC or not. my reason: third party libraries. Only somebody who thinks exclusively about applications as opposed to runtimes/libraries can say something like that. Library writers don't have the luxury of being able to modify any global state. This has all been discussed here before. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) iD8DBQFHPd8b2ijCOnn/RHQRAuPPAKCm5mcOl8dycDenxi7BNFdrf2IfWgCgmaXQ Fj7V13HU1vX6fM9bRumxRpk= =UIi1 -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv2 4/4] first use of sys_indirect system call
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 dean gaudet wrote: i'm not suggesting the library set the global flag. i'm suggesting that me as an app writer will do so. it seems like both methods are useful. No, the global flag is hardly ever useful. You almost never know the details of all the libraries you link to well enough to determine that they don't need FD_CLOEXEC disabled. Even more problematic, you cannot know whether they will need it in future. For applications the solution is simple: wrap the appropriate calls. Apache has all these apr_ wrappers. Put them to some good use after all. - -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.7 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org iD8DBQFHPeim2ijCOnn/RHQRAu8xAJsF/0Ir1PWMbHkVRaI5vKOGFS4tMACfVEs9 pMYAiCAU1E2B+7QR0EP+/F8= =btt9 -END PGP SIGNATURE- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv3 1/4] actual sys_indirect code
This is the actual architecture-independent part of the system call implementation. include/linux/indirect.h |6 ++ include/linux/sched.h|4 include/linux/syscalls.h |4 kernel/Makefile |2 +- kernel/indirect.c| 36 5 files changed, 51 insertions(+), 1 deletion(-) --- /dev/null +++ include/linux/indirect.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_INDIRECT_H +#define _LINUX_INDIRECT_H + +#include <asm/indirect.h> + +#endif --- include/linux/sched.h +++ include/linux/sched.h @@ -80,6 +80,7 @@ struct sched_param { #include <linux/rcupdate.h> #include <linux/futex.h> #include <linux/rtmutex.h> +#include <linux/indirect.h> #include <linux/time.h> #include <linux/param.h> @@ -1174,6 +1175,9 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + + /* Additional system call parameters. */ + union indirect_params indirect_params; }; /* --- include/linux/syscalls.h +++ include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct indirect_registers; #include <linux/types.h> #include <linux/aio_abi.h> @@ -611,6 +612,9 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags, const struct itimerspec __user *utmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); --- /dev/null +++ kernel/indirect.c @@ -0,0 +1,36 @@ +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/unistd.h> +#include <asm/asm-offsets.h> + + +asmlinkage long sys_indirect(struct indirect_registers __user *userregs, +void __user *userparams, size_t paramslen, +int flags) +{ + struct indirect_registers regs; + long result; + + if (unlikely(flags != 0)) + return -EINVAL; + + if (copy_from_user(&regs, userregs, sizeof(regs))) + return 
-EFAULT; + + switch (INDIRECT_SYSCALL (regs)) + { + default: + return -EINVAL; + } + + if (paramslen > sizeof(union indirect_params)) + return -EINVAL; + + result = -EFAULT; + if (!copy_from_user(&current->indirect_params, userparams, paramslen)) + result = CALL_INDIRECT(regs); + + memset(&current->indirect_params, '\0', paramslen); + + return result; +} --- kernel/Makefile +++ kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ - utsname.o notifier.o + utsname.o notifier.o indirect.o obj-$(CONFIG_SYSCTL) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv3 4/4] first use of sys_indirect system call
This is a first user of sys_indirect. Several of the socket-related system calls which produce a file handle now can be passed an additional parameter to set the FD_CLOEXEC flag. arch/x86/ia32/Makefile|1 + arch/x86/ia32/sys_ia32.c |4 include/asm-x86/ia32_unistd.h |1 + include/linux/indirect.h | 33 + kernel/Makefile |2 ++ kernel/indirect.c |4 net/socket.c | 21 + 7 files changed, 58 insertions(+), 8 deletions(-) --- arch/x86/ia32/Makefile +++ arch/x86/ia32/Makefile @@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) +CFLAGS_sys_ia32.o = -Wno-undef AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 --- kernel/Makefile +++ kernel/Makefile @@ -67,6 +67,8 @@ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer endif +CFLAGS_indirect.o = -Wno-undef + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. diff -u net/socket.c net/socket.c --- net/socket.c +++ net/socket.c @@ -344,11 +344,11 @@ * but we take care of internal coherence yet. 
*/ -static int sock_alloc_fd(struct file **filep) +static int sock_alloc_fd(struct file **filep, int flags) { int fd; - fd = get_unused_fd(); + fd = get_unused_fd_flags(flags); if (likely(fd >= 0)) { struct file *file = get_empty_filp(); @@ -391,10 +391,10 @@ return 0; } -int sock_map_fd(struct socket *sock) +static int sock_map_fd_flags(struct socket *sock, int flags) { struct file *newfile; - int fd = sock_alloc_fd(&newfile); + int fd = sock_alloc_fd(&newfile, flags); if (likely(fd >= 0)) { int err = sock_attach_fd(sock, newfile); @@ -409,6 +409,11 @@ return fd; } +int sock_map_fd(struct socket *sock) +{ + return sock_map_fd_flags(sock, 0); +} + static struct socket *sock_from_file(struct file *file, int *err) { if (file->f_op == &socket_file_ops) @@ -1208,7 +1213,7 @@ if (retval < 0) goto out; - retval = sock_map_fd(sock); + retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags)); if (retval < 0) goto out_release; @@ -1249,13 +1254,13 @@ if (err < 0) goto out_release_both; - fd1 = sock_alloc_fd(&newfile1); + fd1 = sock_alloc_fd(&newfile1, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd1 < 0)) { err = fd1; goto out_release_both; } - fd2 = sock_alloc_fd(&newfile2); + fd2 = sock_alloc_fd(&newfile2, INDIRECT_PARAM(file_flags, flags)); if (unlikely(fd2 < 0)) { err = fd2; put_filp(newfile1); @@ -1411,7 +1416,7 @@ */ __module_get(newsock->ops->owner); - newfd = sock_alloc_fd(&newfile); + newfd = sock_alloc_fd(&newfile, INDIRECT_PARAM(file_flags, flags)); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); diff -u arch/x86/ia32/sys_ia32.c arch/x86/ia32/sys_ia32.c --- arch/x86/ia32/sys_ia32.c +++ arch/x86/ia32/sys_ia32.c @@ -902,6 +902,10 @@ switch (INDIRECT_SYSCALL32(regs)) { +#define INDSYSCALL(name) __NR_ia32_##name +#include <linux/indirect.h> + break; + default: return -EINVAL; } diff -u include/linux/indirect.h include/linux/indirect.h --- include/linux/indirect.h +++ include/linux/indirect.h @@ -1,6 +1,39 @@ +#ifndef INDSYSCALL #ifndef _LINUX_INDIRECT_H #define 
_LINUX_INDIRECT_H #include <asm/indirect.h> + +union indirect_params { + struct { +int flags; + } file_flags; +}; + +#define INDIRECT_PARAM(set, name) current->indirect_params.set.name + +#endif +#else + +/* Here comes the list of system calls which can be called through + sys_indirect. When the list of supported system calls is needed the + file including this header is supposed to define a macro INDSYSCALL + which adds a prefix fitting to the use. If the resulting macro is + defined we generate a line + case MACRO: + */ +#if INDSYSCALL(accept) + case INDSYSCALL(accept): +#endif +#if INDSYSCALL(socket) + case INDSYSCALL(socket): +#endif +#if INDSYSCALL(socketcall) + case INDSYSCALL(socketcall): +#endif +#if INDSYSCALL(socketpair) + case INDSYSCALL(socketpair): +#endif + #endif diff -u kernel/indirect.c kernel/indirect.c --- kernel/indirect.c +++ kernel/indirect.c @@ -19,6 +19,10 @@ switch (INDIRECT_SYSCALL (regs)) { +#define INDSYSCALL(name) __NR_##name +#include <linux/indirect.h> + break; + default: return