cpuset cleanup race

2016-04-26 Thread Ulrich Drepper
I came across a problem with code which uses a cpuset CG and tries to
be responsible and clean up after itself.  The code attached at the
bottom illustrates the problem.  It's only long because it has no
dependencies aside from the basic runtime and should work on all
machines.  You need to run it with privileges high enough to create a
CG.

The code is really simple:
- a (new) CG in cpuset is created
- one of the cores of the root cpuset is selected
- the thread (and therefore entire process) is switched to the cpuset
- a thread is created which does nothing but terminate immediately
- the parent waits for the thread
- then the parent removes itself from the cpuset
- finally the parent tries to remove the created cpuset

The last part is where things go wrong.  Usually* the rmdir() call
made to remove the cpuset fails because the cpuset is still busy.  The
program prints the members of the cpuset CG: it's the child thread.

* I wrote "usually" because slowing down the parent code will help.
I.e., there is a race.  Successful slowdowns I found:
- compile with -fsanitize=address (seems already enough)
- very short wait, e.g., 1ns (you can see this by starting the program
with the parameter "wait")

You might want to compile the code with optimization.  It is a race, after all.


The pthread_join() call made by the parent won't return until the
kernel signals through the futex set up at clone() time that the
thread has terminated.  From the perspective of the userlevel code the
thread is gone.  But not all bookkeeping related to the terminated
thread has been finished, it seems.


I didn't look at the code but I can imagine that the futex
notification happens as soon as all observable aspects of the thread
are gone.  This is of course good to not delay the waiter.  Hopefully
the cgroup bookkeeping can also be moved before the notification.


I tested it with a recent kernel (4.5.0-0.rc7) but I doubt it's a recent issue.


~
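#include <errno.h>
#include <error.h>
#include <mntent.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/stat.h>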

/* Thread start routine for the reproducer: performs no work and
   finishes immediately, handing NULL back to whoever joins it.  */
static void *tf(void *p)
{
  (void) p;  /* the argument is deliberately ignored */

  return NULL;
}

/* Return a newly allocated "DIR/LEAF" string.  Terminates the program
   through error() if the string cannot be allocated.  The caller frees
   the result.  */
static char *cs_path(const char *dir, const char *leaf)
{
  char *path = NULL;

  if (asprintf(&path, "%s/%s", dir, leaf) < 0)
    error(1, 0, "cannot allocate memory for path %s/%s", dir, leaf);
  return path;
}

/*
 * Reproducer for the cpuset cleanup race.
 *
 * Usage: PROG [NAME]
 *   NAME is the name of the cpuset control group to create below the
 *   mounted cpuset hierarchy (default "test").  If NAME is "wait" the
 *   program additionally sleeps for 1ns before removing the group.
 *
 * Steps:
 *   1. find the cgroup mount that carries the cpuset controller;
 *   2. create the group, copy the root's cpuset.mems into it and give
 *      it the first CPU of the current affinity mask;
 *   3. move this process into the group (write the PID to "tasks");
 *   4. create a thread running tf() and join it;
 *   5. move the process back to the root group;
 *   6. rmdir() the group; on failure print the tasks still listed in it.
 *
 * Returns 0 when the group was removed; every failure exits with
 * status 1 through error().
 *
 * Build note: asprintf() and CPU_ISSET() are GNU extensions, so the file
 * has to be compiled with _GNU_SOURCE defined.
 */
int main(int argc, char *argv[])
{
  const char *csname = argc == 1 ? "test" : argv[1];

  /* Look for a cgroup mount which has the cpuset controller attached.  */
  struct mntent *me;
  FILE *fp = setmntent(_PATH_MOUNTED, "r");
  if (fp == NULL)
    error(1, errno, "cannot read mounted filesystem information");
  while ((me = getmntent(fp)) != NULL) {
    if (strcmp(me->mnt_type, "cgroup") == 0
        && hasmntopt(me, "cpuset") != NULL)
      break;
  }
  if (me == NULL)
    error(1, 0, "cpuset filesystem not mounted");

  /* The entry returned by getmntent() lives in storage owned by the
     library, so keep a private copy of the mount directory instead of
     dereferencing ME after endmntent().  */
  char *csroot = strdup(me->mnt_dir);
  if (csroot == NULL)
    error(1, errno, "cannot copy mount directory");
  endmntent(fp);

  char *cshier = cs_path(csroot, csname);

  if (mkdir(cshier, 0777) == 0)
    printf("new cpuset control group: %s\n", cshier);
  else if (errno != EEXIST)
    error(1, errno, "cannot create cpuset group %s", cshier);

  /* Read the memory node list of the root group ...  */
  char *csrootmems = cs_path(csroot, "cpuset.mems");
  fp = fopen(csrootmems, "r");
  if (fp == NULL)
    error(1, errno, "cannot read /cpuset.mems");
  char *val = NULL;
  size_t vallen = 0;
  ssize_t n = getline(&val, &vallen, fp);
  if (n < 0)
    /* Without this check the -1 would be passed to fwrite() as a size.  */
    error(1, 0, "cannot read a value from /cpuset.mems");
  fclose(fp);
  free(csrootmems);

  /* ... and store the same list in the new group.  The data only reaches
     the kernel when the stream is flushed, so a rejected value shows up
     as a failing fclose().  */
  char *testmems = cs_path(cshier, "cpuset.mems");
  fp = fopen(testmems, "w");
  if (fp == NULL)
    error(1, errno, "cannot write /%s/cpuset.mems", csname);
  if (fwrite(val, (size_t) n, 1, fp) != 1 || fclose(fp) != 0)
    error(1, errno, "cannot write /%s/cpuset.mems", csname);
  free(testmems);
  free(val);

  /* Pick the first CPU this process is allowed to run on.  The search is
     bounded so an empty mask cannot make it run past the end of the set.  */
  cpu_set_t cs;
  if (sched_getaffinity(0, sizeof(cs), &cs) != 0)
    error(1, errno, "cannot determine CPU affinity");
  int first = 0;
  while (first < CPU_SETSIZE && ! CPU_ISSET(first, &cs))
    ++first;
  if (first == CPU_SETSIZE)
    error(1, 0, "no CPU found in affinity mask");

  char *testcpus = cs_path(cshier, "cpuset.cpus");
  fp = fopen(testcpus, "w");
  if (fp == NULL)
    error(1, errno, "cannot write /%s/cpuset.cpus", csname);
  if (fprintf(fp, "%d", first) < 0 || fclose(fp) != 0)
    error(1, errno, "cannot write /%s/cpuset.cpus", csname);
  free(testcpus);

  /* Move the process into the new group.  */
  char *testtasks = cs_path(cshier, "tasks");
  fp = fopen(testtasks, "w");
  if (fp == NULL)
    error(1, errno, "cannot write /%s/tasks", csname);
  if (fprintf(fp, "%d", (int) getpid()) < 0 || fclose(fp) != 0)
    error(1, errno, "cannot write /%s/tasks", csname);

  /* Create the short-lived thread and wait for it.  pthread functions
     return the error number instead of setting errno.  */
  pthread_t th;
  int rc = pthread_create(&th, NULL, tf, NULL);
  if (rc != 0)
    error(1, rc, "cannot create thread");

  rc = pthread_join(th, NULL);
  if (rc != 0)
    error(1, rc, "cannot join thread");

  /* Move the process back to the root group.  The sequence from here to
     the rmdir() is kept as in the original program because the timing of
     this window is what the reproducer is about.  */
  char *roottasks = cs_path(csroot, "tasks");
  fp = fopen(roottasks, "w");
  if (fp == NULL)
    error(1, errno, "cannot write /tasks");
  if (fprintf(fp, "%d", (int) getpid()) < 0 || fclose(fp) != 0)
    error(1, errno, "cannot write /tasks");
  free(roottasks);

  if (strcmp(csname, "wait") == 0) {
    struct timespec s = { 0, 1 };
    nanosleep(&s, NULL);
  }

  if (rmdir(cshier) != 0) {
    /* Remember why rmdir() failed; the stdio calls below may change errno.  */
    int rmdir_errno = errno;

    printf("PID = %ld\nremaining = ", (long) getpid());
    fp = fopen(testtasks, "r");
    if (fp != NULL) {
      char *line = NULL;
      size_t linelen = 0;
      while (getline(&line, &linelen, fp) > 0)
        fputs(line, stdout);
      fclose(fp);
      free(line);
    } else
      printf("(cannot open %s)\n", testtasks);
    error(1, rmdir_errno, "couldn't remove cpuset %s", cshier);
  }

  free(cshier);
  free(testtasks);
  free(csroot);

  return 0;
}


cpuset cleanup race

2016-04-26 Thread Ulrich Drepper
I came across a problem with code which uses a cpuset CG and tries to
be responsible and clean up after itself.  The code attached at the
bottom illustrates the problem.  It's only long because it has no
dependencies aside from the basic runtime and should work on all
machines.  You need to run it with privileges high enough to create a
CG.

The code is really simple:
- a (new) CG in cpuset is created
- one of the cores of the root cpuset is selected
- the thread (and therefore entire process) is switched to the cpuset
- a thread is created which does nothing but terminate immediately
- the parent waits for the thread
- then the parent removes itself from the cpuset
- finally the parent tries to remove the created cpuset

The last part is where things go wrong.  Usually* the rmdir() call
made to remove the cpuset fails because the cpuset is still busy.  The
program prints the members of the cpuset CG: it's the child thread.

* I wrote "usually" because slowing down the parent code will help.
I.e., there is a race.  Successful slowdowns I found:
- compile with -fsanitize=address (seems already enough)
- very short wait, e.g., 1ns (you can see this by starting the program
with the parameter "wait")

You might want to compile the code with optimization.  It is a race, after all.


The pthread_join() call made by the parent won't return until the
kernel signals through the futex set up at clone() time that the
thread has terminated.  From the perspective of the userlevel code the
thread is gone.  But not all bookkeeping related to the terminated
thread has been finished, it seems.


I didn't look at the code but I can imagine that the futex
notification happens as soon as all observable aspects of the thread
are gone.  This is of course good to not delay the waiter.  Hopefully
the cgroup bookkeeping can also be moved before the notification.


I tested it with a recent kernel (4.5.0-0.rc7) but I doubt it's a recent issue.


~
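#include <errno.h>
#include <error.h>
#include <mntent.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/stat.h>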

/* Thread start routine for the reproducer: performs no work and
   finishes immediately, handing NULL back to whoever joins it.  */
static void *tf(void *p)
{
  (void) p;  /* the argument is deliberately ignored */

  return NULL;
}

/* Return a newly allocated "DIR/LEAF" string.  Terminates the program
   through error() if the string cannot be allocated.  The caller frees
   the result.  */
static char *cs_path(const char *dir, const char *leaf)
{
  char *path = NULL;

  if (asprintf(&path, "%s/%s", dir, leaf) < 0)
    error(1, 0, "cannot allocate memory for path %s/%s", dir, leaf);
  return path;
}

/*
 * Reproducer for the cpuset cleanup race.
 *
 * Usage: PROG [NAME]
 *   NAME is the name of the cpuset control group to create below the
 *   mounted cpuset hierarchy (default "test").  If NAME is "wait" the
 *   program additionally sleeps for 1ns before removing the group.
 *
 * Steps:
 *   1. find the cgroup mount that carries the cpuset controller;
 *   2. create the group, copy the root's cpuset.mems into it and give
 *      it the first CPU of the current affinity mask;
 *   3. move this process into the group (write the PID to "tasks");
 *   4. create a thread running tf() and join it;
 *   5. move the process back to the root group;
 *   6. rmdir() the group; on failure print the tasks still listed in it.
 *
 * Returns 0 when the group was removed; every failure exits with
 * status 1 through error().
 *
 * Build note: asprintf() and CPU_ISSET() are GNU extensions, so the file
 * has to be compiled with _GNU_SOURCE defined.
 */
int main(int argc, char *argv[])
{
  const char *csname = argc == 1 ? "test" : argv[1];

  /* Look for a cgroup mount which has the cpuset controller attached.  */
  struct mntent *me;
  FILE *fp = setmntent(_PATH_MOUNTED, "r");
  if (fp == NULL)
    error(1, errno, "cannot read mounted filesystem information");
  while ((me = getmntent(fp)) != NULL) {
    if (strcmp(me->mnt_type, "cgroup") == 0
        && hasmntopt(me, "cpuset") != NULL)
      break;
  }
  if (me == NULL)
    error(1, 0, "cpuset filesystem not mounted");

  /* The entry returned by getmntent() lives in storage owned by the
     library, so keep a private copy of the mount directory instead of
     dereferencing ME after endmntent().  */
  char *csroot = strdup(me->mnt_dir);
  if (csroot == NULL)
    error(1, errno, "cannot copy mount directory");
  endmntent(fp);

  char *cshier = cs_path(csroot, csname);

  if (mkdir(cshier, 0777) == 0)
    printf("new cpuset control group: %s\n", cshier);
  else if (errno != EEXIST)
    error(1, errno, "cannot create cpuset group %s", cshier);

  /* Read the memory node list of the root group ...  */
  char *csrootmems = cs_path(csroot, "cpuset.mems");
  fp = fopen(csrootmems, "r");
  if (fp == NULL)
    error(1, errno, "cannot read /cpuset.mems");
  char *val = NULL;
  size_t vallen = 0;
  ssize_t n = getline(&val, &vallen, fp);
  if (n < 0)
    /* Without this check the -1 would be passed to fwrite() as a size.  */
    error(1, 0, "cannot read a value from /cpuset.mems");
  fclose(fp);
  free(csrootmems);

  /* ... and store the same list in the new group.  The data only reaches
     the kernel when the stream is flushed, so a rejected value shows up
     as a failing fclose().  */
  char *testmems = cs_path(cshier, "cpuset.mems");
  fp = fopen(testmems, "w");
  if (fp == NULL)
    error(1, errno, "cannot write /%s/cpuset.mems", csname);
  if (fwrite(val, (size_t) n, 1, fp) != 1 || fclose(fp) != 0)
    error(1, errno, "cannot write /%s/cpuset.mems", csname);
  free(testmems);
  free(val);

  /* Pick the first CPU this process is allowed to run on.  The search is
     bounded so an empty mask cannot make it run past the end of the set.  */
  cpu_set_t cs;
  if (sched_getaffinity(0, sizeof(cs), &cs) != 0)
    error(1, errno, "cannot determine CPU affinity");
  int first = 0;
  while (first < CPU_SETSIZE && ! CPU_ISSET(first, &cs))
    ++first;
  if (first == CPU_SETSIZE)
    error(1, 0, "no CPU found in affinity mask");

  char *testcpus = cs_path(cshier, "cpuset.cpus");
  fp = fopen(testcpus, "w");
  if (fp == NULL)
    error(1, errno, "cannot write /%s/cpuset.cpus", csname);
  if (fprintf(fp, "%d", first) < 0 || fclose(fp) != 0)
    error(1, errno, "cannot write /%s/cpuset.cpus", csname);
  free(testcpus);

  /* Move the process into the new group.  */
  char *testtasks = cs_path(cshier, "tasks");
  fp = fopen(testtasks, "w");
  if (fp == NULL)
    error(1, errno, "cannot write /%s/tasks", csname);
  if (fprintf(fp, "%d", (int) getpid()) < 0 || fclose(fp) != 0)
    error(1, errno, "cannot write /%s/tasks", csname);

  /* Create the short-lived thread and wait for it.  pthread functions
     return the error number instead of setting errno.  */
  pthread_t th;
  int rc = pthread_create(&th, NULL, tf, NULL);
  if (rc != 0)
    error(1, rc, "cannot create thread");

  rc = pthread_join(th, NULL);
  if (rc != 0)
    error(1, rc, "cannot join thread");

  /* Move the process back to the root group.  The sequence from here to
     the rmdir() is kept as in the original program because the timing of
     this window is what the reproducer is about.  */
  char *roottasks = cs_path(csroot, "tasks");
  fp = fopen(roottasks, "w");
  if (fp == NULL)
    error(1, errno, "cannot write /tasks");
  if (fprintf(fp, "%d", (int) getpid()) < 0 || fclose(fp) != 0)
    error(1, errno, "cannot write /tasks");
  free(roottasks);

  if (strcmp(csname, "wait") == 0) {
    struct timespec s = { 0, 1 };
    nanosleep(&s, NULL);
  }

  if (rmdir(cshier) != 0) {
    /* Remember why rmdir() failed; the stdio calls below may change errno.  */
    int rmdir_errno = errno;

    printf("PID = %ld\nremaining = ", (long) getpid());
    fp = fopen(testtasks, "r");
    if (fp != NULL) {
      char *line = NULL;
      size_t linelen = 0;
      while (getline(&line, &linelen, fp) > 0)
        fputs(line, stdout);
      fclose(fp);
      free(line);
    } else
      printf("(cannot open %s)\n", testtasks);
    error(1, rmdir_errno, "couldn't remove cpuset %s", cshier);
  }

  free(cshier);
  free(testtasks);
  free(csroot);

  return 0;
}


Re: NUMA node information for pages

2014-04-07 Thread Ulrich Drepper
On Mon, Mar 31, 2014 at 9:24 PM, Naoya Horiguchi
 wrote:
> The information about "pfn-node" mapping seldom (or never) changes after boot,
> so it seems better to me that adding a new interface somewhere under
> /sys/devices/system/node/nodeN which shows pfn range of a given node.
> If this doesn't work for your usecase, could you explain more about how you
> use this information?

I have no problem with that type of interface.  It'll be more work
figuring out the details since the interface I proposed is trivial and
mimics that of kpageflags etc but that's manageable.

I'll see whether I can figure out the necessary details.  I imagine
that if the PFN are indeed always clustered for each node then, as
David proposes, text output like

  PFNSTART PFNSTOP

in a file below /sys/devices/system/node/nodeN should be sufficient.

How does memory hot plug work in this situation?  If the PFNs are
allocated dense at startup then there might potentially be many ranges
for each node.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: NUMA node information for pages

2014-04-07 Thread Ulrich Drepper
On Mon, Mar 31, 2014 at 9:24 PM, Naoya Horiguchi
n-horigu...@ah.jp.nec.com wrote:
 The information about pfn-node mapping seldom (or never) changes after boot,
 so it seems better to me that adding a new interface somewhere under
 /sys/devices/system/node/nodeN which shows pfn range of a given node.
 If this doesn't work for your usecase, could you explain more about how you
 use this information?

I have no problem with that type of interface.  It'll be more work
figuring out the details since the interface I proposed is trivial and
mimics that of kpageflags etc but that's manageable.

I'll see whether I can figure out the necessary details.  I imagine
that if the PFN are indeed always clustered for each node then, as
David proposes, text output like

  PFNSTART PFNSTOP

in a file below /sys/devices/system/node/nodeN should be sufficient.

How does memory hot plug work in this situation?  If the PFNs are
allocated dense at startup then there might potentially be many ranges
for each node.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


NUMA node information for pages

2014-03-31 Thread Ulrich Drepper
I might be missing something but I couldn't find a way to use the
pagemap information to then look up the NUMA node the respective page is
located on.  Especially when analyzing anomalies this is really
useful.  The /proc/kpageflags and /proc/kpagecount files don't have that
information.

If this is correct, could the attached patch be considered?  It's really
simple and follows the same line as the kpageflags file.


Signed-off-by: Ulrich Drepper 

 Documentation/vm/pagemap.txt |3 ++
 fs/proc/page.c   |   50
 +++
 2 files changed, 53 insertions(+)

diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 5948e45..413b34c 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -34,6 +34,9 @@ There are three components to pagemap:
  * /proc/kpagecount.  This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
 
+ * /proc/kpagenode.  This file contains a 32-bit NUMA node number for
+   each page, indexed by PFN.
+
  * /proc/kpageflags.  This file contains a 64-bit set of flags for each
page, indexed by PFN.
 
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55..65bea9f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -15,6 +15,9 @@
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
 
+#define KNIDSIZE sizeof(s32)
+#define KNIDMASK (KNIDSIZE - 1)
+
 /* /proc/kpagecount - an array exposing page counts
  *
  * Each entry is a u64 representing the corresponding
@@ -212,10 +215,57 @@ static const struct file_operations 
proc_kpageflags_operations = {
.read = kpageflags_read,
 };
 
+/* /proc/kpagenode - an array exposing node information for pages
+ *
+ * Each entry is an s32 holding the NUMA node id of the corresponding
+ * physical page (-1 if the PFN is not valid).
+ */
+
+static ssize_t kpagenode_read(struct file *file, char __user *buf,
+                              size_t count, loff_t *ppos)
+{
+   s32 __user *out = (s32 __user *)buf;  /* one s32 per PFN, cf. KNIDSIZE */
+   unsigned long src = *ppos;
+   unsigned long pfn = src / KNIDSIZE;
+   ssize_t ret = 0;
+
+   count = min_t(unsigned long, count, (max_pfn * KNIDSIZE) - src);
+   if (src & KNIDMASK || count & KNIDMASK)  /* reject unaligned offset/length */
+           return -EINVAL;
+
+   while (count > 0) {
+           int nid;  /* -1 marks a PFN that is not valid */
+           if (pfn_valid(pfn))
+                   nid = pfn_to_nid(pfn);
+           else
+                   nid = -1;
+
+           if (put_user(nid, out)) {
+                   ret = -EFAULT;
+                   break;
+           }
+
+           pfn++;
+           out++;
+           count -= KNIDSIZE;
+   }
+
+   *ppos += (char __user *)out - buf;
+   if (!ret)
+           ret = (char __user *)out - buf;
+   return ret;
+}
+
+static const struct file_operations proc_kpagenode_operations = {
+   .llseek = mem_lseek,
+   .read = kpagenode_read,
+};
+
 static int __init proc_page_init(void)
 {
    proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
    proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+   proc_create("kpagenode", S_IRUSR, NULL, &proc_kpagenode_operations);
    return 0;
 }
 fs_initcall(proc_page_init);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


NUMA node information for pages

2014-03-31 Thread Ulrich Drepper
I might be missing something but I couldn't find a way to use the
pagemap information to then look up the NUMA node the respective page is
located on.  Especially when analyzing anomalies this is really
useful.  The /proc/kpageflags and /proc/kpagecount files don't have that
information.

If this is correct, could the attached patch be considered?  It's really
simple and follows the same line as the kpageflags file.


Signed-off-by: Ulrich Drepper drep...@gmail.com

 Documentation/vm/pagemap.txt |3 ++
 fs/proc/page.c   |   50
 +++
 2 files changed, 53 insertions(+)

diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 5948e45..413b34c 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -34,6 +34,9 @@ There are three components to pagemap:
  * /proc/kpagecount.  This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
 
+ * /proc/kpagenode.  This file contains a 32-bit NUMA node number for
+   each page, indexed by PFN.
+
  * /proc/kpageflags.  This file contains a 64-bit set of flags for each
page, indexed by PFN.
 
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55..65bea9f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -15,6 +15,9 @@
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
 
+#define KNIDSIZE sizeof(s32)
+#define KNIDMASK (KNIDSIZE - 1)
+
 /* /proc/kpagecount - an array exposing page counts
  *
  * Each entry is a u64 representing the corresponding
@@ -212,10 +215,57 @@ static const struct file_operations 
proc_kpageflags_operations = {
.read = kpageflags_read,
 };
 
+/* /proc/kpagenode - an array exposing node information for pages
+ *
+ * Each entry is an s32 holding the NUMA node id of the corresponding
+ * physical page (-1 if the PFN is not valid).
+ */
+
+static ssize_t kpagenode_read(struct file *file, char __user *buf,
+                              size_t count, loff_t *ppos)
+{
+   s32 __user *out = (s32 __user *)buf;  /* one s32 per PFN, cf. KNIDSIZE */
+   unsigned long src = *ppos;
+   unsigned long pfn = src / KNIDSIZE;
+   ssize_t ret = 0;
+
+   count = min_t(unsigned long, count, (max_pfn * KNIDSIZE) - src);
+   if (src & KNIDMASK || count & KNIDMASK)  /* reject unaligned offset/length */
+           return -EINVAL;
+
+   while (count > 0) {
+           int nid;  /* -1 marks a PFN that is not valid */
+           if (pfn_valid(pfn))
+                   nid = pfn_to_nid(pfn);
+           else
+                   nid = -1;
+
+           if (put_user(nid, out)) {
+                   ret = -EFAULT;
+                   break;
+           }
+
+           pfn++;
+           out++;
+           count -= KNIDSIZE;
+   }
+
+   *ppos += (char __user *)out - buf;
+   if (!ret)
+           ret = (char __user *)out - buf;
+   return ret;
+}
+
+static const struct file_operations proc_kpagenode_operations = {
+   .llseek = mem_lseek,
+   .read = kpagenode_read,
+};
+
 static int __init proc_page_init(void)
 {
    proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
    proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+   proc_create("kpagenode", S_IRUSR, NULL, &proc_kpagenode_operations);
    return 0;
 }
 fs_initcall(proc_page_init);
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] apparently broken RLIMIT_CORE

2013-10-06 Thread Ulrich Drepper
On Sun, Oct 6, 2013 at 4:42 PM, Linus Torvalds
 wrote:
> I doubt it is intentional, but I also cannot really feel that we care
> deeply. Afaik we don't really honor the size limit exactly anyway, ie
> we tend to check only at page boundaries etc. So do we really care?

I could imagine in the case Al brought up (a pipe as core file filter)
we might want to have some assurance the limits are not breached.  If
it doesn't cost that much I'd say implement it precisely.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] apparently broken RLIMIT_CORE

2013-10-06 Thread Ulrich Drepper
On Sun, Oct 6, 2013 at 4:42 PM, Linus Torvalds
torva...@linux-foundation.org wrote:
 I doubt it is intentional, but I also cannot really feel that we care
 deeply. Afaik we don't really honor the size limit exactly anyway, ie
 we tend to check only at page boundaries etc. So do we really care?

I could imagine in the case Al brought up (a pipe as core file filter)
we might want to have some assurance the limits are not breached.  If
it doesn't cost that much I'd say implement it precisely.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] perf: remove duplicate block from Makefile

2013-10-05 Thread Ulrich Drepper
This looks like a merge error, the code is duplicated with the first
copy doing something else as well.  Just remove the second block.

Signed-off-by: Ulrich Drepper 

 Makefile |8 
 1 file changed, 8 deletions(-)


Index: perf/config/Makefile
===
--- perf.orig/config/Makefile
+++ perf/config/Makefile
@@ -200,14 +200,6 @@ endif # NO_DWARF
 
 endif # NO_LIBELF
 
-ifndef NO_LIBELF
-CFLAGS += -DLIBELF_SUPPORT
-FLAGS_LIBELF=$(CFLAGS) $(LDFLAGS) $(EXTLIBS)
-ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y)
-  CFLAGS += -DLIBELF_MMAP
-endif # try-cc
-endif # NO_LIBELF
-
 # There's only x86 (both 32 and 64) support for CFI unwind so far
 ifneq ($(ARCH),x86)
   NO_LIBUNWIND := 1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] perf: remove duplicate block from Makefile

2013-10-05 Thread Ulrich Drepper
This looks like a merge error, the code is duplicated with the first
copy doing something else as well.  Just remove the second block.

Signed-off-by: Ulrich Drepper drep...@gmail.com

 Makefile |8 
 1 file changed, 8 deletions(-)


Index: perf/config/Makefile
===
--- perf.orig/config/Makefile
+++ perf/config/Makefile
@@ -200,14 +200,6 @@ endif # NO_DWARF
 
 endif # NO_LIBELF
 
-ifndef NO_LIBELF
-CFLAGS += -DLIBELF_SUPPORT
-FLAGS_LIBELF=$(CFLAGS) $(LDFLAGS) $(EXTLIBS)
-ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y)
-  CFLAGS += -DLIBELF_MMAP
-endif # try-cc
-endif # NO_LIBELF
-
 # There's only x86 (both 32 and 64) support for CFI unwind so far
 ifneq ($(ARCH),x86)
   NO_LIBUNWIND := 1
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: sendfile and EAGAIN

2013-03-02 Thread Ulrich Drepper
On Sat, Mar 2, 2013 at 10:09 PM, Eric Dumazet  wrote:
>
> Using non blocking IO means the sender (and the receiver) must be able
> to perform several operations, as long as the whole transfert is not
> finished.

Certainly, and this is implemented.  But the receiver never gets the
rest of the data while the sender (most of the time) gets notified
that everything is sent.

I don't have a reduced test case yet.  Hopefully I'll get to it
sometime soon.  For now I worked around it by not using sendfile.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: sendfile and EAGAIN

2013-03-02 Thread Ulrich Drepper
On Mon, Feb 25, 2013 at 2:22 PM, Eric Dumazet  wrote:
> I don't understand the issue.
>
> sendfile() returns -EAGAIN only if no bytes were copied to the socket.

There is something wrong/unexpected/...

I have a program which can use either sendfile or send.  When using
sendfile to transmit a large block (I've seen it with 900k) the
sendfile call does not transmit everything.  The receiver gets only
about 600k.  This is the situation when I think I've seen EAGAIN
errors from sendfile but I cannot just now reproduce it.  This is with
sockets of AF_UNIX type.

Are there any limits to take into account?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: sendfile and EAGAIN

2013-03-02 Thread Ulrich Drepper
On Mon, Feb 25, 2013 at 2:22 PM, Eric Dumazet eric.duma...@gmail.com wrote:
 I don't understand the issue.

 sendfile() returns -EAGAIN only if no bytes were copied to the socket.

There is something wrong/unexpected/...

I have a program which can use either sendfile or send.  When using
sendfile to transmit a large block (I've seen it with 900k) the
sendfile call does not transmit everything.  The receiver gets only
about 600k.  This is the situation when I think I've seen EAGAIN
errors from sendfile but I cannot just now reproduce it.  This is with
sockets of AF_UNIX type.

Are there any limits to take into account?
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: sendfile and EAGAIN

2013-03-02 Thread Ulrich Drepper
On Sat, Mar 2, 2013 at 10:09 PM, Eric Dumazet eric.duma...@gmail.com wrote:

 Using non blocking IO means the sender (and the receiver) must be able
 to perform several operations, as long as the whole transfert is not
 finished.

Certainly, and this is implemented.  But the receiver never gets the
rest of the data while the sender (most of the time) gets notified
that everything is sent.

I don't have a reduced test case yet.  Hopefully I'll get to it
sometime soon.  For now I worked around it by not using sendfile.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 0/4] perf tool: Adding ratios support

2013-01-16 Thread Ulrich Drepper
On Wed, Jan 16, 2013 at 9:25 AM, Jiri Olsa  wrote:
> I was thinking having config files (global and arch specific)
> comming with perf having predefined formulas.

All the more reason to not mention the file name or really any source
for the definition of the formula in the name,


> 1)  -e 'ratio/branch-rate/'  # special event class
> 2)  -e 'ratio-branch-rate'   # 'ratio-' prefix
> 3)  -e cpu/branch-rate/  # handled like aliases, ratio name would need to 
> be unique
>   ... ?

I think 3 is the most extensible.  Perhaps use the syntax used in
other places.  We have these :u suffixes etc.  Perhaps have :r or :R
or whatever.

Given the other comments, we might want to avoid right away "ratio".
If the mechanism is generalized it could be used to express "counter1
- counter2" for events which cannot be expressed with a single counter
but are not really ratios.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 0/4] perf tool: Adding ratios support

2013-01-16 Thread Ulrich Drepper
On Tue, Jan 15, 2013 at 8:39 AM, Jiri Olsa  wrote:
>   $ perf stat -f formula.conf:cpi kill
>   usage: kill [ -s signal | -p ] [ -a ] pid ...
>  kill -l [ signal ]

I do like this proposal.  The only comment I have is that perhaps the
command line syntax isn't ideal.  What you use above is tied to the
ratios being defined in the config file.  I would imagine that at least
over time (for some ratios probably right away) they become available
by default and don't require a config file.  Also, users might want to
put individualized ratio definitions in a config file which is read by
default.

How about the formulas becoming available whenever the config file is
read.  Maybe this means a few more keywords in the config file (ratio,
ratio-set, ...).  E.g.:

ratio-set branch {
  events = {instructions,branch-instructions,branch-misses}:u

  ratio branch-rate {
  formula = branch-instructions / instructions
  desc = branch rate
  }

  ratio branch-miss-rate {
  formula = branch-misses / instructions
  desc = branch misprediction rate
  }

  ratio branch-miss-ratio{
  formula = branch-misses / branch-instructions
  desc = branch misprediction ratio
  }
  }

You get the idea.  Maybe substitute "ratio" with "formula".  Then allow
such a ratio/formula to be used just like a normal event, perhaps with
a special suffix/prefix to designate it.  This should then also mark
the events as part of a group so that the underlying counters are
scheduled in together.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 0/4] perf tool: Adding ratios support

2013-01-16 Thread Ulrich Drepper
On Tue, Jan 15, 2013 at 8:39 AM, Jiri Olsa jo...@redhat.com wrote:
   $ perf stat -f formula.conf:cpi kill
   usage: kill [ -s signal | -p ] [ -a ] pid ...
  kill -l [ signal ]

I do like this proposal.  The only comment I have is that perhaps the
command line syntax isn't ideal.  What you use above is tied to the
ratios being defined in the config file.  I would imagine that at least
over time (for some ratios probably right away) they become available
by default and don't require a config file.  Also, users might want to
put individualized ratio definitions in a config file which is read by
default.

How about the formulas becoming available whenever the config file is
read.  Maybe this means a few more keywords in the config file (ratio,
ratio-set, ...).  E.g.:

ratio-set branch {
  events = {instructions,branch-instructions,branch-misses}:u

  ratio branch-rate {
  formula = branch-instructions / instructions
  desc = branch rate
  }

  ratio branch-miss-rate {
  formula = branch-misses / instructions
  desc = branch misprediction rate
  }

  ratio branch-miss-ratio{
  formula = branch-misses / branch-instructions
  desc = branch misprediction ratio
  }
  }

You get the idea.  Maybe substitute "ratio" with "formula".  Then allow
such a ratio/formula to be used just like a normal event, perhaps with
a special suffix/prefix to designate it.  This should then also mark
the events as part of a group so that the underlying counters are
scheduled in together.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 0/4] perf tool: Adding ratios support

2013-01-16 Thread Ulrich Drepper
On Wed, Jan 16, 2013 at 9:25 AM, Jiri Olsa jo...@redhat.com wrote:
 I was thinking having config files (global and arch specific)
 comming with perf having predefined formulas.

All the more reason to not mention the file name or really any source
for the definition of the formula in the name,


 1)  -e 'ratio/branch-rate/'  # special event class
 2)  -e 'ratio-branch-rate'   # 'ratio-' prefix
 3)  -e cpu/branch-rate/  # handled like aliases, ratio name would need to 
 be unique
   ... ?

I think 3 is the most extensible.  Perhaps use the syntax used in
other places.  We have these :u suffixes etc.  Perhaps have :r or :R
or whatever.

Given the other comments, we might want to avoid right away ratio.
If the mechanism is generalized it could be used to express counter1
- counter2 for events which cannot be expressed with a single counter
but are not really ratios.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific

2012-07-23 Thread Ulrich Drepper
On Mon, Jul 23, 2012 at 5:06 PM, Kirill A. Shutemov
 wrote:
> They are bugs.
>
> Let's fix strerror_r() usage.
>
> Signed-off-by: Kirill A. Shutemov 

Acked-by: Ulrich Drepper 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific

2012-07-23 Thread Ulrich Drepper
On Mon, Jul 23, 2012 at 4:31 PM, Kirill A. Shutemov
 wrote:
> +   const char *err = strerror_r(errnum, buf, buflen);
> +
> +   if (err != buf && buflen > 0) {
> +   size_t len = strlen(err);
> +   char *c = mempcpy(buf, err, min(buflen - 1, len));
> +   *c = '\0';
> +   }

No need to check for err == NULL.   buflen == 0 is a possibility given
the interface but I'd say this is an error and should be tested for at
the beginning of the function and the call should fail or even abort
the program.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific

2012-07-23 Thread Ulrich Drepper
On Mon, Jul 23, 2012 at 11:00 AM, Kirill A. Shutemov
 wrote:
> The right way to fix it is to switch to XSI-compliant version.

And why exactly would this be "the right way"?  Just fix the use of
strerror_r or use strerror_l.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific

2012-07-23 Thread Ulrich Drepper
On Mon, Jul 23, 2012 at 11:00 AM, Kirill A. Shutemov
kir...@shutemov.name wrote:
 The right way to fix it is to switch to XSI-compliant version.

And why exactly would this be the right way?  Just fix the use of
strerror_r or use strerror_l.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific

2012-07-23 Thread Ulrich Drepper
On Mon, Jul 23, 2012 at 4:31 PM, Kirill A. Shutemov
kir...@shutemov.name wrote:
 +   const char *err = strerror_r(errnum, buf, buflen);
 +
 +   if (err != buf && buflen > 0) {
 +   size_t len = strlen(err);
 +   char *c = mempcpy(buf, err, min(buflen - 1, len));
 +   *c = '\0';
 +   }

No need to check for err == NULL.   buflen == 0 is a possibility given
the interface but I'd say this is an error and should be tested for at
the beginning of the function and the call should fail or even abort
the program.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] perf: use XSI-complaint version of strerror_r() instead of GNU-specific

2012-07-23 Thread Ulrich Drepper
On Mon, Jul 23, 2012 at 5:06 PM, Kirill A. Shutemov
kir...@shutemov.name wrote:
 They are bugs.

 Let's fix strerror_r() usage.

 Signed-off-by: Kirill A. Shutemov kir...@shutemov.name

Acked-by: Ulrich Drepper drep...@gmail.com
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/3] perf tool: Add new event group management

2012-07-18 Thread Ulrich Drepper
On Wed, Jul 18, 2012 at 6:21 AM, Jiri Olsa  wrote:
> Well, I personally like the '{}' syntax more than '--group-events or 
> --group-reads
> option in front', it feels more user friendly.. anyway, we can easily have 
> both ways.

I like the actual visual grouping better, too.

Also, it doesn't require us to define what

   -e E1,E2 --group-events -e E3,E4

means.  Does --group-events also apply to the first parameter?


> As for the group attributes and group leader sampling, I don't mind omitting
> them at this point and get back to that if we find it useful in future.

Just define the first event the leader.  What reason is there which
prevents this?


I can only second what Andi wrote: just get it done quickly.  This is
functionality that is desperately needed.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/3] perf tool: Add new event group management

2012-07-18 Thread Ulrich Drepper
On Wed, Jul 18, 2012 at 6:21 AM, Jiri Olsa jo...@redhat.com wrote:
 Well, I personally like the '{}' syntax more than '--group-events or 
 --group-reads
 option in front', it feels more user friendly.. anyway, we can easily have 
 both ways.

I like the actual visual grouping better, too.

Also, it doesn't require us to define what

   -e E1,E2 --group-events -e E3,E4

means.  Does --group-events also apply to the first parameter?


 As for the group attributes and group leader sampling, I don't mind omitting
 them at this point and get back to that if we find it useful in future.

Just define the first event the leader.  What reason is there which
prevents this?


I can only second what Andi wrote: just get it done quickly.  This is
functionality that is desperately needed.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] RUSAGE_THREAD

2008-01-18 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Roland McGrath wrote:
> +#define  RUSAGE_LWP  RUSAGE_THREAD   /* Solaris name for same */

No need to clutter the kernel header with this, it'll be in the libc header.

Aside from that:

Acked-by: Ulrich Drepper <[EMAIL PROTECTED]>

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHkZbk2ijCOnn/RHQRAtohAKCyWgJsm20LSqxTznvff3LI8zplvgCgwttu
16eJFNgQXWNEk76b141uZvo=
=DzhA
-END PGP SIGNATURE-
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] RUSAGE_THREAD

2008-01-18 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Roland McGrath wrote:
 +#define  RUSAGE_LWP  RUSAGE_THREAD   /* Solaris name for same */

No need to clutter the kernel header with this, it'll be in the libc header.

Aside from that:

Acked-by: Ulrich Drepper [EMAIL PROTECTED]

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHkZbk2ijCOnn/RHQRAtohAKCyWgJsm20LSqxTznvff3LI8zplvgCgwttu
16eJFNgQXWNEk76b141uZvo=
=DzhA
-END PGP SIGNATURE-
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] Per-thread getrusage

2008-01-17 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Vinay Sridhar wrote:
> There are two ways to implement this in the kernel:
> 1) Introduce an additional parameter 'tid' to sys_getrusage() and put
> code in glibc to handle getrusage() and pthread_getrusage() calls
> correctly.
> 2) Introduce a new system call to handle pthread_getrusage() and leave
> sys_getrusage() untouched.

You're doing two things at once:

a) provide a way to get a thread's usage

b) provide a way to get another process's/thread's usage


The former is a trivial extension and I completely agree.  RUSAGE_THREAD
is trivial to implement and should go in ASAP.

The second part isn't that easy.  The first question is: do we really
need this?  It is a new type of interface.  We have the /proc filesystem
etc for programs which want to look at other process' data.  Second,
more importantly right now, your patch seems not to include any security
support.  Correct me if I'm wrong, but find_task_by_pid will always
succeed, regardless of whether the calling thread belongs to another UID
or not.  I.e., your patch enables any process to read any other process'
usage.  That's a no-no.


I suggest that you split the patch in two.  The first should implement
RUSAGE_THREAD.  You'll immediately get an ACK from me for that.  The
second part then should introduce a way to get another process' usage.
This patch should only be used initially as a starting point for
discussions.  You'll have to argue why it is necessary in the first place.

The argument might have to do with why you want a pthread_getrusage()
interface (which, btw, is a bad name since the interface is nothing like
getrusage, getrusage doesn't allow requesting any other process' data).
 Yes, for intra-process lookups relying on /proc is not a good idea.  But
then, I have not seen any reason so far why such an API is needed and
why a thread cannot just be responsible for reading its own usage data.
 Anyway, if pthread_getrusage (or whatever it'll be called) is the only
usage then the syscall should require that the TID parameter is from a
thread in the same process which would solve the security problem.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHj3do2ijCOnn/RHQRAiKdAKCSooiEWcxr780hJGenElyDiWPWKgCdE+6Y
j6ibmGsPT4aYxhSfpimSdiw=
=jOC9
-END PGP SIGNATURE-
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets

2007-11-26 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

H. Peter Anvin wrote:
> No.
> 
> I already said I'm not looking at changing the calling convention for
> existing syscalls.

I did not suggest or ask for that at all.

I was asking you to consider the real implementation details for a new
syscall mechanism.

We do not want to abandon the use of syscall/sysenter and go back to int
(on x86/x86-64).  This means that you have to come up with a mechanism
which hooks into the current syscall/sysenter path while preserving full
backward compatibility.

Now it's your turn.  How do you do this without additional costs?


> Hardly so, as evidenced by the fact that we have successfully done so
> for 15 years already; a number of Linux architectures require this
> information for the existing system calls.

Nothing at this scale is there at the moment, as far as I can see.  And
nothing so critical for getting right.

Talk is cheap.  You still haven't shown one bit of design for how you want
to achieve your grand goal.  The time for hand-waving is over.  Do some
work or step out of the way.  Nothing you have said so far in the least
convinces me and your arguments like "sys_indirect adds parameters" are
not really contested.  Yes, that's what sys_indirect does.  So what?  It
does this with almost no cost which outweighs the ugliness factor in my
book.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHS2gQ2ijCOnn/RHQRAlN5AKCWZQL97sROWBv33//Uj/MN+CNi3gCdFgCU
uLVEOfclERpakp1kdYzy2oI=
=stVB
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets

2007-11-26 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

H. Peter Anvin wrote:
> The 6-word limit is a red herring.  There is at least two ways to deal
> with it (and this doesn't mean wiping the legacy stuff we already have):
> 
> - Let each architecture pick a calling convention and redefine the
> architecture-independent bits to take an arbitrary number of arguments.
>  This is a one-time panarchitectural change.
> [...]

Just think beyond wishful thinking for a moment.  What does it take to
come up with something completely new and grand?

Let's start at the basics: you need to signal that the new syscall
calling convention is used.  Since the syscall entry code is limited (at
least the likes of syscall/sysenter, it would be easy enough to use int
$0x81 in addition to int $0x80) you would have to extend the use of the
syscall number while keeping binary compatibility.  This means
additional costs for every single syscall.

Once you're past that, how do you implement the expandable syscall
parameter count?  There are two ways:

- - pass to the real sys_* implementations the number of provided syscall
parameters and have each function figure out what this means

- - dynamically construct a call to the sys_* functions where the syscall
magic adds an appropriate number of parameters filled with zeros.  This
is quite complicated and, more importantly, it requires that you have
code/data somewhere which specifies how many parameters each of the
sys_* function actually requires.  The actual sys_* code and the data
has to be kept in sync at all times.  A maintenance nightmare.


The handling of syscalls with many parameters should not be a driver
of this design at all.  Syscalls shouldn't be that complicated; I
completely agree with Ingo.


I'm perfectly willing to give you the benefit of doubt, show us a design
for what you're proposing which is not slower than the current code,
doesn't impact existing code, and solves the problem in a nice and clean
way.  I cannot really see it now but I might miss something.  The
sys_indirect approach ain't pretty but it does its job, doesn't impact
performance, and is expandable in a direction we *know* we will want to go
very soon.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHS1X12ijCOnn/RHQRAihRAJwLNJ9fT8GTv6MAoO6RZGOub07sGgCdGBLR
frXyQVB8Oh5VgWY5YJhpitg=
=FuBx
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets

2007-11-26 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

H. Peter Anvin wrote:
 The 6-word limit is a red herring.  There is at least two ways to deal
 with it (and this doesn't mean wiping the legacy stuff we already have):
 
 - Let each architecture pick a calling convention and redefine the
 architecture-independent bits to take an arbitrary number of arguments.
  This is a one-time panarchitectural change.
 [...]

Just think beyond wishful thinking for a moment.  What does it take to
come up with something completely new and grand?

Let's start at the basics: you need to signal that the new syscall
calling convention is used.  Since the syscall entry code is limited (at
least the likes of syscall/sysenter, it would be easy enough to use int
$0x81 in addition to int $0x80) you would have to extend the use of the
syscall number while keeping binary compatibility.  This means
additional costs for every single syscall.

Once you're past that, how do you implement the expandable syscall
parameter count?  There are two ways:

- - pass to the real sys_* implementations the number of provided syscall
parameters and have each function figure out what this means

- - dynamically construct a call to the sys_* functions where the syscall
magic adds an appropriate number of parameters filled with zeros.  This
is quite complicated and, more importantly, it requires that you have
code/data somewhere which specifies how many parameters each of the
sys_* function actually requires.  The actual sys_* code and the data
has to be kept in sync at all times.  A maintenance nightmare.


The handling of syscalls with many parameters should not be a driver
of this design at all.  Syscalls shouldn't be that complicated; I
completely agree with Ingo.


I'm perfectly willing to give you the benefit of doubt, show us a design
for what you're proposing which is not slower than the current code,
doesn't impact existing code, and solves the problem in a nice and clean
way.  I cannot really see it now but I might miss something.  The
sys_indirect approach ain't pretty but it does its job, doesn't impact
performance, and is expandable in a direction we *know* we will want to go
very soon.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHS1X12ijCOnn/RHQRAihRAJwLNJ9fT8GTv6MAoO6RZGOub07sGgCdGBLR
frXyQVB8Oh5VgWY5YJhpitg=
=FuBx
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets

2007-11-26 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

H. Peter Anvin wrote:
 No.
 
 I already said I'm not looking at changing the calling convention for
 existing syscalls.

I did not suggest or ask for that at all.

I was asking you to consider the real implementation details for a new
syscall mechanism.

We do not want to abandon the use of syscall/sysenter and go back to int
(on x86/x86-64).  This means that you have to come up with a mechanism
which hooks into the current syscall/sysenter path while preserving full
backward compatibility.

Now it's your turn.  How do you do this without additional costs?


 Hardly so, as evidenced by the fact that we have successfully done so
 for 15 years already; a number of Linux architectures require this
 information for the existing system calls.

Nothing at this scale is there at the moment, as far as I can see.  And
nothing so critical for getting right.

Talk is cheap.  You still haven't shown one bit of design for how you want
to achieve your grand goal.  The time for hand-waving is over.  Do some
work or step out of the way.  Nothing you have said so far in the least
convinces me and your arguments like "sys_indirect adds parameters" are
not really contested.  Yes, that's what sys_indirect does.  So what?  It
does this with almost no cost which outweighs the ugliness factor in my
book.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHS2gQ2ijCOnn/RHQRAlN5AKCWZQL97sROWBv33//Uj/MN+CNi3gCdFgCU
uLVEOfclERpakp1kdYzy2oI=
=stVB
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets

2007-11-24 Thread Ulrich Drepper
On Nov 24, 2007 12:28 AM, Eric Dumazet <[EMAIL PROTECTED]> wrote:
> OK, but maybe for consistency, we might accept the two mechanisms.

It's not a question of the kernel interface.  The issue with all these
extensions is the userlevel interface.  Ideally no new userlevel
interface is needed.  This is the case for open() and incidentally
also for this case (through the flags parameter for recvmsg).  For
socket(), accept(), the situation is unfortunately different and we
need a new interface.

With your proposed patch, we would have to introduce another recvmsg()
interface to take advantage of the additional functionality.  This
just doesn't make any sense.  This is no contest in aesthetics.  You
first have to think about the interface presented to the programmer at
userlevel and then design the syscall interface.  This is how
MSG_CMSG_CLOEXEC came about.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets

2007-11-24 Thread Ulrich Drepper
On Nov 24, 2007 12:28 AM, Eric Dumazet [EMAIL PROTECTED] wrote:
 OK, but maybe for consistency, we might accept the two mechanisms.

It's not a question of the kernel interface.  The issue with all these
extensions is the userlevel interface.  Ideally no new userlevel
interface is needed.  This is the case for open() and incidentally
also for this case (through the flags parameter for recvmsg).  For
socket(), accept(), the situation is unfortunately different and we
need a new interface.

With your proposed patch, we would have to introduce another recvmsg()
interface to take advantage of the additional functionality.  This
just doesn't make any sense.  This is no contest in aesthetics.  You
first have to think about the interface presented to the programmer at
userlevel and then design the syscall interface.  This is how
MSG_CMSG_CLOEXEC came about.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets

2007-11-23 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Eric Dumazet wrote:
> 1) Can the fd passing with recvmsg() on AF_UNIX also gets O_CLOEXEC
> support ?

Already there, see MSG_CMSG_CLOEXEC.


> 2) Why this O_NONBLOCK ability is needed for sockets ? Is it a security
> issue, and if yes could you remind it to me ?

No security issue.  But look at any correct network program, all need to
set the mode to non-blocking.  Adding this support to the syscall comes
at almost no cost and it cuts the cost for every program down by one or
two syscalls.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHR9YQ2ijCOnn/RHQRArbyAJ0d25FPg/BWmJ4YIzJKhO9iaBJNXwCgmpuX
PAA6u3Dc56AlBegTRqtqJPc=
=j5vi
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Where is the new timerfd?

2007-11-23 Thread Ulrich Drepper
On Nov 23, 2007 9:29 AM, Davide Libenzi <[EMAIL PROTECTED]> wrote:
> Yes, it's disabled, and yes, I'll repost today ...

I haven't seen the patch and don't feel like searching.  So I say it
here: please make sure you add a flags parameter to the system call
itself (instead of adding it on as for eventfd and signalfd).  We need
to be able to use O_CLOEXEC some way or another.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Where is the new timerfd?

2007-11-23 Thread Ulrich Drepper
On Nov 23, 2007 9:29 AM, Davide Libenzi [EMAIL PROTECTED] wrote:
 Yes, it's disabled, and yes, I'll repost today ...

I haven't seen the patch and don't feel like searching.  So I say it
here: please make sure you add a flags parameter to the system call
itself (instead of adding it on as for eventfd and signalfd).  We need
to be able to use O_CLOEXEC some way or another.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets

2007-11-23 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Eric Dumazet wrote:
 1) Can the fd passing with recvmsg() on AF_UNIX also gets O_CLOEXEC
 support ?

Already there, see MSG_CMSG_CLOEXEC.


 2) Why this O_NONBLOCK ability is needed for sockets ? Is it a security
 issue, and if yes could you remind it to me ?

No security issue.  But look at any correct network program, all need to
set the mode to non-blocking.  Adding this support to the syscall comes
at almost no cost and it cuts the cost for every program down by one or
two syscalls.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHR9YQ2ijCOnn/RHQRArbyAJ0d25FPg/BWmJ4YIzJKhO9iaBJNXwCgmpuX
PAA6u3Dc56AlBegTRqtqJPc=
=j5vi
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv5 1/5] actual sys_indirect code

2007-11-20 Thread Ulrich Drepper
This is the actual architecture-independent part of the system call
implementation.

 include/linux/indirect.h |   17 +
 include/linux/sched.h|4 
 include/linux/syscalls.h |4 
 kernel/Makefile  |3 +++
 kernel/indirect.c|   40 
 5 files changed, 68 insertions(+)


diff -u linux/include/linux/indirect.h linux/include/linux/indirect.h
--- linux/include/linux/indirect.h
+++ linux/include/linux/indirect.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_INDIRECT_H
+#define _LINUX_INDIRECT_H
+
+#include 
+
+
+/* IMPORTANT:
+   All the elements of this union must be neutral to the word size
+   and must not require reworking when used in compat syscalls.  Used
+   fixed-size types or types which are known to not vary in size across
+   architectures.  */
+union indirect_params {
+};
+
+#define INDIRECT_PARAM(set, name) current->indirect_params.set.name
+
+#endif
diff -u linux/kernel/Makefile linux/kernel/Makefile
--- linux/kernel/Makefile
+++ linux/kernel/Makefile
@@ -57,6 +57,7 @@
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_ARCH_HAS_INDIRECT_SYSCALLS) += indirect.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <[EMAIL PROTECTED]>, the -fno-omit-frame-pointer is
@@ -67,6 +68,8 @@
 CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+CFLAGS_indirect.o = -Wno-undef
+
 $(obj)/configs.o: $(obj)/config_data.h
 
 # config_data.h contains the same information as ikconfig.h but gzipped.
diff -u linux/kernel/indirect.c linux/kernel/indirect.c
--- linux/kernel/indirect.c
+++ linux/kernel/indirect.c
@@ -0,0 +1,40 @@
+#include 
+#include 
+#include 
+#include 
+
+
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags)
+{
+   struct indirect_registers regs;
+   long result;
+
+   if (unlikely(flags != 0))
+   return -EINVAL;
+
+   if (copy_from_user(, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL ())
+   {
+#define INDSYSCALL(name) __NR_##name
+#include 
+   break;
+
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen > sizeof(union indirect_params))
+   return -EINVAL;
+
+   result = -EFAULT;
+   if (!copy_from_user(>indirect_params, userparams, paramslen))
+   result = call_indirect();
+
+   memset(>indirect_params, '\0', paramslen);
+
+   return result;
+}
diff -u linux/include/linux/syscalls.h linux/include/linux/syscalls.h
--- linux/include/linux/syscalls.h
+++ linux/include/linux/syscalls.h
@@ -54,6 +54,7 @@
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct indirect_registers;
 
 #include 
 #include 
@@ -611,6 +612,9 @@
const struct itimerspec __user *utmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags);
 
 int kernel_execve(const char *filename, char *const argv[], char *const 
envp[]);
 
--- linux/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1174,6 +1175,9 @@ struct task_struct {
int make_it_fail;
 #endif
struct prop_local_single dirties;
+
+   /* Additional system call parameters.  */
+   union indirect_params indirect_params;
 };
 
 /*
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv5 0/5] sys_indirect system call

2007-11-20 Thread Ulrich Drepper
The following patches provide an alternative implementation of the
sys_indirect system call which has been discussed a few times.
This is a system call that allows us to extend existing system call
interfaces by adding more system call parameters.

Davide's previous implementation is IMO far more complex than
warranted.  This code here is trivial, as you can see.  I've
discussed this approach with Linus recently and for a brief moment
we actually agreed on something.

We pass an additional block of data to the kernel, it is copied into
the task_struct, and then it is up to the function implementing the system
call to interpret the data.  Each system call, which is meant to be
extended this way, has to be white-listed in sys_indirect.  The
alternative is to filter out those system calls which absolutely cannot
be handled using sys_indirect (like clone, execve) since they require
the stack layout of an ordinary system call.  This is more dangerous
since it is too easy to miss a call.

Note that the sys_indirect system call takes an additional parameter which
is for now forced to be zero.  This parameter is meant to enable the use
of sys_indirect to create syslets, asynchronously executed system calls.
This syslet approach is also the main reason for the interface in the form
proposed here.

The code for x86 and x86-64 gets by without a single line of assembly
code.  This is likely to be true for many other archs as well.
There is architecture-dependent code, though.

The last three patches show the first application of the functionality.
They also show a complication: we need the test for valid sub-syscalls in the
main implementation and in the compatibility code.  And more: the actual
sources and generated binary for the test are very different (the numbers
differ).  Duplicating the information is a big problem, though.  I've used
some macro tricks to avoid this.  All the information about the flags and
the system calls using them is concentrated in one header.  This should
keep maintenance bearable.

This patch to use sys_indirect is just the beginning.  More will follow,
but I want to see how these patches are received before I spend more time
on it.  This code is enough to test the implementation with the following
test program.  Adjust it for architectures other than x86 and x86-64.

What is not addressed are differences in opinion about the whole approach.
Maybe Linus can chime in and defend what is basically his design.


/* Test program for the proposed sys_indirect system call.

   It creates file descriptors once the ordinary way and then through
   sys_indirect with O_CLOEXEC (and, for the socket, O_NONBLOCK) passed
   in the indirect parameter block, and prints which flags ended up set.
   Exit status is 0 only if the plain socket has neither flag and every
   descriptor created through sys_indirect has the requested flags.

   NOTE(review): the header names below were lost from the archived
   message; they are reconstructed from the identifiers the program
   uses -- confirm against the original posting.  */
#ifndef _GNU_SOURCE
# define _GNU_SOURCE 1	/* Needed for the syscall() and O_CLOEXEC declarations.  */
#endif
#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/syscall.h>

/* User-space stand-ins for the kernel's fixed-width type names, so the
   structure definitions below read the same as the kernel headers.  */
typedef uint32_t __u32;
typedef uint64_t __u64;

/* Additional parameter block handed to sys_indirect.  */
union indirect_params {
  struct {
    int flags;
  } file_flags;
};

/* Per-architecture syscall number and register block: the first member
   carries the number of the wrapped system call, the remaining members
   carry its arguments in register order.  */
#ifdef __x86_64__
# define __NR_indirect 286
struct indirect_registers {
  __u64 rax;
  __u64 rdi;
  __u64 rsi;
  __u64 rdx;
  __u64 r10;
  __u64 r8;
  __u64 r9;
};
#elif defined __i386__
# define __NR_indirect 325
struct indirect_registers {
  __u32 eax;
  __u32 ebx;
  __u32 ecx;
  __u32 edx;
  __u32 esi;
  __u32 edi;
  __u32 ebp;
};
#else
# error "need to define __NR_indirect and struct indirect_registers"
#endif

/* Assign VAR a register block whose leading members are the given
   values (syscall number first); unnamed members are zeroed.  */
#define FILL_IN(var, ...) \
  ((var) = (struct indirect_registers) { __VA_ARGS__ })

int
main (void)
{
  /* Baseline: a socket created the ordinary way.  */
  int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP);
  int s1 = fcntl (fd, F_GETFD);
  int t1 = fcntl (fd, F_GETFL);
  printf ("old: FD_CLOEXEC %s set, NONBLOCK %s set\n",
	  s1 == 0 ? "not" : "is", (t1 & O_NONBLOCK) ? "is" : "not");
  close (fd);

  union indirect_params i;
  memset (&i, '\0', sizeof (i));
  i.file_flags.flags = O_CLOEXEC|O_NONBLOCK;

  /* The same socket, created through sys_indirect.  Where socket() is
     multiplexed through socketcall the arguments travel in an array.  */
  struct indirect_registers r;
#ifdef __NR_socketcall
# define SOCKOP_socket   1
  long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP };
  FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args);
#else
  FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP);
#endif

  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s2 = fcntl (fd, F_GETFD);
  int t2 = fcntl (fd, F_GETFL);
  printf ("new: FD_CLOEXEC %s set, NONBLOCK %s set\n",
	  s2 == 0 ? "not" : "is", (t2 & O_NONBLOCK) ? "is" : "not");
  close (fd);

  /* signalfd with close-on-exec requested.  */
  i.file_flags.flags = O_CLOEXEC;
  sigset_t ss;
  sigemptyset (&ss);
  FILL_IN (r, __NR_signalfd, -1, (long) &ss, 8);
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s3 = fcntl (fd, F_GETFD);
  printf ("signalfd: FD_CLOEXEC %s set\n", s3 == 0 ? "not" : "is");
  close (fd);

  /* eventfd with close-on-exec requested.  */
  FILL_IN (r, __NR_eventfd, 8);
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s4 = fcntl (fd, F_GETFD);
  printf ("eventfd: FD_CLOEXEC %s set\n", s4 == 0 ? "not" : "is");
  close (fd);

  /* F_GETFL also reports the access mode, so only the O_NONBLOCK bit is
     examined.  fcntl() yields -1 when the descriptor is invalid (for
     instance because the indirect call failed); that must count as a
     failure rather than as "flag set".  */
  return (s1 != 0 || (t1 & O_NONBLOCK) != 0
	  || s2 <= 0 || t2 == -1 || (t2 & O_NONBLOCK) == 0
	  || s3 <= 0 || s4 <= 0);
}


Signed-off-by: Ulrich Drepper <

[PATCHv5 2/5] x86 support for sys_indirect

2007-11-20 Thread Ulrich Drepper
This part adds support for sys_indirect on x86 and x86-64.

 arch/x86/Kconfig   |3 ++
 arch/x86/ia32/Makefile |1 
 arch/x86/ia32/ia32entry.S  |2 +
 arch/x86/ia32/sys_ia32.c   |   38 +
 arch/x86/kernel/syscall_table_32.S |1 
 include/asm-x86/indirect.h |5 
 include/asm-x86/indirect_32.h  |   25 
 include/asm-x86/indirect_64.h  |   36 +++
 include/asm-x86/unistd_32.h|3 +-
 include/asm-x86/unistd_64.h|2 +
 10 files changed, 115 insertions(+), 1 deletion(-)


--- linux/arch/x86/Kconfig
+++ linux/arch/x86/Kconfig
@@ -112,6 +112,9 @@ config GENERIC_TIME_VSYSCALL
bool
default X86_64
 
+config ARCH_HAS_INDIRECT_SYSCALLS
+   def_bool y
+
 
 
 
diff -u linux/include/asm-x86/indirect_32.h linux/include/asm-x86/indirect_32.h
--- linux/include/asm-x86/indirect_32.h
+++ linux/include/asm-x86/indirect_32.h
@@ -0,0 +1,25 @@
+#ifndef _ASM_X86_INDIRECT_32_H
+#define _ASM_X86_INDIRECT_32_H
+
+struct indirect_registers {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)->eax
+
+static inline long call_indirect(struct indirect_registers *regs)
+{
+  extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, __u32);
+
+  return sys_call_table[INDIRECT_SYSCALL(regs)](regs->ebx, regs->ecx,
+   regs->edx, regs->esi,
+   regs->edi, regs->ebp);
+}
+
+#endif
diff -u linux/include/asm-x86/indirect_64.h linux/include/asm-x86/indirect_64.h
--- linux/include/asm-x86/indirect_64.h
+++ linux/include/asm-x86/indirect_64.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_INDIRECT_64_H
+#define _ASM_X86_INDIRECT_64_H
+
+struct indirect_registers {
+   __u64 rax;
+   __u64 rdi;
+   __u64 rsi;
+   __u64 rdx;
+   __u64 r10;
+   __u64 r8;
+   __u64 r9;
+};
+
+struct indirect_registers32 {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)->rax
+#define INDIRECT_SYSCALL32(regs) (regs)->eax
+
+static inline long call_indirect(struct indirect_registers *regs)
+{
+  extern long (*sys_call_table[]) (__u64, __u64, __u64, __u64, __u64, __u64);
+
+  return sys_call_table[INDIRECT_SYSCALL(regs)](regs->rdi, regs->rsi,
+   regs->rdx, regs->r10,
+   regs->r8, regs->r9);
+}
+
+#endif
diff -u linux/arch/x86/ia32/sys_ia32.c linux/arch/x86/ia32/sys_ia32.c
--- linux/arch/x86/ia32/sys_ia32.c
+++ linux/arch/x86/ia32/sys_ia32.c
@@ -889,0 +890,38 @@
+
+asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs,
+  void __user *userparams, size_t paramslen,
+  int flags)
+{
+   extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32);
+
+   struct indirect_registers32 regs;
+   long result;
+
+   if (flags != 0)
+   return -EINVAL;
+
+   if (copy_from_user(&regs, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL32(&regs))
+   {
+#define INDSYSCALL(name) __NR_ia32_##name
+#include <linux/indirect.h>
+   break;
+
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen > sizeof(union indirect_params))
+   return -EINVAL;
+   result = -EFAULT;
+   if (!copy_from_user(&current->indirect_params, userparams, paramslen))
+   result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx,
+  regs.edx, regs.esi,
+  regs.edi, regs.ebp);
+
+   memset(&current->indirect_params, '\0', paramslen);
+
+   return result;
+}
--- linux/arch/x86/ia32/Makefile
+++ linux/arch/x86/ia32/Makefile
@@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg 
$(obj)/vsyscall-syscall.so.dbg: \
 $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
 
+CFLAGS_sys_ia32.o = -Wno-undef
 AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
 
--- linux/arch/x86/ia32/ia32entry.S
+++ linux/arch/x86/ia32/ia32entry.S
@@ -400,6 +400,7 @@ END(ia32_ptregs_common)
 
.section .rodata,"a"
.align 8
+   .globl ia32_sys_call_table
 ia32_sys_call_table:
.quad sys_restart_syscall
.quad sys_exit
@@ -726,4 +727,5 @@ ia32_sys_call_table:
.quad compat_sys_timerfd
.quad sys_eventfd
.quad sys32_fallocate
+   .quad sys32_indirect/* 325  */
 ia32_syscall_end:
--- linux/arch/x86/kernel/syscall_table_32.S
+++ 

[PATCHv5 3/5] Allow setting FD_CLOEXEC flag for new sockets

2007-11-20 Thread Ulrich Drepper
This is a first user of sys_indirect.  Several of the socket-related system
calls which produce a file handle now can be passed an additional parameter
to set the FD_CLOEXEC flag.

 include/asm-x86/ia32_unistd.h |1 +
 include/linux/indirect.h  |   27 +++
 net/socket.c  |   21 +
 3 files changed, 41 insertions(+), 8 deletions(-)


diff -u linux/include/linux/indirect.h linux/include/linux/indirect.h
--- linux/include/linux/indirect.h
+++ linux/include/linux/indirect.h
@@ -1,3 +1,4 @@
+#ifndef INDSYSCALL
 #ifndef _LINUX_INDIRECT_H
 #define _LINUX_INDIRECT_H
 
@@ -13,5 +14,31 @@
+  struct {
+int flags;
+  } file_flags;
 };
 
 #define INDIRECT_PARAM(set, name) current->indirect_params.set.name
 
 #endif
+#else
+
+/* Here comes the list of system calls which can be called through
+   sys_indirect.  When the list of supported system calls is needed the
+   file including this header is supposed to define a macro "INDSYSCALL"
+   which adds a prefix fitting to the use.  If the resulting macro is
+   defined we generate a line
+   case MACRO:
+   */
+#if INDSYSCALL(accept)
+  case INDSYSCALL(accept):
+#endif
+#if INDSYSCALL(socket)
+  case INDSYSCALL(socket):
+#endif
+#if INDSYSCALL(socketcall)
+  case INDSYSCALL(socketcall):
+#endif
+#if INDSYSCALL(socketpair)
+  case INDSYSCALL(socketpair):
+#endif
+
+#endif
--- linux/include/asm-x86/ia32_unistd.h
+++ linux/include/asm-x86/ia32_unistd.h
@@ -12,6 +12,7 @@
 #define __NR_ia32_exit   1
 #define __NR_ia32_read   3
 #define __NR_ia32_write  4
+#define __NR_ia32_socketcall   102
 #define __NR_ia32_sigreturn119
 #define __NR_ia32_rt_sigreturn 173
 
diff -u linux/net/socket.c linux/net/socket.c
--- linux/net/socket.c
+++ linux/net/socket.c
@@ -344,11 +344,11 @@
  * but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
int fd;
 
-   fd = get_unused_fd();
+   fd = get_unused_fd_flags(flags);
if (likely(fd >= 0)) {
struct file *file = get_empty_filp();
 
@@ -391,10 +391,10 @@
return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+static int sock_map_fd_flags(struct socket *sock, int flags)
 {
struct file *newfile;
-   int fd = sock_alloc_fd(&newfile);
+   int fd = sock_alloc_fd(&newfile, flags);
 
if (likely(fd >= 0)) {
int err = sock_attach_fd(sock, newfile);
@@ -409,6 +409,11 @@
return fd;
 }
 
+int sock_map_fd(struct socket *sock)
+{
+   return sock_map_fd_flags(sock, 0);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
if (file->f_op == &socket_file_ops)
@@ -1208,7 +1213,7 @@
if (retval < 0)
goto out;
 
-   retval = sock_map_fd(sock);
+   retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags));
if (retval < 0)
goto out_release;
 
@@ -1249,13 +1254,13 @@
if (err < 0)
goto out_release_both;
 
-   fd1 = sock_alloc_fd(&newfile1);
+   fd1 = sock_alloc_fd(&newfile1, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd1 < 0)) {
err = fd1;
goto out_release_both;
}
 
-   fd2 = sock_alloc_fd(&newfile2);
+   fd2 = sock_alloc_fd(&newfile2, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd2 < 0)) {
err = fd2;
put_filp(newfile1);
@@ -1411,7 +1416,7 @@
 */
__module_get(newsock->ops->owner);
 
-   newfd = sock_alloc_fd(&newfile);
+   newfd = sock_alloc_fd(&newfile, INDIRECT_PARAM(file_flags, flags));
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv5 5/5] FD_CLOEXEC support for eventfd, signalfd, timerfd

2007-11-20 Thread Ulrich Drepper
This patch adds support to set the FD_CLOEXEC flag for the file descriptors
returned by eventfd, signalfd, timerfd.

 fs/anon_inodes.c  |   15 +++
 fs/eventfd.c  |5 +++--
 fs/signalfd.c |6 --
 fs/timerfd.c  |6 --
 include/asm-x86/ia32_unistd.h |3 +++
 include/linux/anon_inodes.h   |3 +++
 include/linux/indirect.h  |3 +++
 7 files changed, 31 insertions(+), 10 deletions(-)


--- linux/include/linux/indirect.h
+++ linux/include/linux/indirect.h
@@ -40,5 +40,8 @@ union indirect_params {
 #if INDSYSCALL(socketpair)
   case INDSYSCALL(socketpair):
 #endif
+  case INDSYSCALL(eventfd):
+  case INDSYSCALL(signalfd):
+  case INDSYSCALL(timerfd):
 
 #endif
--- linux/fs/anon_inodes.c
+++ linux/fs/anon_inodes.c
@@ -70,9 +70,9 @@ static struct dentry_operations 
anon_inodefs_dentry_operations = {
  * hence saving memory and avoiding code duplication for the file/inode/dentry
  * setup.
  */
-int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
-const char *name, const struct file_operations *fops,
-void *priv)
+int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file 
**pfile,
+  const char *name, const struct file_operations *fops,
+  void *priv, int flags)
 {
struct qstr this;
struct dentry *dentry;
@@ -85,7 +85,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct 
file **pfile,
if (!file)
return -ENFILE;
 
-   error = get_unused_fd();
+   error = get_unused_fd_flags(flags);
if (error < 0)
goto err_put_filp;
fd = error;
@@ -138,6 +138,13 @@ err_put_filp:
put_filp(file);
return error;
 }
+
+int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
+const char *name, const struct file_operations *fops,
+void *priv)
+{
+   return anon_inode_getfd_flags(pfd, pinode, pfile, name, fops, priv, 0);
+}
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
--- linux/include/linux/anon_inodes.h
+++ linux/include/linux/anon_inodes.h
@@ -8,6 +8,9 @@
 #ifndef _LINUX_ANON_INODES_H
 #define _LINUX_ANON_INODES_H
 
+int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file 
**pfile,
+  const char *name, const struct file_operations *fops,
+  void *priv, int flags);
 int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
 const char *name, const struct file_operations *fops,
 void *priv);
--- linux/fs/eventfd.c
+++ linux/fs/eventfd.c
@@ -215,8 +215,9 @@ asmlinkage long sys_eventfd(unsigned int count)
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]",
-&eventfd_fops, ctx);
+   error = anon_inode_getfd_flags(&fd, &inode, &file, "[eventfd]",
+  &eventfd_fops, ctx,
+  INDIRECT_PARAM(file_flags, flags));
if (!error)
return fd;
 
--- linux/fs/signalfd.c
+++ linux/fs/signalfd.c
@@ -224,8 +224,10 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user 
*user_mask, size_t sizemas
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(&ufd, &inode, &file, "[signalfd]",
-&signalfd_fops, ctx);
+   error = anon_inode_getfd_flags(&ufd, &inode, &file,
+  "[signalfd]", &signalfd_fops,
+  ctx, INDIRECT_PARAM(file_flags,
+  flags));
if (error)
goto err_fdalloc;
} else {
--- linux/fs/timerfd.c
+++ linux/fs/timerfd.c
@@ -182,8 +182,10 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int 
flags,
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]",
-&timerfd_fops, ctx);
+   error = anon_inode_getfd_flags(&ufd, &inode, &file, "[timerfd]",
+  &timerfd_fops, ctx,
+  INDIRECT_PARAM(file_flags,
+ flags));
if (error)
goto err_tmrcancel;
} else {
--- linux/include/asm-x86/ia32_unistd.h
+++ linux/include/asm-x86/ia32_unistd.h
@@ -15,5 +15,8 @@
 #define __NR_ia32_socketcall   102
 #define 

[PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets

2007-11-20 Thread Ulrich Drepper
This patch adds support for setting the O_NONBLOCK flag of the file
descriptors returned by socket, socketpair, and accept.

 socket.c |   15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)


--- linux/net/socket.c
+++ linux/net/socket.c
@@ -362,7 +362,7 @@ static int sock_alloc_fd(struct file **filep, int flags)
return fd;
 }
 
-static int sock_attach_fd(struct socket *sock, struct file *file)
+static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
 {
struct dentry *dentry;
struct qstr name = { .name = "" };
@@ -384,7 +384,7 @@ static int sock_attach_fd(struct socket *sock, struct file 
*file)
init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
  &socket_file_ops);
SOCK_INODE(sock)->i_fop = &socket_file_ops;
-   file->f_flags = O_RDWR;
+   file->f_flags = O_RDWR | (flags & O_NONBLOCK);
file->f_pos = 0;
file->private_data = sock;
 
@@ -397,7 +397,7 @@ static int sock_map_fd_flags(struct socket *sock, int flags)
int fd = sock_alloc_fd(&newfile, flags);
 
if (likely(fd >= 0)) {
-   int err = sock_attach_fd(sock, newfile);
+   int err = sock_attach_fd(sock, newfile, flags);
 
if (unlikely(err < 0)) {
put_filp(newfile);
@@ -1268,12 +1268,14 @@ asmlinkage long sys_socketpair(int family, int type, 
int protocol,
goto out_release_both;
}
 
-   err = sock_attach_fd(sock1, newfile1);
+   err = sock_attach_fd(sock1, newfile1,
+INDIRECT_PARAM(file_flags, flags));
if (unlikely(err < 0)) {
goto out_fd2;
}
 
-   err = sock_attach_fd(sock2, newfile2);
+   err = sock_attach_fd(sock2, newfile2,
+INDIRECT_PARAM(file_flags, flags));
if (unlikely(err < 0)) {
fput(newfile1);
goto out_fd1;
@@ -1423,7 +1425,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user 
*upeer_sockaddr,
goto out_put;
}
 
-   err = sock_attach_fd(newsock, newfile);
+   err = sock_attach_fd(newsock, newfile,
+INDIRECT_PARAM(file_flags, flags));
if (err < 0)
goto out_fd_simple;
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.24-rc3: find complains about /proc/net

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Roland McGrath wrote:
> Oh, it seems it has indeed been that way for a very long time, so I was
> mistaken.  It still seems a little odd to me.  Ulrich can say definitively
> whether the kind of concern I mentioned really matters one way or the other
> for glibc.

glibc cannot survive (at least NPTL) if somebody uses funny CLONE_*
flags to separate various pieces of information, e.g., file descriptors.
 So, all the information in each thread's /proc/self should be identical.

When the information is not the same, the current semantics seems to be
more useful.  So I guess, no change is the way to go here.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHQ25/2ijCOnn/RHQRAmhhAJsHRF7FqO8DWwZ97gHxIO/i4Z1AAQCffCGa
Q2J8kjthKbbNQf1USWMAw3Y=
=xl/a
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 0/6] sys_indirect system call

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Zach Brown wrote:
> I'm sure the additional parameter will be needed, and it might be pretty
> involved.  I think the current notion of syslets needs, at the very least:

All correct.  I just want to point out that the proposed interface is
sufficiently prepared for this and that there is no need to wait adding
this initial, synchronous syscall stuff before the syslet stuff is
ready.  These interface changes are security-relevant and should be
added ASAP.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHQySu2ijCOnn/RHQRAnQqAKCz0JzvmAeEcL8m77jbEYAZ4ZFWXwCgpfvE
do7pJGn9XBu9jfQhfLkxQSc=
=eX6m
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 4/6] Allow setting FD_CLOEXEC flag for new sockets

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Zach Brown wrote:
> Have you given thought to having to perform compat translation on this?
>   Today it's only copied directly from the user pointer into the union
> in the task_struct.

Since there is no legacy interface to worry about all members added to
the structure can and should be neutral of the word size.  We've done
this with some syscalls already (like pread64) where we always use the
wide form in the parameter list.  It's just more simple here since it
does not have to split into two 32-bit registers.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHQyJn2ijCOnn/RHQRAmWeAJ0Q6qBDtZDvsZYlfBnPFL6n11Z+lwCghiVp
NklFHsSnVyQYMD5rinDFQPo=
=Yo5E
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

dean gaudet wrote:
> as an application writer how do i access accept(2) with FD_CLOEXEC 
> functionality?  will glibc expose an accept2() with a flags param?

Not yet decided.  There is the alternative to extend the accept()
interface to have both interfaces:

  int accept(int, struct sockaddr *, socklen_t *);
and
  int accept(int, struct sockaddr *, socklen_t *, int);

We can do this with type safety even in C nowadays.


> if so... why don't we just have an accept2() syscall?

If you read the mails of my first submission you'll find that I
explained this.  I talked to Andrew and he favored new syscalls.  But
then I talked to Linus and he favored this approach.  Probably
especially because it can be used for syslets as well.  And it is less
code and data than introducing new syscalls.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQwhx2ijCOnn/RHQRAnezAKCkFmGwlwDZjpfKTRSUN4yLIeGTkACgtMK/
OcHdIaR8wbp848D3GU2iNYQ=
=nTu9
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 2/6] x86 support for sys_indirect

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Heiko Carstens wrote:
> All these macros could be functions, or? Would give us some type checking
> and avoids the capital letters.

Should be possible now.  I didn't do it initially since the macro used
the macro for the largest syscall number.  That macro wasn't always
available.  I'll test it.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQwdg2ijCOnn/RHQRAmh9AJ9EuthsaoupSHn3kR/x0cWxqR3FoQCfSbmE
8RIDWzPKZ6cv+QVGNl0fawM=
=ScgY
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 0/6] sys_indirect system call

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Eric Dumazet wrote:
> I am wondering if some parts are missing from your ChangeLog
> 
> You apparently added in v3 a new 'flags' parameter to indirect syscall
> but no trace of this change in Changelog, and why it was added. This
> seems to imply a future multiplexor.

This was mentioned in one of my mails.  I added the parameter to
accommodate Linus's and Zack's idea to use the functionality for syslets
as well.  Not really a multiplexer, it is meant to be a "execute
synchronously or asynchronously" flag.  In the latter case an additional
parameter might be needed to indicate the notification mechanism.


> And no change in the test program reflecting this 'flags' new param, so
> it fails.

Yep, sorry, I didn't update the text by including the most recent test
program.  I'll do that for the next version.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQwca2ijCOnn/RHQRAgQJAKDH+N3+FSJ0kD5VbzbAFN4918wREwCePHbc
nSY/t9x1FuYstYDaaT6Kut0=
=c95e
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

David Miller wrote:
> FWIW, I think this indirect syscall stuff is the most ugly interface
> I've ever seen proposed for the kernel.

Well, the alternative is to introduce dozens of new interfaces.  It
was Linus who suggested this alternative.  Plus, it seems that for
syslets we need basically the same interface anyway.


> And I agree with all of the objections raised by both H. Pater Anvin
> and Eric Dumazet.

Eric had no arguments and HP's comments lack a viable alternative proposal.


> Where does this INDIRECT_PARAM() macro get defined?  I do not
> see it being defined anywhere in these patches.

Defined in <linux/indirect.h>:

+#define INDIRECT_PARAM(set, name) current->indirect_params.set.name

Not my idea, I was following one review comment.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQwWl2ijCOnn/RHQRAhEbAJ9/bkrb/phOMRl16Fb0N1TDYglSsgCeNhHQ
3huhdKCAVTu4CJnktf/ufy4=
=Jj6h
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

David Miller wrote:
> FWIW, I think this indirect syscall stuff is the most ugly interface
> I've ever seen proposed for the kernel.

Well, the alternative is to introduce dozens of new interfaces.  It
was Linus who suggested this alternative.  Plus, it seems that for
syslets we need basically the same interface anyway.


> And I agree with all of the objections raised by both H. Pater Anvin
> and Eric Dumazet.

Eric had no arguments and HP's comments lack a viable alternative proposal.


> Where does this INDIRECT_PARAM() macro get defined?  I do not
> see it being defined anywhere in these patches.

Defined in linux/indirect.h:

+#define INDIRECT_PARAM(set, name) current->indirect_params.set.name

Not my idea, I was following one review comment.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQwWl2ijCOnn/RHQRAhEbAJ9/bkrb/phOMRl16Fb0N1TDYglSsgCeNhHQ
3huhdKCAVTu4CJnktf/ufy4=
=Jj6h
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 0/6] sys_indirect system call

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Eric Dumazet wrote:
> I am wondering if some parts are missing from your ChangeLog
> 
> You apparently added in v3 a new 'flags' parameter to indirect syscall
> but no trace of this change in Changelog, and why it was added. This
> seems to imply a future multiplexor.

This was mentioned in one of my mails.  I added the parameter to
accommodate Linus's and Zack's idea to use the functionality for syslets
as well.  Not really a multiplexer, it is meant to be a "execute
synchronously or asynchronously" flag.  In the latter case an additional
parameter might be needed to indicate the notification mechanism.


> And no change in the test program reflecting this 'flags' new param, so
> it fails.

Yep, sorry, I didn't update the text by including the most recent test
program.  I'll do that for the next version.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQwca2ijCOnn/RHQRAgQJAKDH+N3+FSJ0kD5VbzbAFN4918wREwCePHbc
nSY/t9x1FuYstYDaaT6Kut0=
=c95e
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

dean gaudet wrote:
> as an application writer how do i access accept(2) with FD_CLOEXEC 
> functionality?  will glibc expose an accept2() with a flags param?

Not yet decided.  There is the alternative to extend the accept()
interface to have both interfaces:

  int accept(int, struct sockaddr *, socklen_t *);
and
  int accept(int, struct sockaddr *, socklen_t *, int);

We can do this with type safety even in C nowadays.


> if so... why don't we just have an accept2() syscall?

If you read the mails of my first submission you'll find that I
explained this.  I talked to Andrew and he favored new syscalls.  But
then I talked to Linus and he favored this approach.  Probably
especially because it can be used for syslets as well.  And it is less
code and data than introducing new syscalls.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQwhx2ijCOnn/RHQRAnezAKCkFmGwlwDZjpfKTRSUN4yLIeGTkACgtMK/
OcHdIaR8wbp848D3GU2iNYQ=
=nTu9
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 2/6] x86&x86-64 support for sys_indirect

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Heiko Carstens wrote:
> All these macros could be functions, or? Would give us some type checking
> and avoids the capital letters.

Should be possible now.  I didn't do it initially since the macro used
the macro for the largest syscall number.  That macro wasn't always
available.  I'll test it.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQwdg2ijCOnn/RHQRAmh9AJ9EuthsaoupSHn3kR/x0cWxqR3FoQCfSbmE
8RIDWzPKZ6cv+QVGNl0fawM=
=ScgY
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 0/6] sys_indirect system call

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Zach Brown wrote:
> I'm sure the additional parameter will be needed, and it might be pretty
> involved.  I think the current notion of syslets needs, at the very least:

All correct.  I just want to point out that the proposed interface is
sufficiently prepared for this and that there is no need to wait adding
this initial, synchronous syscall stuff before the syslet stuff is
ready.  These interface changes are security-relevant and should be
added ASAP.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHQySu2ijCOnn/RHQRAnQqAKCz0JzvmAeEcL8m77jbEYAZ4ZFWXwCgpfvE
do7pJGn9XBu9jfQhfLkxQSc=
=eX6m
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv4 4/6] Allow setting FD_CLOEXEC flag for new sockets

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Zach Brown wrote:
> Have you given thought to having to perform compat translation on this?
>   Today it's only copied directly from the user pointer into the union
> in the task_struct.

Since there is no legacy interface to worry about all members added to
the structure can and should be neutral of the word size.  We've done
this with some syscalls already (like pread64) where we always use the
wide form in the parameter list.  It's just more simple here since it
does not have to split into two 32-bit registers.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHQyJn2ijCOnn/RHQRAmWeAJ0Q6qBDtZDvsZYlfBnPFL6n11Z+lwCghiVp
NklFHsSnVyQYMD5rinDFQPo=
=Yo5E
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 2.6.24-rc3: find complains about /proc/net

2007-11-20 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Roland McGrath wrote:
> Oh, it seems it has indeed been that way for a very long time, so I was
> mistaken.  It still seems a little odd to me.  Ulrich can say definitively
> whether the kind of concern I mentioned really matters one way or the other
> for glibc.

glibc cannot survive (at least NPTL) if somebody uses funny CLONE_*
flags to separate various pieces of information, e.g., file descriptors.
 So, all the information in each thread's /proc/self should be identical.

When the information is not the same, the current semantics seems to be
more useful.  So I guess, no change is the way to go here.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHQ25/2ijCOnn/RHQRAmhhAJsHRF7FqO8DWwZ97gHxIO/i4Z1AAQCffCGa
Q2J8kjthKbbNQf1USWMAw3Y=
=xl/a
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv5 5/5] FD_CLOEXEC support for eventfd, signalfd, timerfd

2007-11-20 Thread Ulrich Drepper
This patch adds support to set the FD_CLOEXEC flag for the file descriptors
returned by eventfd, signalfd, timerfd.

 fs/anon_inodes.c  |   15 +++
 fs/eventfd.c  |5 +++--
 fs/signalfd.c |6 --
 fs/timerfd.c  |6 --
 include/asm-x86/ia32_unistd.h |3 +++
 include/linux/anon_inodes.h   |3 +++
 include/linux/indirect.h  |3 +++
 7 files changed, 31 insertions(+), 10 deletions(-)


--- linux/include/linux/indirect.h
+++ linux/include/linux/indirect.h
@@ -40,5 +40,8 @@ union indirect_params {
 #if INDSYSCALL(socketpair)
   case INDSYSCALL(socketpair):
 #endif
+  case INDSYSCALL(eventfd):
+  case INDSYSCALL(signalfd):
+  case INDSYSCALL(timerfd):
 
 #endif
--- linux/fs/anon_inodes.c
+++ linux/fs/anon_inodes.c
@@ -70,9 +70,9 @@ static struct dentry_operations 
anon_inodefs_dentry_operations = {
  * hence saving memory and avoiding code duplication for the file/inode/dentry
  * setup.
  */
-int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
-const char *name, const struct file_operations *fops,
-void *priv)
+int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file 
**pfile,
+  const char *name, const struct file_operations *fops,
+  void *priv, int flags)
 {
struct qstr this;
struct dentry *dentry;
@@ -85,7 +85,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct 
file **pfile,
if (!file)
return -ENFILE;
 
-   error = get_unused_fd();
+   error = get_unused_fd_flags(flags);
if (error  0)
goto err_put_filp;
fd = error;
@@ -138,6 +138,13 @@ err_put_filp:
put_filp(file);
return error;
 }
+
+int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
+const char *name, const struct file_operations *fops,
+void *priv)
+{
+   return anon_inode_getfd_flags(pfd, pinode, pfile, name, fops, priv, 0);
+}
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
--- linux/include/linux/anon_inodes.h
+++ linux/include/linux/anon_inodes.h
@@ -8,6 +8,9 @@
 #ifndef _LINUX_ANON_INODES_H
 #define _LINUX_ANON_INODES_H
 
+int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file 
**pfile,
+  const char *name, const struct file_operations *fops,
+  void *priv, int flags);
 int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
 const char *name, const struct file_operations *fops,
 void *priv);
--- linux/fs/eventfd.c
+++ linux/fs/eventfd.c
@@ -215,8 +215,9 @@ asmlinkage long sys_eventfd(unsigned int count)
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(fd, inode, file, [eventfd],
-eventfd_fops, ctx);
+   error = anon_inode_getfd_flags(fd, inode, file, [eventfd],
+  eventfd_fops, ctx,
+  INDIRECT_PARAM(file_flags, flags));
if (!error)
return fd;
 
--- linux/fs/signalfd.c
+++ linux/fs/signalfd.c
@@ -224,8 +224,10 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user 
*user_mask, size_t sizemas
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(ufd, inode, file, [signalfd],
-signalfd_fops, ctx);
+   error = anon_inode_getfd_flags(ufd, inode, file,
+  [signalfd], signalfd_fops,
+  ctx, INDIRECT_PARAM(file_flags,
+  flags));
if (error)
goto err_fdalloc;
} else {
--- linux/fs/timerfd.c
+++ linux/fs/timerfd.c
@@ -182,8 +182,10 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int 
flags,
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(ufd, inode, file, [timerfd],
-timerfd_fops, ctx);
+   error = anon_inode_getfd_flags(ufd, inode, file, [timerfd],
+  timerfd_fops, ctx,
+  INDIRECT_PARAM(file_flags,
+ flags));
if (error)
goto err_tmrcancel;
} else {
--- linux/include/asm-x86/ia32_unistd.h
+++ 

[PATCHv5 4/5] Allow setting O_NONBLOCK flag for new sockets

2007-11-20 Thread Ulrich Drepper
This patch adds support for setting the O_NONBLOCK flag of the file
descriptors returned by socket, socketpair, and accept.

 socket.c |   15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)


--- linux/net/socket.c
+++ linux/net/socket.c
@@ -362,7 +362,7 @@ static int sock_alloc_fd(struct file **filep, int flags)
return fd;
 }
 
-static int sock_attach_fd(struct socket *sock, struct file *file)
+static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
 {
struct dentry *dentry;
struct qstr name = { .name =  };
@@ -384,7 +384,7 @@ static int sock_attach_fd(struct socket *sock, struct file 
*file)
init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
  socket_file_ops);
SOCK_INODE(sock)-i_fop = socket_file_ops;
-   file-f_flags = O_RDWR;
+   file-f_flags = O_RDWR | (flags  O_NONBLOCK);
file-f_pos = 0;
file-private_data = sock;
 
@@ -397,7 +397,7 @@ static int sock_map_fd_flags(struct socket *sock, int flags)
int fd = sock_alloc_fd(newfile, flags);
 
if (likely(fd = 0)) {
-   int err = sock_attach_fd(sock, newfile);
+   int err = sock_attach_fd(sock, newfile, flags);
 
if (unlikely(err  0)) {
put_filp(newfile);
@@ -1268,12 +1268,14 @@ asmlinkage long sys_socketpair(int family, int type, 
int protocol,
goto out_release_both;
}
 
-   err = sock_attach_fd(sock1, newfile1);
+   err = sock_attach_fd(sock1, newfile1,
+INDIRECT_PARAM(file_flags, flags));
if (unlikely(err  0)) {
goto out_fd2;
}
 
-   err = sock_attach_fd(sock2, newfile2);
+   err = sock_attach_fd(sock2, newfile2,
+INDIRECT_PARAM(file_flags, flags));
if (unlikely(err  0)) {
fput(newfile1);
goto out_fd1;
@@ -1423,7 +1425,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user 
*upeer_sockaddr,
goto out_put;
}
 
-   err = sock_attach_fd(newsock, newfile);
+   err = sock_attach_fd(newsock, newfile,
+INDIRECT_PARAM(file_flags, flags));
if (err  0)
goto out_fd_simple;
 
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv5 3/5] Allow setting FD_CLOEXEC flag for new sockets

2007-11-20 Thread Ulrich Drepper
This is a first user of sys_indirect.  Several of the socket-related system
calls which produce a file handle now can be passed an additional parameter
to set the FD_CLOEXEC flag.

 include/asm-x86/ia32_unistd.h |1 +
 include/linux/indirect.h  |   27 +++
 net/socket.c  |   21 +
 3 files changed, 41 insertions(+), 8 deletions(-)


diff -u linux/include/linux/indirect.h linux/include/linux/indirect.h
--- linux/include/linux/indirect.h
+++ linux/include/linux/indirect.h
@@ -1,3 +1,4 @@
+#ifndef INDSYSCALL
 #ifndef _LINUX_INDIRECT_H
 #define _LINUX_INDIRECT_H
 
@@ -13,5 +14,31 @@
+  struct {
+int flags;
+  } file_flags;
 };
 
 #define INDIRECT_PARAM(set, name) current-indirect_params.set.name
 
 #endif
+#else
+
+/* Here comes the list of system calls which can be called through
+   sys_indirect.  When the list of supported system calls is needed the
+   file including this header is supposed to define a macro INDSYSCALL
+   which adds a prefix fitting to the use.  If the resulting macro is
+   defined we generate a line
+   case MACRO:
+   */
+#if INDSYSCALL(accept)
+  case INDSYSCALL(accept):
+#endif
+#if INDSYSCALL(socket)
+  case INDSYSCALL(socket):
+#endif
+#if INDSYSCALL(socketcall)
+  case INDSYSCALL(socketcall):
+#endif
+#if INDSYSCALL(socketpair)
+  case INDSYSCALL(socketpair):
+#endif
+
+#endif
--- linux/include/asm-x86/ia32_unistd.h
+++ linux/include/asm-x86/ia32_unistd.h
@@ -12,6 +12,7 @@
 #define __NR_ia32_exit   1
 #define __NR_ia32_read   3
 #define __NR_ia32_write  4
+#define __NR_ia32_socketcall   102
 #define __NR_ia32_sigreturn119
 #define __NR_ia32_rt_sigreturn 173
 
diff -u linux/net/socket.c linux/net/socket.c
--- linux/net/socket.c
+++ linux/net/socket.c
@@ -344,11 +344,11 @@
  * but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
int fd;
 
-   fd = get_unused_fd();
+   fd = get_unused_fd_flags(flags);
if (likely(fd = 0)) {
struct file *file = get_empty_filp();
 
@@ -391,10 +391,10 @@
return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+static int sock_map_fd_flags(struct socket *sock, int flags)
 {
struct file *newfile;
-   int fd = sock_alloc_fd(newfile);
+   int fd = sock_alloc_fd(newfile, flags);
 
if (likely(fd = 0)) {
int err = sock_attach_fd(sock, newfile);
@@ -409,6 +409,11 @@
return fd;
 }
 
+int sock_map_fd(struct socket *sock)
+{
+   return sock_map_fd_flags(sock, 0);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
if (file-f_op == socket_file_ops)
@@ -1208,7 +1213,7 @@
if (retval  0)
goto out;
 
-   retval = sock_map_fd(sock);
+   retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags));
if (retval  0)
goto out_release;
 
@@ -1249,13 +1254,13 @@
if (err  0)
goto out_release_both;
 
-   fd1 = sock_alloc_fd(newfile1);
+   fd1 = sock_alloc_fd(newfile1, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd1  0)) {
err = fd1;
goto out_release_both;
}
 
-   fd2 = sock_alloc_fd(newfile2);
+   fd2 = sock_alloc_fd(newfile2, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd2  0)) {
err = fd2;
put_filp(newfile1);
@@ -1411,7 +1416,7 @@
 */
__module_get(newsock-ops-owner);
 
-   newfd = sock_alloc_fd(newfile);
+   newfd = sock_alloc_fd(newfile, INDIRECT_PARAM(file_flags, flags));
if (unlikely(newfd  0)) {
err = newfd;
sock_release(newsock);
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv5 1/5] actual sys_indirect code

2007-11-20 Thread Ulrich Drepper
This is the actual architecture-independent part of the system call
implementation.

 include/linux/indirect.h |   17 +
 include/linux/sched.h|4 
 include/linux/syscalls.h |4 
 kernel/Makefile  |3 +++
 kernel/indirect.c|   40 
 5 files changed, 68 insertions(+)


diff -u linux/include/linux/indirect.h linux/include/linux/indirect.h
--- linux/include/linux/indirect.h
+++ linux/include/linux/indirect.h
@@ -0,0 +1,17 @@
+#ifndef _LINUX_INDIRECT_H
+#define _LINUX_INDIRECT_H
+
+#include asm/indirect.h
+
+
+/* IMPORTANT:
+   All the elements of this union must be neutral to the word size
+   and must not require reworking when used in compat syscalls.  Use
+   fixed-size types or types which are known to not vary in size across
+   architectures.  */
+union indirect_params {
+};
+
+#define INDIRECT_PARAM(set, name) current-indirect_params.set.name
+
+#endif
diff -u linux/kernel/Makefile linux/kernel/Makefile
--- linux/kernel/Makefile
+++ linux/kernel/Makefile
@@ -57,6 +57,7 @@
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_ARCH_HAS_INDIRECT_SYSCALLS) += indirect.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra [EMAIL PROTECTED], the -fno-omit-frame-pointer is
@@ -67,6 +68,8 @@
 CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+CFLAGS_indirect.o = -Wno-undef
+
 $(obj)/configs.o: $(obj)/config_data.h
 
 # config_data.h contains the same information as ikconfig.h but gzipped.
diff -u linux/kernel/indirect.c linux/kernel/indirect.c
--- linux/kernel/indirect.c
+++ linux/kernel/indirect.c
@@ -0,0 +1,40 @@
+#include linux/sched.h
+#include linux/uaccess.h
+#include linux/unistd.h
+#include asm/asm-offsets.h
+
+
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags)
+{
+   struct indirect_registers regs;
+   long result;
+
+   if (unlikely(flags != 0))
+   return -EINVAL;
+
+   if (copy_from_user(regs, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL (regs))
+   {
+#define INDSYSCALL(name) __NR_##name
+#include linux/indirect.h
+   break;
+
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen  sizeof(union indirect_params))
+   return -EINVAL;
+
+   result = -EFAULT;
+   if (!copy_from_user(current-indirect_params, userparams, paramslen))
+   result = call_indirect(regs);
+
+   memset(current-indirect_params, '\0', paramslen);
+
+   return result;
+}
diff -u linux/include/linux/syscalls.h linux/include/linux/syscalls.h
--- linux/include/linux/syscalls.h
+++ linux/include/linux/syscalls.h
@@ -54,6 +54,7 @@
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct indirect_registers;
 
 #include linux/types.h
 #include linux/aio_abi.h
@@ -611,6 +612,9 @@
const struct itimerspec __user *utmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags);
 
 int kernel_execve(const char *filename, char *const argv[], char *const 
envp[]);
 
--- linux/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include linux/rcupdate.h
 #include linux/futex.h
 #include linux/rtmutex.h
+#include linux/indirect.h
 
 #include linux/time.h
 #include linux/param.h
@@ -1174,6 +1175,9 @@ struct task_struct {
int make_it_fail;
 #endif
struct prop_local_single dirties;
+
+   /* Additional system call parameters.  */
+   union indirect_params indirect_params;
 };
 
 /*
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv5 0/5] sys_indirect system call

2007-11-20 Thread Ulrich Drepper
The following patches provide an alternative implementation of the
sys_indirect system call which has been discussed a few times.
This is a system call that allows us to extend existing system call
interfaces by adding more system call parameters.

Davide's previous implementation is IMO far more complex than
warranted.  This code here is trivial, as you can see.  I've
discussed this approach with Linus recently and for a brief moment
we actually agreed on something.

We pass an additional block of data to the kernel, it is copied into
the task_struct, and then it is up to the function implementing the system
call to interpret the data.  Each system call, which is meant to be
extended this way, has to be white-listed in sys_indirect.  The
alternative is to filter out those system calls which absolutely cannot
be handled using sys_indirect (like clone, execve) since they require
the stack layout of an ordinary system call.  This is more dangerous
since it is too easy to miss a call.

Note that the sys_indirect system call takes an additional parameter which
is for now forced to be zero.  This parameter is meant to enable the use
of sys_indirect to create syslets, asynchronously executed system calls.
This syslet approach is also the main reason for the interface in the form
proposed here.

The code for x86 and x86-64 gets by without a single line of assembly
code.  This is likely to be true for many other archs as well.
There is architecture-dependent code, though.

The last three patches show the first application of the functionality.
They also show a complication: we need the test for valid sub-syscalls in the
main implementation and in the compatibility code.  And more: the actual
sources and generated binary for the test are very different (the numbers
differ).  Duplicating the information is a big problem, though.  I've used
some macro tricks to avoid this.  All the information about the flags and
the system calls using them is concentrated in one header.  This should
keep maintenance bearable.

This patch to use sys_indirect is just the beginning.  More will follow,
but I want to see how these patches are received before I spend more time
on it.  This code is enough to test the implementation with the following
test program.  Adjust it for architectures other than x86 and x86-64.

What is not addressed are differences in opinion about the whole approach.
Maybe Linus can chime in and defend what is basically his design.


#include fcntl.h
#include signal.h
#include stdint.h
#include stdio.h
#include unistd.h
#include netinet/in.h
#include sys/socket.h
#include sys/syscall.h

typedef uint32_t __u32;
typedef uint64_t __u64;

union indirect_params {
  struct {
int flags;
  } file_flags;
};

#ifdef __x86_64__
# define __NR_indirect 286
struct indirect_registers {
  __u64 rax;
  __u64 rdi;
  __u64 rsi;
  __u64 rdx;
  __u64 r10;
  __u64 r8;
  __u64 r9;
};
#elif defined __i386__
# define __NR_indirect 325
struct indirect_registers {
  __u32 eax;
  __u32 ebx;
  __u32 ecx;
  __u32 edx;
  __u32 esi;
  __u32 edi;
  __u32 ebp;
};
#else
# error need to define __NR_indirect and struct indirect_params
#endif

#define FILL_IN(var, values...) \
  var = (struct indirect_registers) { values }

int
main (void)
{
  int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP);
  int s1 = fcntl (fd, F_GETFD);
  int t1 = fcntl (fd, F_GETFL);
  printf (old: FD_CLOEXEC %s set, NONBLOCK %s set\n,
  s1 == 0 ? not : is, (t1  O_NONBLOCK) ? is : not);
  close (fd);

  union indirect_params i;
  memset(i, '\0', sizeof(i));
  i.file_flags.flags = O_CLOEXEC|O_NONBLOCK;

  struct indirect_registers r;
#ifdef __NR_socketcall
# define SOCKOP_socket   1
  long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP };
  FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args);
#else
  FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP);
#endif

  fd = syscall (__NR_indirect, r, i, sizeof (i), 0);
  int s2 = fcntl (fd, F_GETFD);
  int t2 = fcntl (fd, F_GETFL);
  printf (new: FD_CLOEXEC %s set, NONBLOCK %s set\n,
  s2 == 0 ? not : is, (t2  O_NONBLOCK) ? is : not);
  close (fd);

  i.file_flags.flags = O_CLOEXEC;
  sigset_t ss;
  sigemptyset(ss);
  FILL_IN(r, __NR_signalfd, -1, (long) ss, 8);
  fd = syscall (__NR_indirect, r, i, sizeof (i), 0);
  int s3 = fcntl (fd, F_GETFD);
  printf (signalfd: FD_CLOEXEC %s set\n, s3 == 0 ? not : is);
  close (fd);

  FILL_IN(r, __NR_eventfd, 8);
  fd = syscall (__NR_indirect, r, i, sizeof (i), 0);
  int s4 = fcntl (fd, F_GETFD);
  printf (eventfd: FD_CLOEXEC %s set\n, s4 == 0 ? not : is);
  close (fd);

  return s1 != 0 || s2 == 0 || t1 != 0 || t2 == 0 || s3 == 0 || s4 == 0;
}


Signed-off-by: Ulrich Drepper [EMAIL PROTECTED]


 arch/x86/Kconfig   |3 ++
 arch/x86/ia32/Makefile |1 
 arch/x86/ia32/ia32entry.S

[PATCHv5 2/5] x86 & x86-64 support for sys_indirect

2007-11-20 Thread Ulrich Drepper
This part adds support for sys_indirect on x86 and x86-64.

 arch/x86/Kconfig   |3 ++
 arch/x86/ia32/Makefile |1 
 arch/x86/ia32/ia32entry.S  |2 +
 arch/x86/ia32/sys_ia32.c   |   38 +
 arch/x86/kernel/syscall_table_32.S |1 
 include/asm-x86/indirect.h |5 
 include/asm-x86/indirect_32.h  |   25 
 include/asm-x86/indirect_64.h  |   36 +++
 include/asm-x86/unistd_32.h|3 +-
 include/asm-x86/unistd_64.h|2 +
 10 files changed, 115 insertions(+), 1 deletion(-)


--- linux/arch/x86/Kconfig
+++ linux/arch/x86/Kconfig
@@ -112,6 +112,9 @@ config GENERIC_TIME_VSYSCALL
bool
default X86_64
 
+config ARCH_HAS_INDIRECT_SYSCALLS
+   def_bool y
+
 
 
 
diff -u linux/include/asm-x86/indirect_32.h linux/include/asm-x86/indirect_32.h
--- linux/include/asm-x86/indirect_32.h
+++ linux/include/asm-x86/indirect_32.h
@@ -0,0 +1,25 @@
+#ifndef _ASM_X86_INDIRECT_32_H
+#define _ASM_X86_INDIRECT_32_H
+
+struct indirect_registers {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)-eax
+
+static inline long call_indirect(struct indirect_registers *regs)
+{
+  extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, __u32);
+
+  return sys_call_table[INDIRECT_SYSCALL(regs)](regs-ebx, regs-ecx,
+   regs-edx, regs-esi,
+   regs-edi, regs-ebp);
+}
+
+#endif
diff -u linux/include/asm-x86/indirect_64.h linux/include/asm-x86/indirect_64.h
--- linux/include/asm-x86/indirect_64.h
+++ linux/include/asm-x86/indirect_64.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_INDIRECT_64_H
+#define _ASM_X86_INDIRECT_64_H
+
+struct indirect_registers {
+   __u64 rax;
+   __u64 rdi;
+   __u64 rsi;
+   __u64 rdx;
+   __u64 r10;
+   __u64 r8;
+   __u64 r9;
+};
+
+struct indirect_registers32 {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)-rax
+#define INDIRECT_SYSCALL32(regs) (regs)-eax
+
+static inline long call_indirect(struct indirect_registers *regs)
+{
+  extern long (*sys_call_table[]) (__u64, __u64, __u64, __u64, __u64, __u64);
+
+  return sys_call_table[INDIRECT_SYSCALL(regs)](regs-rdi, regs-rsi,
+   regs-rdx, regs-r10,
+   regs-r8, regs-r9);
+}
+
+#endif
diff -u linux/arch/x86/ia32/sys_ia32.c linux/arch/x86/ia32/sys_ia32.c
--- linux/arch/x86/ia32/sys_ia32.c
+++ linux/arch/x86/ia32/sys_ia32.c
@@ -889,0 +890,38 @@
+
+asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs,
+  void __user *userparams, size_t paramslen,
+  int flags)
+{
+   extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32);
+
+   struct indirect_registers32 regs;
+   long result;
+
+   if (flags != 0)
+   return -EINVAL;
+
+   if (copy_from_user(regs, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL32(regs))
+   {
+#define INDSYSCALL(name) __NR_ia32_##name
+#include linux/indirect.h
+   break;
+
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen  sizeof(union indirect_params))
+   return -EINVAL;
+   result = -EFAULT;
+   if (!copy_from_user(current-indirect_params, userparams, paramslen))
+   result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx,
+  regs.edx, regs.esi,
+  regs.edi, regs.ebp);
+
+   memset(current-indirect_params, '\0', paramslen);
+
+   return result;
+}
--- linux/arch/x86/ia32/Makefile
+++ linux/arch/x86/ia32/Makefile
@@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg 
$(obj)/vsyscall-syscall.so.dbg: \
 $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
 
+CFLAGS_sys_ia32.o = -Wno-undef
 AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
 
--- linux/arch/x86/ia32/ia32entry.S
+++ linux/arch/x86/ia32/ia32entry.S
@@ -400,6 +400,7 @@ END(ia32_ptregs_common)
 
.section .rodata,a
.align 8
+   .globl ia32_sys_call_table
 ia32_sys_call_table:
.quad sys_restart_syscall
.quad sys_exit
@@ -726,4 +727,5 @@ ia32_sys_call_table:
.quad compat_sys_timerfd
.quad sys_eventfd
.quad sys32_fallocate
+   .quad sys32_indirect/* 325  */
 ia32_syscall_end:
--- linux/arch/x86/kernel/syscall_table_32.S
+++ 

[PATCHv4 4/6] Allow setting FD_CLOEXEC flag for new sockets

2007-11-19 Thread Ulrich Drepper
This is a first user of sys_indirect.  Several of the socket-related system
calls which produce a file handle now can be passed an additional parameter
to set the FD_CLOEXEC flag.

 arch/x86/ia32/Makefile|1 +
 arch/x86/ia32/sys_ia32.c  |4 
 include/asm-x86/ia32_unistd.h |1 +
 include/linux/indirect.h  |   33 +
 kernel/Makefile   |2 ++
 kernel/indirect.c |4 
 net/socket.c  |   21 +
 7 files changed, 58 insertions(+), 8 deletions(-)

--- arch/x86/ia32/Makefile
+++ arch/x86/ia32/Makefile
@@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg 
$(obj)/vsyscall-syscall.so.dbg: \
 $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
 
+CFLAGS_sys_ia32.o = -Wno-undef
 AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
 
--- kernel/Makefile
+++ kernel/Makefile
@@ -67,6 +67,8 @@ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+CFLAGS_indirect.o = -Wno-undef
+
 $(obj)/configs.o: $(obj)/config_data.h
 
 # config_data.h contains the same information as ikconfig.h but gzipped.
diff -u net/socket.c net/socket.c
--- net/socket.c
+++ net/socket.c
@@ -344,11 +344,11 @@
  * but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
int fd;
 
-   fd = get_unused_fd();
+   fd = get_unused_fd_flags(flags);
if (likely(fd >= 0)) {
struct file *file = get_empty_filp();
 
@@ -391,10 +391,10 @@
return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+static int sock_map_fd_flags(struct socket *sock, int flags)
 {
struct file *newfile;
-   int fd = sock_alloc_fd();
+   int fd = sock_alloc_fd(, flags);
 
if (likely(fd >= 0)) {
int err = sock_attach_fd(sock, newfile);
@@ -409,6 +409,11 @@
return fd;
 }
 
+int sock_map_fd(struct socket *sock)
+{
+   return sock_map_fd_flags(sock, 0);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
if (file->f_op == _file_ops)
@@ -1208,7 +1213,7 @@
if (retval < 0)
goto out;
 
-   retval = sock_map_fd(sock);
+   retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags));
if (retval < 0)
goto out_release;
 
@@ -1249,13 +1254,13 @@
if (err < 0)
goto out_release_both;
 
-   fd1 = sock_alloc_fd();
+   fd1 = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd1 < 0)) {
err = fd1;
goto out_release_both;
}
 
-   fd2 = sock_alloc_fd();
+   fd2 = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd2 < 0)) {
err = fd2;
put_filp(newfile1);
@@ -1411,7 +1416,7 @@
 */
__module_get(newsock->ops->owner);
 
-   newfd = sock_alloc_fd();
+   newfd = sock_alloc_fd(, INDIRECT_PARAM(file_flags, flags));
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
diff -u arch/x86/ia32/sys_ia32.c arch/x86/ia32/sys_ia32.c
--- arch/x86/ia32/sys_ia32.c
+++ arch/x86/ia32/sys_ia32.c
@@ -902,6 +902,10 @@
 
switch (INDIRECT_SYSCALL32())
{
+#define INDSYSCALL(name) __NR_ia32_##name
+#include 
+   break;
+
default:
return -EINVAL;
}
diff -u include/linux/indirect.h include/linux/indirect.h
--- include/linux/indirect.h
+++ include/linux/indirect.h
@@ -1,6 +1,39 @@
+#ifndef INDSYSCALL
 #ifndef _LINUX_INDIRECT_H
 #define _LINUX_INDIRECT_H
 
 #include 
 
+
+union indirect_params {
+  struct {
+int flags;
+  } file_flags;
+};
+
+#define INDIRECT_PARAM(set, name) current->indirect_params.set.name
+
+#endif
+#else
+
+/* Here comes the list of system calls which can be called through
+   sys_indirect.  When the list of supported system calls is needed the
+   file including this header is supposed to define a macro "INDSYSCALL"
+   which adds a prefix fitting to the use.  If the resulting macro is
+   defined we generate a line
+   case MACRO:
+   */
+#if INDSYSCALL(accept)
+  case INDSYSCALL(accept):
+#endif
+#if INDSYSCALL(socket)
+  case INDSYSCALL(socket):
+#endif
+#if INDSYSCALL(socketcall)
+  case INDSYSCALL(socketcall):
+#endif
+#if INDSYSCALL(socketpair)
+  case INDSYSCALL(socketpair):
+#endif
+
 #endif
diff -u kernel/indirect.c kernel/indirect.c
--- kernel/indirect.c
+++ kernel/indirect.c
@@ -19,6 +19,10 @@
 
switch (INDIRECT_SYSCALL ())
{
+#define INDSYSCALL(name) __NR_##name
+#include 
+   break;
+
default:
return -EINVAL;
}
--- include/asm-x86/ia32_unistd.h
+++ include/asm-x86/ia32_unistd.h
@@ -12,6 +12,7 @@
 

[PATCHv4 1/6] actual sys_indirect code

2007-11-19 Thread Ulrich Drepper
This is the actual architecture-independent part of the system call
implementation.

 include/linux/indirect.h |6 ++
 include/linux/sched.h|4 
 include/linux/syscalls.h |4 
 kernel/Makefile  |2 +-
 kernel/indirect.c|   36 
 5 files changed, 51 insertions(+), 1 deletion(-)

--- /dev/null
+++ include/linux/indirect.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_INDIRECT_H
+#define _LINUX_INDIRECT_H
+
+#include 
+
+#endif
--- include/linux/sched.h
+++ include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1174,6 +1175,9 @@ struct task_struct {
int make_it_fail;
 #endif
struct prop_local_single dirties;
+
+   /* Additional system call parameters.  */
+   union indirect_params indirect_params;
 };
 
 /*
--- include/linux/syscalls.h
+++ include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct indirect_registers;
 
 #include 
 #include 
@@ -611,6 +612,9 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
const struct itimerspec __user *utmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags);
 
 int kernel_execve(const char *filename, char *const argv[], char *const 
envp[]);
 
--- /dev/null
+++ kernel/indirect.c
@@ -0,0 +1,36 @@
+#include 
+#include 
+#include 
+#include 
+
+
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags)
+{
+   struct indirect_registers regs;
+   long result;
+
+   if (unlikely(flags != 0))
+   return -EINVAL;
+
+   if (copy_from_user(, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL ())
+   {
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen > sizeof(union indirect_params))
+   return -EINVAL;
+
+   result = -EFAULT;
+   if (!copy_from_user(>indirect_params, userparams, paramslen))
+   result = CALL_INDIRECT();
+
+   memset(>indirect_params, '\0', paramslen);
+
+   return result;
+}
--- kernel/Makefile
+++ kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o 
profile.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
-   utsname.o notifier.o
+   utsname.o notifier.o indirect.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets

2007-11-19 Thread Ulrich Drepper
This patch adds support for setting the O_NONBLOCK flag of the file
descriptors returned by socket, socketpair, and accept.

 socket.c |   15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

--- net/socket.c
+++ net/socket.c
@@ -362,7 +362,7 @@ static int sock_alloc_fd(struct file **filep, int flags)
return fd;
 }
 
-static int sock_attach_fd(struct socket *sock, struct file *file)
+static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
 {
struct dentry *dentry;
struct qstr name = { .name = "" };
@@ -384,7 +384,7 @@ static int sock_attach_fd(struct socket *sock, struct file 
*file)
init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
  _file_ops);
SOCK_INODE(sock)->i_fop = _file_ops;
-   file->f_flags = O_RDWR;
+   file->f_flags = O_RDWR | (flags & O_NONBLOCK);
file->f_pos = 0;
file->private_data = sock;
 
@@ -397,7 +397,7 @@ static int sock_map_fd_flags(struct socket *sock, int flags)
int fd = sock_alloc_fd(, flags);
 
if (likely(fd >= 0)) {
-   int err = sock_attach_fd(sock, newfile);
+   int err = sock_attach_fd(sock, newfile, flags);
 
if (unlikely(err < 0)) {
put_filp(newfile);
@@ -1268,12 +1268,14 @@ asmlinkage long sys_socketpair(int family, int type, 
int protocol,
goto out_release_both;
}
 
-   err = sock_attach_fd(sock1, newfile1);
+   err = sock_attach_fd(sock1, newfile1,
+INDIRECT_PARAM(file_flags, flags));
if (unlikely(err < 0)) {
goto out_fd2;
}
 
-   err = sock_attach_fd(sock2, newfile2);
+   err = sock_attach_fd(sock2, newfile2,
+INDIRECT_PARAM(file_flags, flags));
if (unlikely(err < 0)) {
fput(newfile1);
goto out_fd1;
@@ -1423,7 +1425,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user 
*upeer_sockaddr,
goto out_put;
}
 
-   err = sock_attach_fd(newsock, newfile);
+   err = sock_attach_fd(newsock, newfile,
+INDIRECT_PARAM(file_flags, flags));
if (err < 0)
goto out_fd_simple;
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv4 2/6] x86 support for sys_indirect

2007-11-19 Thread Ulrich Drepper
This part adds support for sys_indirect on x86 and x86-64.

 arch/x86/ia32/ia32entry.S  |2 ++
 arch/x86/ia32/sys_ia32.c   |   31 +++
 arch/x86/kernel/syscall_table_32.S |1 +
 include/asm-x86/indirect.h |5 +
 include/asm-x86/indirect_32.h  |   23 +++
 include/asm-x86/indirect_64.h  |   34 ++
 include/asm-x86/unistd_32.h|3 ++-
 include/asm-x86/unistd_64.h|2 ++
 8 files changed, 100 insertions(+), 1 deletion(-)

--- arch/x86/ia32/ia32entry.S
+++ arch/x86/ia32/ia32entry.S
@@ -400,6 +400,7 @@ END(ia32_ptregs_common)
 
.section .rodata,"a"
.align 8
+   .globl ia32_sys_call_table
 ia32_sys_call_table:
.quad sys_restart_syscall
.quad sys_exit
@@ -726,4 +727,5 @@ ia32_sys_call_table:
.quad compat_sys_timerfd
.quad sys_eventfd
.quad sys32_fallocate
+   .quad sys32_indirect/* 325  */
 ia32_syscall_end:
--- arch/x86/ia32/sys_ia32.c
+++ arch/x86/ia32/sys_ia32.c
@@ -887,3 +887,37 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned 
offset_lo,
return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
 ((u64)len_hi << 32) | len_lo);
 }
+
+asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs,
+  void __user *userparams, size_t paramslen,
+  int flags)
+{
+   extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32);
+
+   struct indirect_registers32 regs;
+   long result;
+
+   if (flags != 0)
+   return -EINVAL;
+
+   if (copy_from_user(, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL32())
+   {
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen > sizeof(union indirect_params))
+   return -EINVAL;
+   result = -EFAULT;
+   if (!copy_from_user(>indirect_params, userparams, paramslen))
+   result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx,
+  regs.edx, regs.esi,
+  regs.edi, regs.ebp);
+
+   memset(>indirect_params, '\0', paramslen);
+
+   return result;
+}
--- arch/x86/kernel/syscall_table_32.S
+++ arch/x86/kernel/syscall_table_32.S
@@ -324,3 +324,4 @@ ENTRY(sys_call_table)
.long sys_timerfd
.long sys_eventfd
.long sys_fallocate
+   .long sys_indirect  /* 325 */
--- /dev/null
+++ include/asm-x86/indirect_32.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_INDIRECT_32_H
+#define _ASM_X86_INDIRECT_32_H
+
+struct indirect_registers {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)->eax
+
+#define CALL_INDIRECT(regs) \
+  ({ extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, 
__u32); \
+ sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->ebx, (regs)->ecx, \
+(regs)->edx, (regs)->esi, \
+(regs)->edi, (regs)->ebp); \
+ })
+
+#endif
--- /dev/null
+++ include/asm-x86/indirect_64.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_INDIRECT_64_H
+#define _ASM_X86_INDIRECT_64_H
+
+struct indirect_registers {
+   __u64 rax;
+   __u64 rdi;
+   __u64 rsi;
+   __u64 rdx;
+   __u64 r10;
+   __u64 r8;
+   __u64 r9;
+};
+
+struct indirect_registers32 {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)->rax
+#define INDIRECT_SYSCALL32(regs) (regs)->eax
+
+#define CALL_INDIRECT(regs) \
+  ({ extern long (*sys_call_table[]) (__u64, __u64, __u64, __u64, __u64, 
__u64); \
+ sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->rdi, (regs)->rsi, \
+(regs)->rdx, (regs)->r10, \
+(regs)->r8, (regs)->r9); \
+ })
+
+#endif
--- /dev/null
+++ include/asm-x86/indirect.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "indirect_32.h"
+#else
+# include "indirect_64.h"
+#endif
--- include/asm-x86/unistd_32.h
+++ include/asm-x86/unistd_32.h
@@ -330,10 +330,11 @@
 #define __NR_timerfd   322
 #define __NR_eventfd   323
 #define __NR_fallocate 324
+#define __NR_indirect  325
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 325
+#define NR_syscalls 326
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
--- include/asm-x86/unistd_64.h
+++ include/asm-x86/unistd_64.h
@@ -635,6 +635,8 @@ __SYSCALL(__NR_timerfd, sys_timerfd)
 __SYSCALL(__NR_eventfd, sys_eventfd)
 #define __NR_fallocate  

[PATCHv4 3/6] UML support for sys_indirect

2007-11-19 Thread Ulrich Drepper
This part adds support for sys_indirect for UML.

 indirect.h |6 ++
 1 file changed, 6 insertions(+)

--- /dev/null
+++ include/asm-um/indirect.h
@@ -0,0 +1,6 @@
+#ifndef __UM_INDIRECT_H
+#define __UM_INDIRECT_H
+
+#include "asm/arch/indirect.h"
+
+#endif
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv4 0/6] sys_indirect system call

2007-11-19 Thread Ulrich Drepper

The following patches provide an alternative implementation of the
sys_indirect system call which has been discussed a few times.
This new system call allows us to extend existing system call
interfaces without adding more system calls.

Davide's previous implementation is IMO far more complex than
warranted.  This code here is trivial, as you can see.  I've
discussed this approach with Linus last week and for a brief moment
we actually agreed on something.

We pass an additional block of data to the kernel, it is copied into
the task_struct, and then it is up to the function implementing the system
call to interpret the data.  Each system call, which is meant to be
extended this way, has to be white-listed in sys_indirect.  The
alternative is to filter out those system calls which absolutely cannot
be handled using sys_indirect (like clone, execve) since they require
the stack layout of an ordinary system call.  This is more dangerous
since it is too easy to miss a call.

The code for x86 and x86-64 gets by without a single line of assembly
code.  This is likely to be true for most/all the other archs as well.
There is architecture-dependent code, though.  For x86 and x86-64 I've
also fixed up UML (although only x86-64 is tested, that's my setup).

The last three patches show the first application of the functionality.
They also show a complication: we need the test for valid sub-syscalls in the
main implementation and in the compatibility code.  And more: the actual
sources and generated binary for the test are very different (the numbers
differ).  Duplicating the information is a big problem, though.  I've used
some macro tricks to avoid this.  All the information about the flags and
the system calls using them is concentrated in one header.  This should
make maintenance bearable.

This patch to use sys_indirect is just the beginning.  More will follow,
but I want to see how these patches are received before I spend more time
on it.  This code is enough to test the implementation with the following
test program.  Adjust it for architectures other than x86 and x86-64.


#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/* User-space stand-ins for the kernel's fixed-width types, so the register
   structs below can use the same field types as the kernel headers in this
   patch series.  */
typedef uint32_t __u32;
typedef uint64_t __u64;

/* User-space copy of the parameter block handed to sys_indirect as its
   second argument.  It mirrors the kernel's union indirect_params from
   include/linux/indirect.h in this series.  */
union indirect_params {
  /* Extra flags for system calls that create a file descriptor; this test
     stores O_CLOEXEC and O_NONBLOCK here.  */
  struct {
int flags;
  } file_flags;
};

/* Per-architecture definitions needed to call sys_indirect from user space:
   the system call number and the register block passed as first argument.

   struct indirect_registers must match the kernel's definition for the
   architecture: the first field is the number of the system call to invoke,
   the remaining six are its arguments in the order the kernel hands them to
   the system call table entry.  */
#ifdef __x86_64__
/* NOTE(review): 286 is presumably the x86-64 number assigned by this patch
   series; the unistd_64.h hunk is not visible here -- confirm.  */
# define __NR_indirect 286
struct indirect_registers {
  __u64 rax;	/* system call number */
  __u64 rdi;	/* argument 1 */
  __u64 rsi;	/* argument 2 */
  __u64 rdx;	/* argument 3 */
  __u64 r10;	/* argument 4 */
  __u64 r8;	/* argument 5 */
  __u64 r9;	/* argument 6 */
};
#elif defined __i386__
# define __NR_indirect 325
struct indirect_registers {
  __u32 eax;	/* system call number */
  __u32 ebx;	/* argument 1 */
  __u32 ecx;	/* argument 2 */
  __u32 edx;	/* argument 3 */
  __u32 esi;	/* argument 4 */
  __u32 edi;	/* argument 5 */
  __u32 ebp;	/* argument 6 */
};
#else
/* The type missing on other architectures is the register block, not the
   (architecture-independent) parameter union.  */
# error "need to define __NR_indirect and struct indirect_registers"
#endif

/* Assign VAR a struct indirect_registers built from VALUES, given in field
   order: the system call number first, then its arguments.  Fields without
   a value are zero-initialized by the compound literal.  Uses the GNU
   named-variadic macro syntax (values...).  */
#define FILL_IN(var, values...) \
  var = (struct indirect_registers) { values }

/* Test driver for sys_indirect.

   Creates a plain socket as a baseline, then creates a socket, a signalfd
   and an eventfd through sys_indirect with extra file flags in the
   parameter block, and checks with fcntl() that the flags took effect.

   Returns 0 if every check passes and 1 otherwise.  */
int
main (void)
{
  /* Baseline: a socket created the ordinary way must have neither
     FD_CLOEXEC nor O_NONBLOCK set.  */
  int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP);
  int s1 = fcntl (fd, F_GETFD);
  int t1 = fcntl (fd, F_GETFL);
  printf ("old: FD_CLOEXEC %s set, NONBLOCK %s set\n",
	  s1 == 0 ? "not" : "is", (t1 & O_NONBLOCK) ? "is" : "not");
  close (fd);

  /* Parameter block copied by sys_indirect into the task for the duration
     of the wrapped system call.  */
  union indirect_params i;
  i.file_flags.flags = O_CLOEXEC|O_NONBLOCK;

  struct indirect_registers r;
#ifdef __NR_socketcall
  /* Architectures which multiplex the socket calls pass the real arguments
     through an array in memory.  */
# define SOCKOP_socket   1
  long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP };
  FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args);
#else
  FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP);
#endif

  /* sys_indirect takes (registers, params, params length, flags); the
     kernel rejects any FLAGS value other than zero, so pass it explicitly
     instead of leaving the fourth argument undefined.  */
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s2 = fcntl (fd, F_GETFD);
  int t2 = fcntl (fd, F_GETFL);
  printf ("new: FD_CLOEXEC %s set, NONBLOCK %s set\n",
	  s2 == 0 ? "not" : "is", (t2 & O_NONBLOCK) ? "is" : "not");
  close (fd);

  /* signalfd: ufd == -1 requests a new descriptor; 8 is the size in bytes
     of the kernel's signal mask.  */
  i.file_flags.flags = O_CLOEXEC;
  sigset_t ss;
  sigemptyset (&ss);
  FILL_IN (r, __NR_signalfd, -1, (long) &ss, 8);
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s3 = fcntl (fd, F_GETFD);
  printf ("signalfd: FD_CLOEXEC %s set\n", s3 == 0 ? "not" : "is");
  close (fd);

  /* eventfd with an initial count of 8; the flags are still O_CLOEXEC.  */
  FILL_IN (r, __NR_eventfd, 8);
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s4 = fcntl (fd, F_GETFD);
  printf ("eventfd: FD_CLOEXEC %s set\n", s4 == 0 ? "not" : "is");
  close (fd);

  /* F_GETFL also reports the access mode (the socket code sets O_RDWR), so
     only the O_NONBLOCK bit may be tested.  A negative value means fcntl()
     failed, e.g. because the call that should have produced the descriptor
     failed, and must count as a test failure rather than as "flag set".  */
  int old_ok = s1 == 0 && t1 >= 0 && (t1 & O_NONBLOCK) == 0;
  int new_ok = s2 > 0 && t2 >= 0 && (t2 & O_NONBLOCK) != 0;
  int anon_ok = s3 > 0 && s4 > 0;

  return !(old_ok && new_ok && anon_ok);
}


Signed-off-by: Ulrich Drepper <[EMAIL PROTECTED]>


 arch/x86/ia32/Makefile |1 
 arch/x86/ia32/ia32entry.S  |2 +
 arch/x86/ia32/sys_ia32.c   |   37 +-
 arch/x86/kernel/syscall_table_32.S |1 
 include/asm-um/indirect.h  |6 +
 include/asm-x86/ia32_unistd.h  |1 
 include/asm-x86/indirect.h |5 
 include/asm-x86/indirect_32.h  |   23 

[PATCHv4 6/6] FD_CLOEXEC support for eventfd, signalfd, timerfd

2007-11-19 Thread Ulrich Drepper
This patch adds support to set the FD_CLOEXEC flag for the file descriptors
returned by eventfd, signalfd, timerfd.

 fs/anon_inodes.c  |   15 +++
 fs/eventfd.c  |5 +++--
 fs/signalfd.c |6 --
 fs/timerfd.c  |6 --
 include/asm-x86/ia32_unistd.h |3 +++
 include/linux/anon_inodes.h   |3 +++
 include/linux/indirect.h  |3 +++
 7 files changed, 31 insertions(+), 10 deletions(-)

--- fs/anon_inodes.c
+++ fs/anon_inodes.c
@@ -70,9 +70,9 @@ static struct dentry_operations 
anon_inodefs_dentry_operations = {
  * hence saving memory and avoiding code duplication for the file/inode/dentry
  * setup.
  */
-int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
-const char *name, const struct file_operations *fops,
-void *priv)
+int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file 
**pfile,
+  const char *name, const struct file_operations *fops,
+  void *priv, int flags)
 {
struct qstr this;
struct dentry *dentry;
@@ -85,7 +85,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct 
file **pfile,
if (!file)
return -ENFILE;
 
-   error = get_unused_fd();
+   error = get_unused_fd_flags(flags);
if (error < 0)
goto err_put_filp;
fd = error;
@@ -138,6 +138,13 @@ err_put_filp:
put_filp(file);
return error;
 }
+
+int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
+const char *name, const struct file_operations *fops,
+void *priv)
+{
+   return anon_inode_getfd_flags(pfd, pinode, pfile, name, fops, priv, 0);
+}
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
--- fs/eventfd.c
+++ fs/eventfd.c
@@ -215,8 +215,9 @@ asmlinkage long sys_eventfd(unsigned int count)
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(, , , "[eventfd]",
-_fops, ctx);
+   error = anon_inode_getfd_flags(, , , "[eventfd]",
+  _fops, ctx,
+  INDIRECT_PARAM(file_flags, flags));
if (!error)
return fd;
 
--- fs/signalfd.c
+++ fs/signalfd.c
@@ -224,8 +224,10 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user 
*user_mask, size_t sizemas
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(, , , "[signalfd]",
-_fops, ctx);
+   error = anon_inode_getfd_flags(, , ,
+  "[signalfd]", _fops,
+  ctx, INDIRECT_PARAM(file_flags,
+  flags));
if (error)
goto err_fdalloc;
} else {
--- fs/timerfd.c
+++ fs/timerfd.c
@@ -182,8 +182,10 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int 
flags,
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(, , , "[timerfd]",
-_fops, ctx);
+   error = anon_inode_getfd_flags(, , , "[timerfd]",
+  _fops, ctx,
+  INDIRECT_PARAM(file_flags,
+ flags));
if (error)
goto err_tmrcancel;
} else {
--- include/asm-x86/ia32_unistd.h
+++ include/asm-x86/ia32_unistd.h
@@ -15,5 +15,8 @@
 #define __NR_ia32_socketcall   102
 #define __NR_ia32_sigreturn119
 #define __NR_ia32_rt_sigreturn 173
+#define __NR_ia32_signalfd 321
+#define __NR_ia32_timerfd  322
+#define __NR_ia32_eventfd  323
 
 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */
--- include/linux/anon_inodes.h
+++ include/linux/anon_inodes.h
@@ -8,6 +8,9 @@
 #ifndef _LINUX_ANON_INODES_H
 #define _LINUX_ANON_INODES_H
 
+int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file 
**pfile,
+  const char *name, const struct file_operations *fops,
+  void *priv, int flags);
 int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
 const char *name, const struct file_operations *fops,
 void *priv);
--- include/linux/indirect.h
+++ include/linux/indirect.h
@@ -35,5 +35,8 @@ union indirect_params {
 #if INDSYSCALL(socketpair)
   case 

Re: [PATCHv3 0/4] sys_indirect system call

2007-11-19 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

H. Peter Anvin wrote:
> What bothers me about the sys_indirect approach is that it will get
> increasingly expensive as time goes on, and in doing so it does a
> user-space memory reference, which are extra expensive.  The extra table
> can be colocated with the main table (a structure, in effect) so they'll
> share the same cache line.

You assume that using sys_indirect will be the norm.  It won't.  We
mustn't design system calls deliberately wrong so that they require the
indirection.

Besides, if the number of syscalls which have to be handled this way grows,
we can use something more efficient for a large number of tests than a
switch statement.  It could even be a word next to the system call table.

But I still don't see that the magic encoding is a valid solution, it
doesn't address the limited parameter number.  Plus, using sys_indirect
could in future be used to transport entire parameters (like a sigset_t)
along with other information, thereby saving individual copy operations.

I think the sys_indirect approach is the way forward.  I'll submit a
last version of the patch in a bit.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQlRw2ijCOnn/RHQRApifAKDE1nZqRbm4cJxbhobBb7jCx1T00QCgiSa0
EXKjL2Gwu3atSLSD+Rb4yO4=
=6ZGt
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-19 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Eric Dumazet wrote:
> So when you recompile your old program (as you post it and as I commented on),
> it will pass a >= 12 bytes data to kernel, with only first 4 bytes set to 
> O_CLOEXEC.
> 
> Other bytes will contain junk 

If you don't initialize the entire structure and you use it all, of
course you get undefined behavior.  That's nothing new.  The program I
attached is not an example, it's a test for the functionality in this patch.

Like with every kernel interface, you have to use it correctly.  The
good news is that user programs should never use this syscall directly
(just like they don't for existing ones).

I see no problem at all here.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQbBH2ijCOnn/RHQRAkc3AKCxVTWQ3BiQnCBwdbAsT122QWWaiwCggKXN
Z5Sz9/NFojMHZXXTzIMoxX4=
=slte
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-19 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Eric Dumazet wrote:
>>   union indirect_params i;
>>   i.file_flags.flags = O_CLOEXEC;
> 
> This setup forbids future addons to file_flags
> 
> In three years, when we want to add a new indirect feature to socket() 
> call, do we need a new indirect2() syscall ?

No, it doesn't.  The setup is indefinitely expandable.

All you need to do, if it becomes necessary to have more than an int, is
to define a little structure for the system call and then use it.  The
only requirement is that the code has to assume a value of zero is what
is used today.  That's the whole point.

union indirect_params {
  struct {
int flags;
  } file_flags;
  struct {
int flags;
int new_syscall_data1;
sigset_t and_a_sigmask;
  } new_data;
};

Old programs will set only the 'flags' member of 'new_data' while new
ones can also set the new elements.  New programs on old kernels will
either have failing calls since the structure is too big or the call will
not have all the desired effects.  The latter can be tested for.


> Or better, you could avoid using 'union indirect_params' in user code, and 
> only use the substructs for each function.

There is no overhead introduced through the union.  The only reason the
union is there in the first place is to allocate sufficient data in
task_struct to cover all cases.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQafd2ijCOnn/RHQRAlSFAJ99lahwCDZGRSlIHCov5bWowrpoiQCgwvW4
LDSEusNUpMfIE1ywBCRDBfc=
=ChVT
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-19 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Eric Dumazet wrote:
>>   union indirect_params i;
>>   i.file_flags.flags = O_CLOEXEC;
> 
> This setup forbids future addons to file_flags
> 
> In three years, when we want to add a new indirect feature to socket() 
> call, do we need a new indirect2() syscall ?

No, it doesn't.  The setup is indefinitely expandable.

All you need to do, if it becomes necessary to have more than an int, is
to define a little structure for the system call and then use it.  The
only requirement is that the code has to assume a value of zero is what
is used today.  That's the whole point.

union indirect_params {
  struct {
int flags;
  } file_flags;
  struct {
int flags;
int new_syscall_data1;
sigset_t and_a_sigmask;
  } new_data;
};

Old programs will set only the 'flags' member of 'new_data' while new
ones can also set the new elements.  New programs on old kernels will
either have failing calls since the structure is too big or the call will
not have all the desired effects.  The latter can be tested for.


> Or better, you could avoid using 'union indirect_params' in user code, and 
> only use the substructs for each function.

There is no overhead introduced through the union.  The only reason the
union is there in the first place is to allocate sufficient data in
task_struct to cover all cases.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQafd2ijCOnn/RHQRAlSFAJ99lahwCDZGRSlIHCov5bWowrpoiQCgwvW4
LDSEusNUpMfIE1ywBCRDBfc=
=ChVT
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-19 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Eric Dumazet wrote:
> So when you recompile your old program (as you post it and as I commented on),
> it will pass a >= 12 bytes data to kernel, with only first 4 bytes set to 
> O_CLOEXEC.
> 
> Other bytes will contain junk 

If you don't initialize the entire structure and you use it all, of
course you get undefined behavior.  That's nothing new.  The program I
attached is not an example, it's a test for the functionality in this patch.

Like with every kernel interface, you have to use it correctly.  The
good news is that user programs should never use this syscall directly
(just like they don't for existing ones).

I see no problem at all here.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQbBH2ijCOnn/RHQRAkc3AKCxVTWQ3BiQnCBwdbAsT122QWWaiwCggKXN
Z5Sz9/NFojMHZXXTzIMoxX4=
=slte
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-19 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

H. Peter Anvin wrote:
> What bothers me about the sys_indirect approach is that it will get
> increasingly expensive as time goes on, and in doing so it does a
> user-space memory reference, which are extra expensive.  The extra table
> can be colocated with the main table (a structure, in effect) so they'll
> share the same cache line.

You assume that using sys_indirect will be the norm.  It won't.  We
mustn't design system calls deliberately wrong so that they require the
indirection.

Besides, if the number of syscalls which have to be handled this way grows,
we can use something more efficient for a large number of tests than a
switch statement.  It could even be a word next to the system call table.

But I still don't see that the magic encoding is a valid solution, it
doesn't address the limited parameter number.  Plus, using sys_indirect
could in future be used to transport entire parameters (like a sigset_t)
along with other information, thereby saving individual copy operations.

I think the sys_indirect approach is the way forward.  I'll submit a
last version of the patch in a bit.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHQlRw2ijCOnn/RHQRApifAKDE1nZqRbm4cJxbhobBb7jCx1T00QCgiSa0
EXKjL2Gwu3atSLSD+Rb4yO4=
=6ZGt
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv4 4/6] Allow setting FD_CLOEXEC flag for new sockets

2007-11-19 Thread Ulrich Drepper
This is a first user of sys_indirect.  Several of the socket-related system
calls which produce a file handle now can be passed an additional parameter
to set the FD_CLOEXEC flag.

 arch/x86/ia32/Makefile|1 +
 arch/x86/ia32/sys_ia32.c  |4 
 include/asm-x86/ia32_unistd.h |1 +
 include/linux/indirect.h  |   33 +
 kernel/Makefile   |2 ++
 kernel/indirect.c |4 
 net/socket.c  |   21 +
 7 files changed, 58 insertions(+), 8 deletions(-)

--- arch/x86/ia32/Makefile
+++ arch/x86/ia32/Makefile
@@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg 
$(obj)/vsyscall-syscall.so.dbg: \
 $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
 
+CFLAGS_sys_ia32.o = -Wno-undef
 AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
 
--- kernel/Makefile
+++ kernel/Makefile
@@ -67,6 +67,8 @@ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+CFLAGS_indirect.o = -Wno-undef
+
 $(obj)/configs.o: $(obj)/config_data.h
 
 # config_data.h contains the same information as ikconfig.h but gzipped.
diff -u net/socket.c net/socket.c
--- net/socket.c
+++ net/socket.c
@@ -344,11 +344,11 @@
  * but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
int fd;
 
-   fd = get_unused_fd();
+   fd = get_unused_fd_flags(flags);
if (likely(fd = 0)) {
struct file *file = get_empty_filp();
 
@@ -391,10 +391,10 @@
return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+static int sock_map_fd_flags(struct socket *sock, int flags)
 {
struct file *newfile;
-   int fd = sock_alloc_fd(newfile);
+   int fd = sock_alloc_fd(newfile, flags);
 
if (likely(fd = 0)) {
int err = sock_attach_fd(sock, newfile);
@@ -409,6 +409,11 @@
return fd;
 }
 
+int sock_map_fd(struct socket *sock)
+{
+   return sock_map_fd_flags(sock, 0);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
if (file-f_op == socket_file_ops)
@@ -1208,7 +1213,7 @@
if (retval  0)
goto out;
 
-   retval = sock_map_fd(sock);
+   retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags));
if (retval  0)
goto out_release;
 
@@ -1249,13 +1254,13 @@
if (err  0)
goto out_release_both;
 
-   fd1 = sock_alloc_fd(newfile1);
+   fd1 = sock_alloc_fd(newfile1, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd1  0)) {
err = fd1;
goto out_release_both;
}
 
-   fd2 = sock_alloc_fd(newfile2);
+   fd2 = sock_alloc_fd(newfile2, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd2  0)) {
err = fd2;
put_filp(newfile1);
@@ -1411,7 +1416,7 @@
 */
__module_get(newsock-ops-owner);
 
-   newfd = sock_alloc_fd(newfile);
+   newfd = sock_alloc_fd(newfile, INDIRECT_PARAM(file_flags, flags));
if (unlikely(newfd  0)) {
err = newfd;
sock_release(newsock);
diff -u arch/x86/ia32/sys_ia32.c arch/x86/ia32/sys_ia32.c
--- arch/x86/ia32/sys_ia32.c
+++ arch/x86/ia32/sys_ia32.c
@@ -902,6 +902,10 @@
 
switch (INDIRECT_SYSCALL32(regs))
{
+#define INDSYSCALL(name) __NR_ia32_##name
+#include linux/indirect.h
+   break;
+
default:
return -EINVAL;
}
diff -u include/linux/indirect.h include/linux/indirect.h
--- include/linux/indirect.h
+++ include/linux/indirect.h
@@ -1,6 +1,39 @@
+#ifndef INDSYSCALL
 #ifndef _LINUX_INDIRECT_H
 #define _LINUX_INDIRECT_H
 
 #include asm/indirect.h
 
+
+union indirect_params {
+  struct {
+int flags;
+  } file_flags;
+};
+
+#define INDIRECT_PARAM(set, name) current-indirect_params.set.name
+
+#endif
+#else
+
+/* Here comes the list of system calls which can be called through
+   sys_indirect.  When the list of supported system calls is needed the
+   file including this header is supposed to define a macro INDSYSCALL
+   which adds a prefix fitting to the use.  If the resulting macro is
+   defined we generate a line
+   case MACRO:
+   */
+#if INDSYSCALL(accept)
+  case INDSYSCALL(accept):
+#endif
+#if INDSYSCALL(socket)
+  case INDSYSCALL(socket):
+#endif
+#if INDSYSCALL(socketcall)
+  case INDSYSCALL(socketcall):
+#endif
+#if INDSYSCALL(socketpair)
+  case INDSYSCALL(socketpair):
+#endif
+
 #endif
diff -u kernel/indirect.c kernel/indirect.c
--- kernel/indirect.c
+++ kernel/indirect.c
@@ -19,6 +19,10 @@
 
switch (INDIRECT_SYSCALL (regs))
{
+#define INDSYSCALL(name) __NR_##name
+#include linux/indirect.h
+   break;
+
default:
return 

[PATCHv4 1/6] actual sys_indirect code

2007-11-19 Thread Ulrich Drepper
This is the actual architecture-independent part of the system call
implementation.

 include/linux/indirect.h |6 ++
 include/linux/sched.h|4 
 include/linux/syscalls.h |4 
 kernel/Makefile  |2 +-
 kernel/indirect.c|   36 
 5 files changed, 51 insertions(+), 1 deletion(-)

--- /dev/null
+++ include/linux/indirect.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_INDIRECT_H
+#define _LINUX_INDIRECT_H
+
+#include asm/indirect.h
+
+#endif
--- include/linux/sched.h
+++ include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include linux/rcupdate.h
 #include linux/futex.h
 #include linux/rtmutex.h
+#include linux/indirect.h
 
 #include linux/time.h
 #include linux/param.h
@@ -1174,6 +1175,9 @@ struct task_struct {
int make_it_fail;
 #endif
struct prop_local_single dirties;
+
+   /* Additional system call parameters.  */
+   union indirect_params indirect_params;
 };
 
 /*
--- include/linux/syscalls.h
+++ include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct indirect_registers;
 
 #include linux/types.h
 #include linux/aio_abi.h
@@ -611,6 +612,9 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
const struct itimerspec __user *utmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags);
 
 int kernel_execve(const char *filename, char *const argv[], char *const 
envp[]);
 
--- /dev/null
+++ kernel/indirect.c
@@ -0,0 +1,36 @@
+#include linux/sched.h
+#include linux/uaccess.h
+#include linux/unistd.h
+#include asm/asm-offsets.h
+
+
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags)
+{
+   struct indirect_registers regs;
+   long result;
+
+   if (unlikely(flags != 0))
+   return -EINVAL;
+
+   if (copy_from_user(regs, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL (regs))
+   {
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen  sizeof(union indirect_params))
+   return -EINVAL;
+
+   result = -EFAULT;
+   if (!copy_from_user(current-indirect_params, userparams, paramslen))
+   result = CALL_INDIRECT(regs);
+
+   memset(current-indirect_params, '\0', paramslen);
+
+   return result;
+}
--- kernel/Makefile
+++ kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o 
profile.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
-   utsname.o notifier.o
+   utsname.o notifier.o indirect.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv4 3/6] UML support for sys_indirect

2007-11-19 Thread Ulrich Drepper
This part adds support for sys_indirect for UML.

 indirect.h |6 ++
 1 file changed, 6 insertions(+)

--- /dev/null
+++ include/asm-um/indirect.h
@@ -0,0 +1,6 @@
+#ifndef __UM_INDIRECT_H
+#define __UM_INDIRECT_H
+
+#include asm/arch/indirect.h
+
+#endif
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv4 2/6] x86 & x86-64 support for sys_indirect

2007-11-19 Thread Ulrich Drepper
This part adds support for sys_indirect on x86 and x86-64.

 arch/x86/ia32/ia32entry.S  |2 ++
 arch/x86/ia32/sys_ia32.c   |   31 +++
 arch/x86/kernel/syscall_table_32.S |1 +
 include/asm-x86/indirect.h |5 +
 include/asm-x86/indirect_32.h  |   23 +++
 include/asm-x86/indirect_64.h  |   34 ++
 include/asm-x86/unistd_32.h|3 ++-
 include/asm-x86/unistd_64.h|2 ++
 8 files changed, 100 insertions(+), 1 deletion(-)

--- arch/x86/ia32/ia32entry.S
+++ arch/x86/ia32/ia32entry.S
@@ -400,6 +400,7 @@ END(ia32_ptregs_common)
 
.section .rodata,a
.align 8
+   .globl ia32_sys_call_table
 ia32_sys_call_table:
.quad sys_restart_syscall
.quad sys_exit
@@ -726,4 +727,5 @@ ia32_sys_call_table:
.quad compat_sys_timerfd
.quad sys_eventfd
.quad sys32_fallocate
+   .quad sys32_indirect/* 325  */
 ia32_syscall_end:
--- arch/x86/ia32/sys_ia32.c
+++ arch/x86/ia32/sys_ia32.c
@@ -887,3 +887,37 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned 
offset_lo,
return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
 ((u64)len_hi << 32) | len_lo);
 }
+
+asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs,
+  void __user *userparams, size_t paramslen,
+  int flags)
+{
+   extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32);
+
+   struct indirect_registers32 regs;
+   long result;
+
+   if (flags != 0)
+   return -EINVAL;
+
+   if (copy_from_user(&regs, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL32(&regs))
+   {
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen > sizeof(union indirect_params))
+   return -EINVAL;
+   result = -EFAULT;
+   if (!copy_from_user(&current->indirect_params, userparams, paramslen))
+   result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx,
+  regs.edx, regs.esi,
+  regs.edi, regs.ebp);
+
+   memset(&current->indirect_params, '\0', paramslen);
+
+   return result;
+}
--- arch/x86/kernel/syscall_table_32.S
+++ arch/x86/kernel/syscall_table_32.S
@@ -324,3 +324,4 @@ ENTRY(sys_call_table)
.long sys_timerfd
.long sys_eventfd
.long sys_fallocate
+   .long sys_indirect  /* 325 */
--- /dev/null
+++ include/asm-x86/indirect_32.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_INDIRECT_32_H
+#define _ASM_X86_INDIRECT_32_H
+
+struct indirect_registers {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)->eax
+
+#define CALL_INDIRECT(regs) \
+  ({ extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, 
__u32); \
+ sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->ebx, (regs)->ecx, \
+(regs)->edx, (regs)->esi, \
+(regs)->edi, (regs)->ebp); \
+ })
+
+#endif
--- /dev/null
+++ include/asm-x86/indirect_64.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_INDIRECT_64_H
+#define _ASM_X86_INDIRECT_64_H
+
+struct indirect_registers {
+   __u64 rax;
+   __u64 rdi;
+   __u64 rsi;
+   __u64 rdx;
+   __u64 r10;
+   __u64 r8;
+   __u64 r9;
+};
+
+struct indirect_registers32 {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)->rax
+#define INDIRECT_SYSCALL32(regs) (regs)->eax
+
+#define CALL_INDIRECT(regs) \
+  ({ extern long (*sys_call_table[]) (__u64, __u64, __u64, __u64, __u64, 
__u64); \
+ sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->rdi, (regs)->rsi, \
+(regs)->rdx, (regs)->r10, \
+(regs)->r8, (regs)->r9); \
+ })
+
+#endif
--- /dev/null
+++ include/asm-x86/indirect.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "indirect_32.h"
+#else
+# include "indirect_64.h"
+#endif
--- include/asm-x86/unistd_32.h
+++ include/asm-x86/unistd_32.h
@@ -330,10 +330,11 @@
 #define __NR_timerfd   322
 #define __NR_eventfd   323
 #define __NR_fallocate 324
+#define __NR_indirect  325
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 325
+#define NR_syscalls 326
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
--- include/asm-x86/unistd_64.h
+++ include/asm-x86/unistd_64.h
@@ -635,6 +635,8 @@ __SYSCALL(__NR_timerfd, sys_timerfd)
 __SYSCALL(__NR_eventfd, sys_eventfd)
 #define __NR_fallocate  

[PATCHv4 0/6] sys_indirect system call

2007-11-19 Thread Ulrich Drepper

The following patches provide an alternative implementation of the
sys_indirect system call which has been discussed a few times.
This new system call allows us to extend existing system call
interfaces without adding more system calls.

Davide's previous implementation is IMO far more complex than
warranted.  This code here is trivial, as you can see.  I've
discussed this approach with Linus last week and for a brief moment
we actually agreed on something.

We pass an additional block of data to the kernel, it is copied into
the task_struct, and then it is up to the function implementing the system
call to interpret the data.  Each system call, which is meant to be
extended this way, has to be white-listed in sys_indirect.  The
alternative is to filter out those system calls which absolutely cannot
be handled using sys_indirect (like clone, execve) since they require
the stack layout of an ordinary system call.  This is more dangerous
since it is too easy to miss a call.

The code for x86 and x86-64 gets by without a single line of assembly
code.  This is likely to be true for most/all the other archs as well.
There is architecture-dependent code, though.  For x86 and x86-64 I've
also fixed up UML (although only x86-64 is tested, that's my setup).

The last three patches show the first application of the functionality.
They also show a complication: we need the test for valid sub-syscalls in the
main implementation and in the compatibility code.  And more: the actual
sources and generated binary for the test are very different (the numbers
differ).  Duplicating the information is a big problem, though.  I've used
some macro tricks to avoid this.  All the information about the flags and
the system calls using them is concentrated in one header.  This should
make maintenance bearable.

This patch to use sys_indirect is just the beginning.  More will follow,
but I want to see how these patches are received before I spend more time
on it.  This code is enough to test the implementation with the following
test program.  Adjust it for architectures other than x86 and x86-64.


#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/syscall.h>

/* Kernel-style fixed-width aliases so the register structs in this test
   can be spelled the same way as in the kernel headers of the patch.  */
typedef uint32_t __u32;
typedef uint64_t __u64;

/* Userspace copy of the extra-parameter block whose address is handed to
   sys_indirect as its second argument.  Only the file_flags member is
   defined so far.  */
union indirect_params {
  struct {
int flags;   /* main() stores O_CLOEXEC, optionally or-ed with O_NONBLOCK */
  } file_flags;
};

/* Per-architecture pieces of the test: the system call number assigned to
   sys_indirect and the register block describing the wrapped system call.
   The first member holds the number of the wrapped system call, the other
   six its arguments in the order the architecture passes them.  */
#ifdef __x86_64__
# define __NR_indirect 286
struct indirect_registers {
  __u64 rax;   /* wrapped system call number */
  __u64 rdi;   /* argument 1 */
  __u64 rsi;   /* argument 2 */
  __u64 rdx;   /* argument 3 */
  __u64 r10;   /* argument 4 */
  __u64 r8;    /* argument 5 */
  __u64 r9;    /* argument 6 */
};
#elif defined __i386__
# define __NR_indirect 325
struct indirect_registers {
  __u32 eax;   /* wrapped system call number */
  __u32 ebx;   /* argument 1 */
  __u32 ecx;   /* argument 2 */
  __u32 edx;   /* argument 3 */
  __u32 esi;   /* argument 4 */
  __u32 edi;   /* argument 5 */
  __u32 ebp;   /* argument 6 */
};
#else
/* union indirect_params is architecture independent; what a new port has
   to supply is the register layout and the system call number.  */
# error "need to define __NR_indirect and struct indirect_registers"
#endif

/* Overwrite VAR with a struct indirect_registers built from the listed
   initializers, taken in member order.  Members without an initializer
   become zero, as for any compound literal.  */
#define FILL_IN(var, ...) \
  ((var) = (struct indirect_registers) { __VA_ARGS__ })

/* Exercise sys_indirect.

   A socket created the ordinary way serves as the baseline: neither
   FD_CLOEXEC nor O_NONBLOCK may be set on it.  Then socket, signalfd and
   eventfd are invoked through sys_indirect with flags in the extra
   parameter block, and the resulting descriptors are checked for them.

   Returns 0 if every check passed and 1 otherwise.  */
int
main (void)
{
  int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP);
  int s1 = fcntl (fd, F_GETFD);
  int t1 = fcntl (fd, F_GETFL);
  printf ("old: FD_CLOEXEC %s set, NONBLOCK %s set\n",
          s1 == 0 ? "not" : "is", (t1 & O_NONBLOCK) ? "is" : "not");
  close (fd);

  union indirect_params i;
  i.file_flags.flags = O_CLOEXEC|O_NONBLOCK;

  struct indirect_registers r;
#ifdef __NR_socketcall
  /* Architectures which multiplex the socket calls take the sub-call
     number plus a pointer to the argument array.  */
# define SOCKOP_socket   1
  long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP };
  FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args);
#else
  FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP);
#endif

  /* sys_indirect takes four arguments.  The last one is a flags word
     which the kernel requires to be zero; leaving it out passes whatever
     happens to be in the argument register.  */
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s2 = fcntl (fd, F_GETFD);
  int t2 = fcntl (fd, F_GETFL);
  printf ("new: FD_CLOEXEC %s set, NONBLOCK %s set\n",
          s2 == 0 ? "not" : "is", (t2 & O_NONBLOCK) ? "is" : "not");
  close (fd);

  i.file_flags.flags = O_CLOEXEC;
  sigset_t ss;
  sigemptyset (&ss);
  /* 8 is the size in bytes of the signal mask handed to signalfd.  */
  FILL_IN (r, __NR_signalfd, -1, (long) &ss, 8);
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s3 = fcntl (fd, F_GETFD);
  printf ("signalfd: FD_CLOEXEC %s set\n", s3 == 0 ? "not" : "is");
  close (fd);

  FILL_IN (r, __NR_eventfd, 8);
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s4 = fcntl (fd, F_GETFD);
  printf ("eventfd: FD_CLOEXEC %s set\n", s4 == 0 ? "not" : "is");
  close (fd);

  /* F_GETFL also reports the access mode (O_RDWR for a socket), so only
     the O_NONBLOCK bit may be tested; comparing the whole word with zero
     would make the test fail unconditionally.  */
  return (s1 != 0 || s2 == 0
          || (t1 & O_NONBLOCK) != 0 || (t2 & O_NONBLOCK) == 0
          || s3 == 0 || s4 == 0);
}


Signed-off-by: Ulrich Drepper [EMAIL PROTECTED]


 arch/x86/ia32/Makefile |1 
 arch/x86/ia32/ia32entry.S  |2 +
 arch/x86/ia32/sys_ia32.c   |   37 +-
 arch/x86/kernel/syscall_table_32.S |1 
 include/asm-um/indirect.h  |6 +
 include/asm-x86/ia32_unistd.h  |1 
 include/asm-x86/indirect.h |5 
 include/asm-x86/indirect_32.h  |   23 +
 include/asm-x86/indirect_64.h  |   34 +++
 include/asm-x86/unistd_32.h|3

[PATCHv4 6/6] FD_CLOEXEC support for eventfd, signalfd, timerfd

2007-11-19 Thread Ulrich Drepper
This patch adds support to set the FD_CLOEXEC flag for the file descriptors
returned by eventfd, signalfd, timerfd.

 fs/anon_inodes.c  |   15 +++
 fs/eventfd.c  |5 +++--
 fs/signalfd.c |6 --
 fs/timerfd.c  |6 --
 include/asm-x86/ia32_unistd.h |3 +++
 include/linux/anon_inodes.h   |3 +++
 include/linux/indirect.h  |3 +++
 7 files changed, 31 insertions(+), 10 deletions(-)

--- fs/anon_inodes.c
+++ fs/anon_inodes.c
@@ -70,9 +70,9 @@ static struct dentry_operations 
anon_inodefs_dentry_operations = {
  * hence saving memory and avoiding code duplication for the file/inode/dentry
  * setup.
  */
-int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
-const char *name, const struct file_operations *fops,
-void *priv)
+int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file 
**pfile,
+  const char *name, const struct file_operations *fops,
+  void *priv, int flags)
 {
struct qstr this;
struct dentry *dentry;
@@ -85,7 +85,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct 
file **pfile,
if (!file)
return -ENFILE;
 
-   error = get_unused_fd();
+   error = get_unused_fd_flags(flags);
if (error < 0)
goto err_put_filp;
fd = error;
@@ -138,6 +138,13 @@ err_put_filp:
put_filp(file);
return error;
 }
+
+int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
+const char *name, const struct file_operations *fops,
+void *priv)
+{
+   return anon_inode_getfd_flags(pfd, pinode, pfile, name, fops, priv, 0);
+}
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
--- fs/eventfd.c
+++ fs/eventfd.c
@@ -215,8 +215,9 @@ asmlinkage long sys_eventfd(unsigned int count)
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]",
-&eventfd_fops, ctx);
+   error = anon_inode_getfd_flags(&fd, &inode, &file, "[eventfd]",
+  &eventfd_fops, ctx,
+  INDIRECT_PARAM(file_flags, flags));
if (!error)
return fd;
 
--- fs/signalfd.c
+++ fs/signalfd.c
@@ -224,8 +224,10 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user 
*user_mask, size_t sizemas
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(&ufd, &inode, &file, "[signalfd]",
-&signalfd_fops, ctx);
+   error = anon_inode_getfd_flags(&ufd, &inode, &file,
+  "[signalfd]", &signalfd_fops,
+  ctx, INDIRECT_PARAM(file_flags,
+  flags));
if (error)
goto err_fdalloc;
} else {
--- fs/timerfd.c
+++ fs/timerfd.c
@@ -182,8 +182,10 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int 
flags,
 * When we call this, the initialization must be complete, since
 * anon_inode_getfd() will install the fd.
 */
-   error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]",
-&timerfd_fops, ctx);
+   error = anon_inode_getfd_flags(&ufd, &inode, &file, "[timerfd]",
+  &timerfd_fops, ctx,
+  INDIRECT_PARAM(file_flags,
+ flags));
if (error)
goto err_tmrcancel;
} else {
--- include/asm-x86/ia32_unistd.h
+++ include/asm-x86/ia32_unistd.h
@@ -15,5 +15,8 @@
 #define __NR_ia32_socketcall   102
 #define __NR_ia32_sigreturn119
 #define __NR_ia32_rt_sigreturn 173
+#define __NR_ia32_signalfd 321
+#define __NR_ia32_timerfd  322
+#define __NR_ia32_eventfd  323
 
 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */
--- include/linux/anon_inodes.h
+++ include/linux/anon_inodes.h
@@ -8,6 +8,9 @@
 #ifndef _LINUX_ANON_INODES_H
 #define _LINUX_ANON_INODES_H
 
+int anon_inode_getfd_flags(int *pfd, struct inode **pinode, struct file 
**pfile,
+  const char *name, const struct file_operations *fops,
+  void *priv, int flags);
 int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
 const char *name, const struct file_operations *fops,
 void *priv);
--- include/linux/indirect.h
+++ 

[PATCHv4 5/6] Allow setting O_NONBLOCK flag for new sockets

2007-11-19 Thread Ulrich Drepper
This patch adds support for setting the O_NONBLOCK flag of the file
descriptors returned by socket, socketpair, and accept.

 socket.c |   15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

--- net/socket.c
+++ net/socket.c
@@ -362,7 +362,7 @@ static int sock_alloc_fd(struct file **filep, int flags)
return fd;
 }
 
-static int sock_attach_fd(struct socket *sock, struct file *file)
+static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
 {
struct dentry *dentry;
struct qstr name = { .name = "" };
@@ -384,7 +384,7 @@ static int sock_attach_fd(struct socket *sock, struct file 
*file)
init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
  &socket_file_ops);
SOCK_INODE(sock)->i_fop = &socket_file_ops;
-   file->f_flags = O_RDWR;
+   file->f_flags = O_RDWR | (flags & O_NONBLOCK);
file->f_pos = 0;
file->private_data = sock;
 
@@ -397,7 +397,7 @@ static int sock_map_fd_flags(struct socket *sock, int flags)
int fd = sock_alloc_fd(&newfile, flags);
 
if (likely(fd >= 0)) {
-   int err = sock_attach_fd(sock, newfile);
+   int err = sock_attach_fd(sock, newfile, flags);
 
if (unlikely(err < 0)) {
put_filp(newfile);
@@ -1268,12 +1268,14 @@ asmlinkage long sys_socketpair(int family, int type, 
int protocol,
goto out_release_both;
}
 
-   err = sock_attach_fd(sock1, newfile1);
+   err = sock_attach_fd(sock1, newfile1,
+INDIRECT_PARAM(file_flags, flags));
if (unlikely(err  0)) {
goto out_fd2;
}
 
-   err = sock_attach_fd(sock2, newfile2);
+   err = sock_attach_fd(sock2, newfile2,
+INDIRECT_PARAM(file_flags, flags));
if (unlikely(err  0)) {
fput(newfile1);
goto out_fd1;
@@ -1423,7 +1425,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user 
*upeer_sockaddr,
goto out_put;
}
 
-   err = sock_attach_fd(newsock, newfile);
+   err = sock_attach_fd(newsock, newfile,
+INDIRECT_PARAM(file_flags, flags));
if (err  0)
goto out_fd_simple;
 
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-18 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

H. Peter Anvin wrote:
> It seems to me that we could accomplish the same thing by passing the
> number of parameters in the upper bits of the system call number
> register (%eax in the case of x86.)

This isn't really a generic solution.  The number of parameters is
limited to six.  There are syscalls with six parameters already.  There
are many more with five which could only handle one more parameter.

Also, is it really simpler?  You'd need to have another table which
contains the default number of parameters a system call takes so that
you can fill in the default value of zero.  This extra memory access has
to be performed for every system call.

I think it is unlikely that this approach is faster.  To the contrary,
I'd guess.

I don't have much invested into this but it seems the sys_indirect
approach is so much simpler.  Overhead is only paid if you really need
it which is rarely the case.  Plus, you might have heard Linus and Zack
talk about syslets again.  Starting syslets can be done using the same
interface, I guess.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHQAez2ijCOnn/RHQRAjoHAJ4/Qq4ygaZ4uq6uCIVNq4hfN1m2pACgpJFi
Z/vBsGFpUc/EUz+VW66jEIY=
=B19x
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 0/4] sys_indirect system call

2007-11-18 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

H. Peter Anvin wrote:
 It seems to me that we could accomplish the same thing by passing the
 number of parameters in the upper bits of the system call number
 register (%eax in the case of x86.)

This isn't really a generic solution.  The number of parameters is
limited to six.  There are syscalls with six parameters already.  There
are many more with five which could only handle one more parameter.

Also, is it really simpler?  You'd need to have another table which
contains the default number of parameters a system call takes so that
you can fill in the default value of zero.  This extra memory access has
to be performed for every system call.

I think it is unlikely that this approach is faster.  To the contrary,
I'd guess.

I don't have much invested into this but it seems the sys_indirect
approach is so much simpler.  Overhead is only paid if you really need
it which is rarely the case.  Plus, you might have heard Linus and Zack
talk about syslets again.  Starting syslets can be done using the same
interface, I guess.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHQAez2ijCOnn/RHQRAjoHAJ4/Qq4ygaZ4uq6uCIVNq4hfN1m2pACgpJFi
Z/vBsGFpUc/EUz+VW66jEIY=
=B19x
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/4] sys_indirect system call

2007-11-17 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Linus Torvalds wrote:
> Uli doesn't care that much about async syscalls, but I think that from a 
> kernel standpoint, we'd want to use this same indirect call for async 
> scheduling,

Note that I added a flags parameter to sys_indirect in the v3 patch.
This should allow you to add additional functionality like syslets
later.  Currently a zero value is enforced.  In future nonzero values
could also imply that the function takes additional parameters.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHPrDk2ijCOnn/RHQRAks1AJ43zF42Vy2ru2D8X3W13YlzYpazUQCfci37
wTKr35RIViiwkQWNMMCeMdk=
=Gmld
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/4] sys_indirect system call

2007-11-17 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Linus Torvalds wrote:
 Uli doesn't care that much about async syscalls, but I think that from a 
 kernel standpoint, we'd want to use this same indirect call for async 
 scheduling,

Note that I added a flags parameter to sys_indirect in the v3 patch.
This should allow you to add additional functionality like syslets
later.  Currently a zero value is enforced.  In future nonzero values
could also imply that the function takes additional parameters.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHPrDk2ijCOnn/RHQRAks1AJ43zF42Vy2ru2D8X3W13YlzYpazUQCfci37
wTKr35RIViiwkQWNMMCeMdk=
=Gmld
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv3 0/4] sys_indirect system call

2007-11-16 Thread Ulrich Drepper

The following patches provide an alternative implementation of the
sys_indirect system call which has been discussed a few times.
This new system call allows us to extend existing system call
interfaces without adding more system calls.

Davide's previous implementation is IMO far more complex than
warranted.  This code here is trivial, as you can see.  I've
discussed this approach with Linus last week and for a brief moment
we actually agreed on something.

We pass an additional block of data to the kernel, it is copied into
the task_struct, and then it is up to the function implementing the system
call to interpret the data.  Each system call, which is meant to be
extended this way, has to be white-listed in sys_indirect.  The
alternative is to filter out those system calls which absolutely cannot
be handled using sys_indirect (like clone, execve) since they require
the stack layout of an ordinary system call.  This is more dangerous
since it is too easy to miss a call.

The code for x86 and x86-64 gets by without a single line of assembly
code.  This is likely to be true for most/all the other archs as well.
There is architecture-dependent code, though.  For x86 and x86-64 I've
also fixed up UML (although only x86-64 is tested, that's my setup).

The last patch shows the first application of the functionality.  It also
shows a complication: we need the test for valid sub-syscalls in the
main implementation and in the compatibility code.  And more: the actual
sources and generated binary are very different (the numbers differ).
Duplicating the information is a big problem, though.  I've used some macro
tricks to avoid this.  All the information about the flags and the system
calls using them is concentrated in one header.  This should make maintenance
bearable.

This patch to use sys_indirect is just the beginning.  More will follow,
but I want to see how these patches are received before I spend more time
on it.  This code is enough to test the implementation with the following
test program.  Adjust it for architectures other than x86 and x86-64.


#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/syscall.h>

/* Kernel-style fixed-width aliases so the register structs in this test
   can be spelled the same way as in the kernel headers of the patch.  */
typedef uint32_t __u32;
typedef uint64_t __u64;

/* Userspace copy of the extra-parameter block whose address is handed to
   sys_indirect as its second argument.  Only the file_flags member is
   defined so far.  */
union indirect_params {
  struct {
int flags;   /* main() stores O_CLOEXEC here */
  } file_flags;
};

/* Per-architecture pieces of the test: the system call number assigned to
   sys_indirect and the register block describing the wrapped system call.
   The first member holds the number of the wrapped system call, the other
   six its arguments in the order the architecture passes them.  */
#ifdef __x86_64__
# define __NR_indirect 286
struct indirect_registers {
  __u64 rax;   /* wrapped system call number */
  __u64 rdi;   /* argument 1 */
  __u64 rsi;   /* argument 2 */
  __u64 rdx;   /* argument 3 */
  __u64 r10;   /* argument 4 */
  __u64 r8;    /* argument 5 */
  __u64 r9;    /* argument 6 */
};
#elif defined __i386__
# define __NR_indirect 325
struct indirect_registers {
  __u32 eax;   /* wrapped system call number */
  __u32 ebx;   /* argument 1 */
  __u32 ecx;   /* argument 2 */
  __u32 edx;   /* argument 3 */
  __u32 esi;   /* argument 4 */
  __u32 edi;   /* argument 5 */
  __u32 ebp;   /* argument 6 */
};
#else
/* union indirect_params is architecture independent; what a new port has
   to supply is the register layout and the system call number.  */
# error "need to define __NR_indirect and struct indirect_registers"
#endif

/* Overwrite VAR with a struct indirect_registers built from the listed
   initializers, taken in member order.  Members without an initializer
   become zero, as for any compound literal.  */
#define FILL_IN(var, ...) \
  ((var) = (struct indirect_registers) { __VA_ARGS__ })

/* Exercise sys_indirect.

   A socket created the ordinary way serves as the baseline and must not
   have FD_CLOEXEC set.  The same socket call is then issued through
   sys_indirect with O_CLOEXEC in the extra parameter block, and the new
   descriptor must have FD_CLOEXEC set.

   Returns 0 if both checks passed and 1 otherwise.  */
int
main (void)
{
  int fd = socket (AF_INET, SOCK_DGRAM, IPPROTO_IP);
  int s1 = fcntl (fd, F_GETFD);
  printf ("old: FD_CLOEXEC %s set\n", s1 == 0 ? "not" : "is");
  close (fd);

  union indirect_params i;
  i.file_flags.flags = O_CLOEXEC;

  struct indirect_registers r;
#ifdef __NR_socketcall
  /* Architectures which multiplex the socket calls take the sub-call
     number plus a pointer to the argument array.  */
# define SOCKOP_socket   1
  long args[3] = { AF_INET, SOCK_DGRAM, IPPROTO_IP };
  FILL_IN (r, __NR_socketcall, SOCKOP_socket, (long) args);
#else
  FILL_IN (r, __NR_socket, AF_INET, SOCK_DGRAM, IPPROTO_IP);
#endif

  /* Register block, parameter block, its size, and the flags word which
     the kernel currently requires to be zero.  */
  fd = syscall (__NR_indirect, &r, &i, sizeof (i), 0);
  int s2 = fcntl (fd, F_GETFD);
  printf ("new: FD_CLOEXEC %s set\n", s2 == 0 ? "not" : "is");
  close (fd);

  return s1 != 0 || s2 == 0;
}
~~~~~~~~~~~~

Signed-off-by: Ulrich Drepper <[EMAIL PROTECTED]>


 arch/x86/ia32/Makefile |1 
 arch/x86/ia32/ia32entry.S  |2 +
 arch/x86/ia32/sys_ia32.c   |   37 +-
 arch/x86/kernel/syscall_table_32.S |1 
 include/asm-um/indirect.h  |6 +
 include/asm-x86/ia32_unistd.h  |1 
 include/asm-x86/indirect.h |5 
 include/asm-x86/indirect_32.h  |   23 +
 include/asm-x86/indirect_64.h  |   34 +++
 include/asm-x86/unistd_32.h|3 +-
 include/asm-x86/unistd_64.h|2 +
 include/linux/indirect.h   |   39 
 include/linux/sched.h  |4 +++
 include/linux/syscalls.h   |6 -
 kernel/Makefile|4 ++-
 kernel/indirect.c  |   40 +
 net/socket.c   |   29 +++---
 17 files changed, 221 insertions(+), 16 deletions(-)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv3 3/4] UML support for sys_indirect

2007-11-16 Thread Ulrich Drepper
This part adds support for sys_indirect for UML.

 indirect.h |6 ++
 1 file changed, 6 insertions(+)

--- /dev/null
+++ include/asm-um/indirect.h
@@ -0,0 +1,6 @@
+#ifndef __UM_INDIRECT_H
+#define __UM_INDIRECT_H
+
+#include "asm/arch/indirect.h"
+
+#endif
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv3 2/4] x86 support for sys_indirect

2007-11-16 Thread Ulrich Drepper
This part adds support for sys_indirect on x86 and x86-64.

 arch/x86/ia32/ia32entry.S  |2 ++
 arch/x86/ia32/sys_ia32.c   |   31 +++
 arch/x86/kernel/syscall_table_32.S |1 +
 include/asm-x86/indirect.h |5 +
 include/asm-x86/indirect_32.h  |   23 +++
 include/asm-x86/indirect_64.h  |   34 ++
 include/asm-x86/unistd_32.h|3 ++-
 include/asm-x86/unistd_64.h|2 ++
 8 files changed, 100 insertions(+), 1 deletion(-)

--- arch/x86/ia32/ia32entry.S
+++ arch/x86/ia32/ia32entry.S
@@ -400,6 +400,7 @@ END(ia32_ptregs_common)
 
.section .rodata,"a"
.align 8
+   .globl ia32_sys_call_table
 ia32_sys_call_table:
.quad sys_restart_syscall
.quad sys_exit
@@ -726,4 +727,5 @@ ia32_sys_call_table:
.quad compat_sys_timerfd
.quad sys_eventfd
.quad sys32_fallocate
+   .quad sys32_indirect/* 325  */
 ia32_syscall_end:
--- arch/x86/ia32/sys_ia32.c
+++ arch/x86/ia32/sys_ia32.c
@@ -887,3 +887,34 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned 
offset_lo,
return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
 ((u64)len_hi << 32) | len_lo);
 }
+
+asmlinkage long sys32_indirect(struct indirect_registers32 __user *userregs,
+  void __user *userparams, size_t paramslen,
+  int flags)
+{
+   extern long (*ia32_sys_call_table[])(u32, u32, u32, u32, u32, u32);
+
+   struct indirect_registers32 regs;
+   long result;
+
+   if (copy_from_user(&regs, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL32(&regs))
+   {
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen > sizeof(union indirect_params))
+   return -EINVAL;
+   result = -EFAULT;
+   if (!copy_from_user(&current->indirect_params, userparams, paramslen))
+   result = ia32_sys_call_table[regs.eax](regs.ebx, regs.ecx,
+  regs.edx, regs.esi,
+  regs.edi, regs.ebp);
+
+   memset(&current->indirect_params, '\0', paramslen);
+
+   return result;
+}
--- arch/x86/kernel/syscall_table_32.S
+++ arch/x86/kernel/syscall_table_32.S
@@ -324,3 +324,4 @@ ENTRY(sys_call_table)
.long sys_timerfd
.long sys_eventfd
.long sys_fallocate
+   .long sys_indirect  /* 325 */
--- /dev/null
+++ include/asm-x86/indirect_32.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_INDIRECT_32_H
+#define _ASM_X86_INDIRECT_32_H
+
+struct indirect_registers {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)->eax
+
+#define CALL_INDIRECT(regs) \
+  ({ extern long (*sys_call_table[]) (__u32, __u32, __u32, __u32, __u32, 
__u32); \
+ sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->ebx, (regs)->ecx, \
+(regs)->edx, (regs)->esi, \
+(regs)->edi, (regs)->ebp); \
+ })
+
+#endif
--- /dev/null
+++ include/asm-x86/indirect_64.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_INDIRECT_64_H
+#define _ASM_X86_INDIRECT_64_H
+
+struct indirect_registers {
+   __u64 rax;
+   __u64 rdi;
+   __u64 rsi;
+   __u64 rdx;
+   __u64 r10;
+   __u64 r8;
+   __u64 r9;
+};
+
+struct indirect_registers32 {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   __u32 esi;
+   __u32 edi;
+   __u32 ebp;
+};
+
+#define INDIRECT_SYSCALL(regs) (regs)->rax
+#define INDIRECT_SYSCALL32(regs) (regs)->eax
+
+#define CALL_INDIRECT(regs) \
+  ({ extern long (*sys_call_table[]) (__u64, __u64, __u64, __u64, __u64, 
__u64); \
+ sys_call_table[INDIRECT_SYSCALL(regs)] ((regs)->rdi, (regs)->rsi, \
+(regs)->rdx, (regs)->r10, \
+(regs)->r8, (regs)->r9); \
+ })
+
+#endif
--- /dev/null
+++ include/asm-x86/indirect.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "indirect_32.h"
+#else
+# include "indirect_64.h"
+#endif
--- include/asm-x86/unistd_32.h
+++ include/asm-x86/unistd_32.h
@@ -330,10 +330,11 @@
 #define __NR_timerfd   322
 #define __NR_eventfd   323
 #define __NR_fallocate 324
+#define __NR_indirect  325
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 325
+#define NR_syscalls 326
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
--- include/asm-x86/unistd_64.h
+++ include/asm-x86/unistd_64.h
@@ -635,6 +635,8 @@ __SYSCALL(__NR_timerfd, sys_timerfd)
 __SYSCALL(__NR_eventfd, sys_eventfd)
 #define __NR_fallocate 285
 __SYSCALL(__NR_fallocate, sys_fallocate)

[PATCHv3 4/4] first use of sys_indirect system call

2007-11-16 Thread Ulrich Drepper
This is a first user of sys_indirect.  Several of the socket-related system
calls which produce a file handle now can be passed an additional parameter
to set the FD_CLOEXEC flag.

 arch/x86/ia32/Makefile|1 +
 arch/x86/ia32/sys_ia32.c  |4 
 include/asm-x86/ia32_unistd.h |1 +
 include/linux/indirect.h  |   33 +
 kernel/Makefile   |2 ++
 kernel/indirect.c |4 
 net/socket.c  |   21 +
 7 files changed, 58 insertions(+), 8 deletions(-)

--- arch/x86/ia32/Makefile
+++ arch/x86/ia32/Makefile
@@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg 
$(obj)/vsyscall-syscall.so.dbg: \
 $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
 
+CFLAGS_sys_ia32.o = -Wno-undef
 AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
 
--- kernel/Makefile
+++ kernel/Makefile
@@ -67,6 +67,8 @@ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+CFLAGS_indirect.o = -Wno-undef
+
 $(obj)/configs.o: $(obj)/config_data.h
 
 # config_data.h contains the same information as ikconfig.h but gzipped.
diff -u net/socket.c net/socket.c
--- net/socket.c
+++ net/socket.c
@@ -344,11 +344,11 @@
  * but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
int fd;
 
-   fd = get_unused_fd();
+   fd = get_unused_fd_flags(flags);
if (likely(fd >= 0)) {
struct file *file = get_empty_filp();
 
@@ -391,10 +391,10 @@
return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+static int sock_map_fd_flags(struct socket *sock, int flags)
 {
struct file *newfile;
-   int fd = sock_alloc_fd(&newfile);
+   int fd = sock_alloc_fd(&newfile, flags);
 
if (likely(fd >= 0)) {
int err = sock_attach_fd(sock, newfile);
@@ -409,6 +409,11 @@
return fd;
 }
 
+int sock_map_fd(struct socket *sock)
+{
+   return sock_map_fd_flags(sock, 0);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
if (file->f_op == &socket_file_ops)
@@ -1208,7 +1213,7 @@
if (retval < 0)
goto out;
 
-   retval = sock_map_fd(sock);
+   retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags));
if (retval < 0)
goto out_release;
 
@@ -1249,13 +1254,13 @@
if (err < 0)
goto out_release_both;
 
-   fd1 = sock_alloc_fd(&newfile1);
+   fd1 = sock_alloc_fd(&newfile1, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd1 < 0)) {
err = fd1;
goto out_release_both;
}
 
-   fd2 = sock_alloc_fd(&newfile2);
+   fd2 = sock_alloc_fd(&newfile2, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd2 < 0)) {
err = fd2;
put_filp(newfile1);
@@ -1411,7 +1416,7 @@
 */
__module_get(newsock->ops->owner);
 
-   newfd = sock_alloc_fd(&newfile);
+   newfd = sock_alloc_fd(&newfile, INDIRECT_PARAM(file_flags, flags));
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
diff -u arch/x86/ia32/sys_ia32.c arch/x86/ia32/sys_ia32.c
--- arch/x86/ia32/sys_ia32.c
+++ arch/x86/ia32/sys_ia32.c
@@ -902,6 +902,10 @@
 
switch (INDIRECT_SYSCALL32(&regs))
{
+#define INDSYSCALL(name) __NR_ia32_##name
+#include <linux/indirect.h>
+   break;
+
default:
return -EINVAL;
}
diff -u include/linux/indirect.h include/linux/indirect.h
--- include/linux/indirect.h
+++ include/linux/indirect.h
@@ -1,6 +1,39 @@
+#ifndef INDSYSCALL
 #ifndef _LINUX_INDIRECT_H
 #define _LINUX_INDIRECT_H
 
 #include 
 
+
+union indirect_params {
+  struct {
+int flags;
+  } file_flags;
+};
+
+#define INDIRECT_PARAM(set, name) current->indirect_params.set.name
+
+#endif
+#else
+
+/* Here comes the list of system calls which can be called through
+   sys_indirect.  When the list of supported system calls is needed the
+   file including this header is supposed to define a macro "INDSYSCALL"
+   which adds a prefix fitting to the use.  If the resulting macro is
+   defined we generate a line
+   case MACRO:
+   */
+#if INDSYSCALL(accept)
+  case INDSYSCALL(accept):
+#endif
+#if INDSYSCALL(socket)
+  case INDSYSCALL(socket):
+#endif
+#if INDSYSCALL(socketcall)
+  case INDSYSCALL(socketcall):
+#endif
+#if INDSYSCALL(socketpair)
+  case INDSYSCALL(socketpair):
+#endif
+
 #endif
diff -u kernel/indirect.c kernel/indirect.c
--- kernel/indirect.c
+++ kernel/indirect.c
@@ -19,6 +19,10 @@
 
switch (INDIRECT_SYSCALL ())
{
+#define INDSYSCALL(name) __NR_##name
+#include 
+   break;
+
default:
return -EINVAL;
}
--- include/asm-x86/ia32_unistd.h
+++ include/asm-x86/ia32_unistd.h
@@ -12,6 +12,7 @@
 

[PATCHv3 1/4] actual sys_indirect code

2007-11-16 Thread Ulrich Drepper
This is the actual architecture-independent part of the system call
implementation.

 include/linux/indirect.h |6 ++
 include/linux/sched.h|4 
 include/linux/syscalls.h |4 
 kernel/Makefile  |2 +-
 kernel/indirect.c|   36 
 5 files changed, 51 insertions(+), 1 deletion(-)

--- /dev/null
+++ include/linux/indirect.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_INDIRECT_H
+#define _LINUX_INDIRECT_H
+
+#include 
+
+#endif
--- include/linux/sched.h
+++ include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1174,6 +1175,9 @@ struct task_struct {
int make_it_fail;
 #endif
struct prop_local_single dirties;
+
+   /* Additional system call parameters.  */
+   union indirect_params indirect_params;
 };
 
 /*
--- include/linux/syscalls.h
+++ include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct indirect_registers;
 
 #include 
 #include 
@@ -611,6 +612,9 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
const struct itimerspec __user *utmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags);
 
 int kernel_execve(const char *filename, char *const argv[], char *const 
envp[]);
 
--- /dev/null
+++ kernel/indirect.c
@@ -0,0 +1,36 @@
+#include 
+#include 
+#include 
+#include 
+
+
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags)
+{
+   struct indirect_registers regs;
+   long result;
+
+   if (unlikely(flags != 0))
+   return -EINVAL;
+
+   if (copy_from_user(, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL ())
+   {
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen > sizeof(union indirect_params))
+   return -EINVAL;
+
+   result = -EFAULT;
+   if (!copy_from_user(>indirect_params, userparams, paramslen))
+   result = CALL_INDIRECT();
+
+   memset(>indirect_params, '\0', paramslen);
+
+   return result;
+}
--- kernel/Makefile
+++ kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o 
profile.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
-   utsname.o notifier.o
+   utsname.o notifier.o indirect.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv2 4/4] first use of sys_indirect system call

2007-11-16 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

dean gaudet wrote:
> i'm not suggesting the library set the global flag.  i'm suggesting that 
> me as an app writer will do so.
> 
> it seems like both methods are useful.

No, the global flag is hardly ever useful.  You almost never know the
details of all the libraries you link to well enough to determine that
they don't need FD_CLOEXEC disabled.  Even more problematic, you cannot
know whether they will need it in future.

For applications the solution is simple: wrap the appropriate calls.
Apache has all these apr_ wrappers.  Put them to some good use after all.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHPeim2ijCOnn/RHQRAu8xAJsF/0Ir1PWMbHkVRaI5vKOGFS4tMACfVEs9
pMYAiCAU1E2B+7QR0EP+/F8=
=btt9
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv2 4/4] first use of sys_indirect system call

2007-11-16 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

dean gaudet wrote:
> honestly i think there should be a per-task flag which indicates whether 
> fds are by default F_CLOEXEC or not.  my reason:  third party libraries.

Only somebody who thinks exclusively about applications as opposed to
runtimes/libraries can say something like that.  Library writers don't
have the luxury of being able to modify any global state.  This has all
been discussed here before.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHPd8b2ijCOnn/RHQRAuPPAKCm5mcOl8dycDenxi7BNFdrf2IfWgCgmaXQ
Fj7V13HU1vX6fM9bRumxRpk=
=UIi1
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv2 4/4] first use of sys_indirect system call

2007-11-16 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

dean gaudet wrote:
> honestly i think there should be a per-task flag which indicates whether 
> fds are by default F_CLOEXEC or not.  my reason:  third party libraries.

Only somebody who thinks exclusively about applications as opposed to
runtimes/libraries can say something like that.  Library writers don't
have the luxury of being able to modify any global state.  This has all
been discussed here before.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFHPd8b2ijCOnn/RHQRAuPPAKCm5mcOl8dycDenxi7BNFdrf2IfWgCgmaXQ
Fj7V13HU1vX6fM9bRumxRpk=
=UIi1
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv2 4/4] first use of sys_indirect system call

2007-11-16 Thread Ulrich Drepper
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

dean gaudet wrote:
> i'm not suggesting the library set the global flag.  i'm suggesting that 
> me as an app writer will do so.
> 
> it seems like both methods are useful.

No, the global flag is hardly ever useful.  You almost never know the
details of all the libraries you link to well enough to determine that
they don't need FD_CLOEXEC disabled.  Even more problematic, you cannot
know whether they will need it in future.

For applications the solution is simple: wrap the appropriate calls.
Apache has all these apr_ wrappers.  Put them to some good use after all.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.7 (GNU/Linux)
Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org

iD8DBQFHPeim2ijCOnn/RHQRAu8xAJsF/0Ir1PWMbHkVRaI5vKOGFS4tMACfVEs9
pMYAiCAU1E2B+7QR0EP+/F8=
=btt9
-END PGP SIGNATURE-
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv3 1/4] actual sys_indirect code

2007-11-16 Thread Ulrich Drepper
This is the actual architecture-independent part of the system call
implementation.

 include/linux/indirect.h |6 ++
 include/linux/sched.h|4 
 include/linux/syscalls.h |4 
 kernel/Makefile  |2 +-
 kernel/indirect.c|   36 
 5 files changed, 51 insertions(+), 1 deletion(-)

--- /dev/null
+++ include/linux/indirect.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_INDIRECT_H
+#define _LINUX_INDIRECT_H
+
+#include asm/indirect.h
+
+#endif
--- include/linux/sched.h
+++ include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include linux/rcupdate.h
 #include linux/futex.h
 #include linux/rtmutex.h
+#include linux/indirect.h
 
 #include linux/time.h
 #include linux/param.h
@@ -1174,6 +1175,9 @@ struct task_struct {
int make_it_fail;
 #endif
struct prop_local_single dirties;
+
+   /* Additional system call parameters.  */
+   union indirect_params indirect_params;
 };
 
 /*
--- include/linux/syscalls.h
+++ include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct indirect_registers;
 
 #include linux/types.h
 #include linux/aio_abi.h
@@ -611,6 +612,9 @@ asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
const struct itimerspec __user *utmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags);
 
 int kernel_execve(const char *filename, char *const argv[], char *const 
envp[]);
 
--- /dev/null
+++ kernel/indirect.c
@@ -0,0 +1,36 @@
+#include linux/sched.h
+#include linux/uaccess.h
+#include linux/unistd.h
+#include asm/asm-offsets.h
+
+
+asmlinkage long sys_indirect(struct indirect_registers __user *userregs,
+void __user *userparams, size_t paramslen,
+int flags)
+{
+   struct indirect_registers regs;
+   long result;
+
+   if (unlikely(flags != 0))
+   return -EINVAL;
+
+   if (copy_from_user(regs, userregs, sizeof(regs)))
+   return -EFAULT;
+
+   switch (INDIRECT_SYSCALL (regs))
+   {
+   default:
+   return -EINVAL;
+   }
+
+   if (paramslen  sizeof(union indirect_params))
+   return -EINVAL;
+
+   result = -EFAULT;
+   if (!copy_from_user(current-indirect_params, userparams, paramslen))
+   result = CALL_INDIRECT(regs);
+
+   memset(current-indirect_params, '\0', paramslen);
+
+   return result;
+}
--- kernel/Makefile
+++ kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o 
profile.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
-   utsname.o notifier.o
+   utsname.o notifier.o indirect.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv3 4/4] first use of sys_indirect system call

2007-11-16 Thread Ulrich Drepper
This is a first user of sys_indirect.  Several of the socket-related system
calls which produce a file handle now can be passed an additional parameter
to set the FD_CLOEXEC flag.

 arch/x86/ia32/Makefile|1 +
 arch/x86/ia32/sys_ia32.c  |4 
 include/asm-x86/ia32_unistd.h |1 +
 include/linux/indirect.h  |   33 +
 kernel/Makefile   |2 ++
 kernel/indirect.c |4 
 net/socket.c  |   21 +
 7 files changed, 58 insertions(+), 8 deletions(-)

--- arch/x86/ia32/Makefile
+++ arch/x86/ia32/Makefile
@@ -36,6 +36,7 @@ $(obj)/vsyscall-sysenter.so.dbg 
$(obj)/vsyscall-syscall.so.dbg: \
 $(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
 
+CFLAGS_sys_ia32.o = -Wno-undef
 AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
 
--- kernel/Makefile
+++ kernel/Makefile
@@ -67,6 +67,8 @@ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
+CFLAGS_indirect.o = -Wno-undef
+
 $(obj)/configs.o: $(obj)/config_data.h
 
 # config_data.h contains the same information as ikconfig.h but gzipped.
diff -u net/socket.c net/socket.c
--- net/socket.c
+++ net/socket.c
@@ -344,11 +344,11 @@
  * but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
int fd;
 
-   fd = get_unused_fd();
+   fd = get_unused_fd_flags(flags);
if (likely(fd = 0)) {
struct file *file = get_empty_filp();
 
@@ -391,10 +391,10 @@
return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+static int sock_map_fd_flags(struct socket *sock, int flags)
 {
struct file *newfile;
-   int fd = sock_alloc_fd(newfile);
+   int fd = sock_alloc_fd(newfile, flags);
 
if (likely(fd = 0)) {
int err = sock_attach_fd(sock, newfile);
@@ -409,6 +409,11 @@
return fd;
 }
 
+int sock_map_fd(struct socket *sock)
+{
+   return sock_map_fd_flags(sock, 0);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
if (file-f_op == socket_file_ops)
@@ -1208,7 +1213,7 @@
if (retval  0)
goto out;
 
-   retval = sock_map_fd(sock);
+   retval = sock_map_fd_flags(sock, INDIRECT_PARAM(file_flags, flags));
if (retval  0)
goto out_release;
 
@@ -1249,13 +1254,13 @@
if (err  0)
goto out_release_both;
 
-   fd1 = sock_alloc_fd(newfile1);
+   fd1 = sock_alloc_fd(newfile1, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd1  0)) {
err = fd1;
goto out_release_both;
}
 
-   fd2 = sock_alloc_fd(newfile2);
+   fd2 = sock_alloc_fd(newfile2, INDIRECT_PARAM(file_flags, flags));
if (unlikely(fd2  0)) {
err = fd2;
put_filp(newfile1);
@@ -1411,7 +1416,7 @@
 */
__module_get(newsock-ops-owner);
 
-   newfd = sock_alloc_fd(newfile);
+   newfd = sock_alloc_fd(newfile, INDIRECT_PARAM(file_flags, flags));
if (unlikely(newfd  0)) {
err = newfd;
sock_release(newsock);
diff -u arch/x86/ia32/sys_ia32.c arch/x86/ia32/sys_ia32.c
--- arch/x86/ia32/sys_ia32.c
+++ arch/x86/ia32/sys_ia32.c
@@ -902,6 +902,10 @@
 
switch (INDIRECT_SYSCALL32(regs))
{
+#define INDSYSCALL(name) __NR_ia32_##name
+#include linux/indirect.h
+   break;
+
default:
return -EINVAL;
}
diff -u include/linux/indirect.h include/linux/indirect.h
--- include/linux/indirect.h
+++ include/linux/indirect.h
@@ -1,6 +1,39 @@
+#ifndef INDSYSCALL
 #ifndef _LINUX_INDIRECT_H
 #define _LINUX_INDIRECT_H
 
 #include asm/indirect.h
 
+
+union indirect_params {
+  struct {
+int flags;
+  } file_flags;
+};
+
+#define INDIRECT_PARAM(set, name) current-indirect_params.set.name
+
+#endif
+#else
+
+/* Here comes the list of system calls which can be called through
+   sys_indirect.  When the list of supported system calls is needed the
+   file including this header is supposed to define a macro "INDSYSCALL"
+   which adds a prefix fitting to the use.  If the resulting macro is
+   defined we generate a line
+   case MACRO:
+   */
+#if INDSYSCALL(accept)
+  case INDSYSCALL(accept):
+#endif
+#if INDSYSCALL(socket)
+  case INDSYSCALL(socket):
+#endif
+#if INDSYSCALL(socketcall)
+  case INDSYSCALL(socketcall):
+#endif
+#if INDSYSCALL(socketpair)
+  case INDSYSCALL(socketpair):
+#endif
+
 #endif
diff -u kernel/indirect.c kernel/indirect.c
--- kernel/indirect.c
+++ kernel/indirect.c
@@ -19,6 +19,10 @@
 
switch (INDIRECT_SYSCALL (regs))
{
+#define INDSYSCALL(name) __NR_##name
+#include linux/indirect.h
+   break;
+
default:
return 

  1   2   3   4   5   6   7   8   >