[PATCH 1/2] ipc namespace: a generic per-ipc pointer and peripc_ops

2013-12-22 Thread Oren Laadan
Add a void * pointer to struct ipc_namespace. The access rules are:
1. (un)register ops with (un)register_peripc_operations()
2. call ipc_assign_generic() to put private data on the ipc_namespace
3. call ipc_access_generic() to access the private data
4. do not change the pointer during the lifetime of ipc_namespace

Modeled after generic net-ns pointers (commit dec827d), but simplified
to accommodate a single user for now (reduce code churn):
5. only one caller can register at a time
6. caller must register at boot time (not to be used by modules)

Signed-off-by: Oren Laadan 
Signed-off-by: Amir Goldstein 
---
 include/linux/ipc_namespace.h | 29 +
 ipc/namespace.c   |  9 +++
 ipc/util.c| 60 +++
 ipc/util.h|  3 +++
 4 files changed, 101 insertions(+)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index f6c82de..72da9bf 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -70,8 +70,37 @@ struct ipc_namespace {
struct user_namespace *user_ns;
 
unsigned intproc_inum;
+
+   /* allow others to piggyback on ipc_namesspaces */
+   void *gen;  /* for others' private stuff */
 };
 
+/*
+ * To access to the per-ipc generic data:
+ * 1. (un)register ops with (un)register_peripc_operations()
+ * 2. call ipc_assign_generic() to put private data on the ipc_namespace
+ * 3. call ipc_access_generic() to access the private data
+ * 4. do not change the pointer during the lifetime of ipc_namespace
+ *
+ * Modeled after generic net-ns pointers (commit dec827d), simplified for
+ * a single user case for now:
+ * 5. only one caller can register at a time
+ * 6. caller must register at boot time (not to be used by modules)
+ */
+struct peripc_operations {
+   int (*init)(struct ipc_namespace *);
+   void (*exit)(struct ipc_namespace *);
+};
+
+static inline void ipc_assign_generic(struct ipc_namespace *ns, void *data)
+{ ns->gen = data; }
+
+static inline void *ipc_access_generic(struct ipc_namespace *ns)
+{ return ns->gen; }
+
+extern int register_peripc_ops(struct peripc_operations *ops);
+extern void unregister_peripc_ops(struct peripc_operations *ops);
+
 extern struct ipc_namespace init_ipc_ns;
 extern atomic_t nr_ipc_ns;
 
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 59451c1..83968b6 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -33,9 +33,17 @@ static struct ipc_namespace *create_ipc_ns(struct 
user_namespace *user_ns,
}
 
atomic_set(>count, 1);
+
+   err = init_peripc_ns(ns);
+   if (err) {
+   kfree(ns);
+   return ERR_PTR(err);
+   }
+
err = mq_init_ns(ns);
if (err) {
proc_free_inum(ns->proc_inum);
+   exit_peripc_ns(ns);
kfree(ns);
return ERR_PTR(err);
}
@@ -111,6 +119,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
+   exit_peripc_ns(ns);
atomic_dec(_ipc_ns);
 
/*
diff --git a/ipc/util.c b/ipc/util.c
index 3ae17a4..8cb9949 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -71,6 +71,66 @@ struct ipc_proc_iface {
int (*show)(struct seq_file *, void *);
 };
 
+/* allow others to piggyback on ipc_namespace */
+static DEFINE_MUTEX(peripc_mutex);
+static struct peripc_operations *peripc_ops;
+
+/*
+ * peripc_operations is a simplified pernet_operations:
+ * - allow only one entity to register
+ * - allow to register only at boot time (no modules)
+ * (these assumptions make the code much simpler)
+ */
+
+static int init_peripc_count;
+
+/* caller hold peripc_mutex */
+int init_peripc_ns(struct ipc_namespace *ns)
+{
+   int ret = 0;
+
+   if (peripc_ops && peripc_ops->init)
+   ret = peripc_ops->init(ns);
+   if (ret == 0)
+   init_peripc_count++;
+   return ret;
+}
+
+/* caller hold peripc_mutex */
+void exit_peripc_ns(struct ipc_namespace *ns)
+{
+   if (peripc_ops && peripc_ops->exit)
+   peripc_ops->exit(ns);
+   init_peripc_count--;
+}
+
+int register_peripc_ops(struct peripc_operations *ops)
+{
+   int ret = -EBUSY;
+
+   mutex_lock(_mutex);
+   /* must be first register, and only init ipc_namespace exists */
+   if (peripc_ops == NULL && init_peripc_count == 0) {
+   peripc_ops = ops;
+   ret = init_peripc_ns(_ipc_ns);
+   if (ret < 0)
+   peripc_ops = NULL;
+   }
+   mutex_unlock(_mutex);
+   return ret;
+}
+
+void unregister_peripc_ops(struct peripc_operations *ops)
+{
+   mutex_lock(_mutex);
+   /* sanity:  be same as registered, and no other ipc ns (beyond init) */
+   BUG_ON(peripc_ops != ops);
+   BU

[PATCH 2/2] binder: implement namepsace support for Android binder driver

2013-12-22 Thread Oren Laadan
Add namespaces support for the Android binder driver.
As binder is an IPC mechanism, tie its namespace to IPC_NS.

In binder, the first process to call BINDER_SET_CONTEXT_MGR ioctl
becomes the manager with context 0, and thereafter IPC is realized
through binder handles obtained from this manager.

For namespaces, associate a separate binder state with each namespace
so each namespace has its own context manager. Binder users within a
namespace interact only with the context manager there. This suffices
because binder does not allow IPC not via the context manager.

Currently, statistics remain global, except for the list of processes
that hold an open binder device (reported per namespace).

Signed-off-by: Oren Laadan 
Acked-by: Amir Goldstein 
---
 drivers/staging/android/Kconfig  |   1 +
 drivers/staging/android/binder.c | 172 ---
 2 files changed, 143 insertions(+), 30 deletions(-)

diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig
index 1e9ab6d..208685e 100644
--- a/drivers/staging/android/Kconfig
+++ b/drivers/staging/android/Kconfig
@@ -11,6 +11,7 @@ if ANDROID
 config ANDROID_BINDER_IPC
bool "Android Binder IPC Driver"
depends on MMU
+   select SYSVIPC
default n
---help---
  Binder is used in Android for both communication between processes,
diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index eaec1da..6f97f89 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@ -36,24 +36,104 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "binder.h"
 #include "binder_trace.h"
 
+/*
+ * Using a private context manager for each binder namespace is sufficient
+ * to isolate between namespaces, because in binder all IPC must be realized
+ * via hanldes obtained from the context manager.
+ *
+ * TODO: currently, most debugfs data is not tracked per binder namespaces.
+ * Except for "procs" which are properly virtualized, everything else is
+ * global, including stats, logs, and dead nodes.
+ */
+struct binder_namespace {
+   struct kref kref;
+
+   struct binder_node *context_mgr_node;
+   kuid_t context_mgr_uid;
+   int last_id;
+
+   struct hlist_head procs;
+};
+
+static struct binder_namespace *create_binder_ns(void)
+{
+   struct binder_namespace *binder_ns;
+
+   binder_ns = kzalloc(sizeof(struct binder_namespace), GFP_KERNEL);
+   if (binder_ns) {
+   kref_init(_ns->kref);
+   binder_ns->context_mgr_uid = INVALID_UID;
+   INIT_HLIST_HEAD(_ns->procs);
+   }
+   return binder_ns;
+}
+
+static void free_binder_ns(struct kref *kref)
+{
+   kfree(container_of(kref, struct binder_namespace, kref));
+}
+
+static void get_binder_ns(struct binder_namespace *binder_ns)
+{
+   kref_get(_ns->kref);
+}
+
+static void put_binder_ns(struct binder_namespace *binder_ns)
+{
+   kref_put(_ns->kref, free_binder_ns);
+}
+
+/*
+ * Binder is an IPC mechanism, so tie its namespace to IPC_NS
+ * using the generic data pointer and per-ipc operations.
+ */
+static struct binder_namespace *current_binder_ns(void)
+{
+   return ipc_access_generic(current->nsproxy->ipc_ns);
+}
+
+int binder_init_ns(struct ipc_namespace *ipcns)
+{
+   struct binder_namespace *binder_ns;
+   int ret = -ENOMEM;
+
+   binder_ns = create_binder_ns();
+   if (binder_ns) {
+   ipc_assign_generic(ipcns, binder_ns);
+   ret = 0;
+   }
+   return ret;
+}
+
+void binder_exit_ns(struct ipc_namespace *ipcns)
+{
+   struct binder_namespace *binder_ns;
+
+   binder_ns = ipc_access_generic(ipcns);
+   if (binder_ns)
+   put_binder_ns(binder_ns);
+}
+
+struct peripc_operations binder_peripc_ops = {
+   .init = binder_init_ns,
+   .exit = binder_exit_ns,
+};
+
 static DEFINE_MUTEX(binder_main_lock);
 static DEFINE_MUTEX(binder_deferred_lock);
 static DEFINE_MUTEX(binder_mmap_lock);
 
-static HLIST_HEAD(binder_procs);
 static HLIST_HEAD(binder_deferred_list);
 static HLIST_HEAD(binder_dead_nodes);
 
 static struct dentry *binder_debugfs_dir_entry_root;
 static struct dentry *binder_debugfs_dir_entry_proc;
-static struct binder_node *binder_context_mgr_node;
-static kuid_t binder_context_mgr_uid = INVALID_UID;
-static int binder_last_id;
 static struct workqueue_struct *binder_deferred_workqueue;
 
 #define BINDER_DEBUG_ENTRY(name) \
@@ -319,6 +399,8 @@ struct binder_proc {
int ready_threads;
long default_priority;
struct dentry *debugfs_entry;
+
+   struct binder_namespace *binder_ns;
 };
 
 enum {
@@ -900,7 +982,7 @@ static struct binder_node *binder_new_node(struct 
binder_proc *proc,
binder_stats_created(BINDER_STAT_NODE);
rb_link_node(>rb_node, parent, p);
rb_insert_color(>rb_node, >nod

[PATCH 0/2] Namespace support for Android binder (under ipc-ns)

2013-12-22 Thread Oren Laadan
Hi,

This patch-duo adds namespaces support for the Android binder driver. As
discussed in Plumbers 2013, binder is a form of IPC and therefore will be
tied to ipc namespace.

On the ipc-ns side, the implementation is modelled after generic net-ns
pointers (see commit dec827d), but simplified to suit single user/caller
for now, to reduce complexity.

The binder driver registers with ipc-ns and uses the generic pointer to
store and retrieve its namespace data. That is, each namespace maintains
its own binder context manager - which suffices for separation, since all
transactions require binder handles provided by the context manager.

Thanks,

Oren.


Oren Laadan (2):
  ipc namespace: a generic per-ipc pointer and peripc_ops
  binder: implement namepsace support for Android binder driver

 drivers/staging/android/Kconfig  |   1 +
 drivers/staging/android/binder.c | 172 ---
 include/linux/ipc_namespace.h|  29 +++
 ipc/namespace.c  |   9 ++
 ipc/util.c   |  60 ++
 ipc/util.h   |   3 +
 6 files changed, 244 insertions(+), 30 deletions(-)

-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] binder: implement namepsace support for Android binder driver

2013-12-22 Thread Oren Laadan
Add namespaces support for the Android binder driver.
As binder is an IPC mechanism, tie its namespace to IPC_NS.

In binder, the first process to call BINDER_SET_CONTEXT_MGR ioctl
becomes the manager with context 0, and thereafter IPC is realized
through binder handles obtained from this manager.

For namespaces, associate a separate binder state with each namespace
so each namespace has its own context manager. Binder users within a
namespace interact only with the context manager there. This suffices
because binder does not allow IPC not via the context manager.

Currently, statistics remain global, except for the list of processes
that hold an open binder device (reported per namespace).

Signed-off-by: Oren Laadan or...@cellrox.com
Acked-by: Amir Goldstein a...@cellrox.com
---
 drivers/staging/android/Kconfig  |   1 +
 drivers/staging/android/binder.c | 172 ---
 2 files changed, 143 insertions(+), 30 deletions(-)

diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig
index 1e9ab6d..208685e 100644
--- a/drivers/staging/android/Kconfig
+++ b/drivers/staging/android/Kconfig
@@ -11,6 +11,7 @@ if ANDROID
 config ANDROID_BINDER_IPC
bool Android Binder IPC Driver
depends on MMU
+   select SYSVIPC
default n
---help---
  Binder is used in Android for both communication between processes,
diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index eaec1da..6f97f89 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@ -36,24 +36,104 @@
 #include linux/uaccess.h
 #include linux/vmalloc.h
 #include linux/slab.h
+#include linux/ipc_namespace.h
 #include linux/pid_namespace.h
 
 #include binder.h
 #include binder_trace.h
 
+/*
+ * Using a private context manager for each binder namespace is sufficient
+ * to isolate between namespaces, because in binder all IPC must be realized
+ * via hanldes obtained from the context manager.
+ *
+ * TODO: currently, most debugfs data is not tracked per binder namespaces.
+ * Except for procs which are properly virtualized, everything else is
+ * global, including stats, logs, and dead nodes.
+ */
+struct binder_namespace {
+   struct kref kref;
+
+   struct binder_node *context_mgr_node;
+   kuid_t context_mgr_uid;
+   int last_id;
+
+   struct hlist_head procs;
+};
+
+static struct binder_namespace *create_binder_ns(void)
+{
+   struct binder_namespace *binder_ns;
+
+   binder_ns = kzalloc(sizeof(struct binder_namespace), GFP_KERNEL);
+   if (binder_ns) {
+   kref_init(binder_ns-kref);
+   binder_ns-context_mgr_uid = INVALID_UID;
+   INIT_HLIST_HEAD(binder_ns-procs);
+   }
+   return binder_ns;
+}
+
+static void free_binder_ns(struct kref *kref)
+{
+   kfree(container_of(kref, struct binder_namespace, kref));
+}
+
+static void get_binder_ns(struct binder_namespace *binder_ns)
+{
+   kref_get(binder_ns-kref);
+}
+
+static void put_binder_ns(struct binder_namespace *binder_ns)
+{
+   kref_put(binder_ns-kref, free_binder_ns);
+}
+
+/*
+ * Binder is an IPC mechanism, so tie its namespace to IPC_NS
+ * using the generic data pointer and per-ipc operations.
+ */
+static struct binder_namespace *current_binder_ns(void)
+{
+   return ipc_access_generic(current-nsproxy-ipc_ns);
+}
+
+int binder_init_ns(struct ipc_namespace *ipcns)
+{
+   struct binder_namespace *binder_ns;
+   int ret = -ENOMEM;
+
+   binder_ns = create_binder_ns();
+   if (binder_ns) {
+   ipc_assign_generic(ipcns, binder_ns);
+   ret = 0;
+   }
+   return ret;
+}
+
+void binder_exit_ns(struct ipc_namespace *ipcns)
+{
+   struct binder_namespace *binder_ns;
+
+   binder_ns = ipc_access_generic(ipcns);
+   if (binder_ns)
+   put_binder_ns(binder_ns);
+}
+
+struct peripc_operations binder_peripc_ops = {
+   .init = binder_init_ns,
+   .exit = binder_exit_ns,
+};
+
 static DEFINE_MUTEX(binder_main_lock);
 static DEFINE_MUTEX(binder_deferred_lock);
 static DEFINE_MUTEX(binder_mmap_lock);
 
-static HLIST_HEAD(binder_procs);
 static HLIST_HEAD(binder_deferred_list);
 static HLIST_HEAD(binder_dead_nodes);
 
 static struct dentry *binder_debugfs_dir_entry_root;
 static struct dentry *binder_debugfs_dir_entry_proc;
-static struct binder_node *binder_context_mgr_node;
-static kuid_t binder_context_mgr_uid = INVALID_UID;
-static int binder_last_id;
 static struct workqueue_struct *binder_deferred_workqueue;
 
 #define BINDER_DEBUG_ENTRY(name) \
@@ -319,6 +399,8 @@ struct binder_proc {
int ready_threads;
long default_priority;
struct dentry *debugfs_entry;
+
+   struct binder_namespace *binder_ns;
 };
 
 enum {
@@ -900,7 +982,7 @@ static struct binder_node *binder_new_node(struct 
binder_proc *proc,
binder_stats_created(BINDER_STAT_NODE);
rb_link_node(node

[PATCH 0/2] Namespace support for Android binder (under ipc-ns)

2013-12-22 Thread Oren Laadan
Hi,

This patch-duo adds namespaces support for the Android binder driver. As
discussed in Plumbers 2013, binder is a form of IPC and therefore will be
tied to ipc namespace.

On the ipc-ns side, the implementation is modelled after generic net-ns
pointers (see commit dec827d), but simplified to suit single user/caller
for now, to reduce complexity.

The binder driver registers with ipc-ns and uses the generic pointer to
store and retrieve its namespace data. That is, each namespace maintains
its own binder context manager - which suffices for separation, since all
transactions require binder handles provided by the context manager.

Thanks,

Oren.


Oren Laadan (2):
  ipc namespace: a generic per-ipc pointer and peripc_ops
  binder: implement namepsace support for Android binder driver

 drivers/staging/android/Kconfig  |   1 +
 drivers/staging/android/binder.c | 172 ---
 include/linux/ipc_namespace.h|  29 +++
 ipc/namespace.c  |   9 ++
 ipc/util.c   |  60 ++
 ipc/util.h   |   3 +
 6 files changed, 244 insertions(+), 30 deletions(-)

-- 
1.8.3.2

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] ipc namespace: a generic per-ipc pointer and peripc_ops

2013-12-22 Thread Oren Laadan
Add a void * pointer to struct ipc_namespace. The access rules are:
1. (un)register ops with (un)register_peripc_operations()
2. call ipc_assign_generic() to put private data on the ipc_namespace
3. call ipc_access_generic() to access the private data
4. do not change the pointer during the lifetime of ipc_namespace

Modeled after generic net-ns pointers (commit dec827d), but simplified
to accommodate a single user for now (reduce code churn):
5. only one caller can register at a time
6. caller must register at boot time (not to be used by modules)

Signed-off-by: Oren Laadan or...@cellrox.com
Signed-off-by: Amir Goldstein a...@cellrox.com
---
 include/linux/ipc_namespace.h | 29 +
 ipc/namespace.c   |  9 +++
 ipc/util.c| 60 +++
 ipc/util.h|  3 +++
 4 files changed, 101 insertions(+)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index f6c82de..72da9bf 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -70,8 +70,37 @@ struct ipc_namespace {
struct user_namespace *user_ns;
 
unsigned intproc_inum;
+
+   /* allow others to piggyback on ipc_namesspaces */
+   void *gen;  /* for others' private stuff */
 };
 
+/*
+ * To access to the per-ipc generic data:
+ * 1. (un)register ops with (un)register_peripc_operations()
+ * 2. call ipc_assign_generic() to put private data on the ipc_namespace
+ * 3. call ipc_access_generic() to access the private data
+ * 4. do not change the pointer during the lifetime of ipc_namespace
+ *
+ * Modeled after generic net-ns pointers (commit dec827d), simplified for
+ * a single user case for now:
+ * 5. only one caller can register at a time
+ * 6. caller must register at boot time (not to be used by modules)
+ */
+struct peripc_operations {
+   int (*init)(struct ipc_namespace *);
+   void (*exit)(struct ipc_namespace *);
+};
+
+static inline void ipc_assign_generic(struct ipc_namespace *ns, void *data)
+{ ns-gen = data; }
+
+static inline void *ipc_access_generic(struct ipc_namespace *ns)
+{ return ns-gen; }
+
+extern int register_peripc_ops(struct peripc_operations *ops);
+extern void unregister_peripc_ops(struct peripc_operations *ops);
+
 extern struct ipc_namespace init_ipc_ns;
 extern atomic_t nr_ipc_ns;
 
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 59451c1..83968b6 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -33,9 +33,17 @@ static struct ipc_namespace *create_ipc_ns(struct 
user_namespace *user_ns,
}
 
atomic_set(ns-count, 1);
+
+   err = init_peripc_ns(ns);
+   if (err) {
+   kfree(ns);
+   return ERR_PTR(err);
+   }
+
err = mq_init_ns(ns);
if (err) {
proc_free_inum(ns-proc_inum);
+   exit_peripc_ns(ns);
kfree(ns);
return ERR_PTR(err);
}
@@ -111,6 +119,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
+   exit_peripc_ns(ns);
atomic_dec(nr_ipc_ns);
 
/*
diff --git a/ipc/util.c b/ipc/util.c
index 3ae17a4..8cb9949 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -71,6 +71,66 @@ struct ipc_proc_iface {
int (*show)(struct seq_file *, void *);
 };
 
+/* allow others to piggyback on ipc_namespace */
+static DEFINE_MUTEX(peripc_mutex);
+static struct peripc_operations *peripc_ops;
+
+/*
+ * peripc_operations is a simplified pernet_operations:
+ * - allow only one entity to register
+ * - allow to register only at boot time (no modules)
+ * (these assumptions make the code much simpler)
+ */
+
+static int init_peripc_count;
+
+/* caller hold peripc_mutex */
+int init_peripc_ns(struct ipc_namespace *ns)
+{
+   int ret = 0;
+
+   if (peripc_ops  peripc_ops-init)
+   ret = peripc_ops-init(ns);
+   if (ret == 0)
+   init_peripc_count++;
+   return ret;
+}
+
+/* caller hold peripc_mutex */
+void exit_peripc_ns(struct ipc_namespace *ns)
+{
+   if (peripc_ops  peripc_ops-exit)
+   peripc_ops-exit(ns);
+   init_peripc_count--;
+}
+
+int register_peripc_ops(struct peripc_operations *ops)
+{
+   int ret = -EBUSY;
+
+   mutex_lock(peripc_mutex);
+   /* must be first register, and only init ipc_namespace exists */
+   if (peripc_ops == NULL  init_peripc_count == 0) {
+   peripc_ops = ops;
+   ret = init_peripc_ns(init_ipc_ns);
+   if (ret  0)
+   peripc_ops = NULL;
+   }
+   mutex_unlock(peripc_mutex);
+   return ret;
+}
+
+void unregister_peripc_ops(struct peripc_operations *ops)
+{
+   mutex_lock(peripc_mutex);
+   /* sanity:  be same as registered, and no other ipc ns (beyond init) */
+   BUG_ON(peripc_ops != ops);
+   BUG_ON(init_peripc_count != 1

Re: [PATCH 2.6.24-rc8-mm1 09/15] (RFC) IPC: new kernel API to change an ID

2008-02-05 Thread Oren Laadan



Serge E. Hallyn wrote:

Quoting Oren Laadan ([EMAIL PROTECTED]):

I strongly second Kirill on this matter.

IMHO, we should _avoid_ as much as possible exposing internal kernel
state to applications, unless a _real_ need for it is _clearly_
demonstrated. The reasons for this are quite obvious.


Hmm, sure, but this sentence is designed to make us want to agree.  Yes,
we want to avoid exporting kernel internals, but generally that means
things like the precise layout of the task_struct.  What Pierre is doing
is in fact the opposite, exporting resource information in a kernel
version invariant way.


LOL ... a bit of misunderstanding - let me put some order here:

my response what with respect to the new interface that Pierre
suggested, that is - to add a new IPC call to change an identifier
after it has been allocated (and assigned). This is necessary for the
restart because applications expect to see the same resource id's as
they had at the time of the checkpoint.

What you are referring to is the more recent part of the thread, where
the topic became how data should be saved - in other words, the format
of the checkpoint data. This is entirely orthogonal to my argument.

Now please re-read my email :)

That said, I'd advocate for something in between a raw dump and a pure
"parametric" representation of the data. Raw data tends to be, well,
too raw, which makes the task of reading data from older version by
newer kernels harder to maintain. On the other hand, it is impossible
to abstract everything into kernel-independent format.



In fact, the very reason not to go the route you and Pavel are
advocating is that if we just dump task state to a file or filesystem
from the kernel in one shot, we'll be much more tempted to lay out data
in a way that exports and ends up depending on kernel internals.  So
we'll just want to read and write the task_struct verbatim.

So, there are two very different approaches we can start with.
Whichever one we follow, we want to avoid having kernel version
dependencies.  They both have their merits to be sure.


You will never be able to avoid that completely, simply because new
kernels will require saving more (or less) data per object, because
of new (or dropped) features.
The best solution in this sense is to provide a filter (hopefully
in user space, utility) that would convert a checkpoint image file
from the old format to a newer format.
And you keep a lot of compatibility code of the kernel, too.



But note that in either case we need to deal with a bunch of locking.
So getting back to Pierre's patchset, IIRC 1-8 are cleanups worth
doing no matter 1.  9-11 sound like they are contentuous until
we decide whether we want to go with a create_with_id() type approach
or a set_id().  12 is IMO a good locking cleanup regardless.  13 and
15 are contentous until we decide whether we want userspace-controlled
checkpoint or a one-shot fs.  14 IMO is useful for both c/r approaches.

Is that pretty accurate?


(context switch back to my original reply)

I prefer not to add a new interface to IPC that will provide a new
functionality that isn't needed, except for the checkpoint - because
there is a better alternative to do the same task; this alternative
is more suitable because (a) it can be applied incrementally, (b) it
provides a consistent method to pre-select identifiers of all syscalls,
(where is the current suggestion suggests one way for IPC and will
suggest other hacks for other resources).

(context switch back to the current reply)

I definitely welcome a cleanup of the (insanely multiplexedd) IPC
code. However I argue that the interface need not be extended.




It isn't strictly necessary to export a new interface in order to
support checkpoint/restart. **. Hence, I think that the speculation
"we may need it in the future" is too abstract and isn't a good
excuse to commit to a new, currently unneeded, interface.


OTOH it did succeed in starting some conversation :)


Should the
need arise in the future, it will be easy to design a new interface
(also based on aggregated experience until then).


What aggregated experience?  We have to start somewhere...


:)  well, assuming the selection of resource IDs is done as I suggested,
we'll have the restart use it. If someone finds a good reason (other
than checkpoint/restart) to pre-select/modify an identifier, it will
be easy to _then_ add an interface. That (hypothetical) interface is
likely to come out more clever after X months using checkpoint/restart.




** In fact, the suggested interface may prove problematic (as noted
earlier in this thread): if you first create the resource with some
arbitrary identifier and then modify the identifier (in our case,
IPC id), then the restart procedure is bound to execute sequentially,
because of lack of atomicity.


Hmm?  Lack of atomicity wrt what?  All the tasks being restarted were
checkpointed at the same time so there will be no conflict in the
reque

Re: [PATCH 2.6.24-rc8-mm1 09/15] (RFC) IPC: new kernel API to change an ID

2008-02-05 Thread Oren Laadan


I strongly second Kirill on this matter.

IMHO, we should _avoid_ as much as possible exposing internal kernel
state to applications, unless a _real_ need for it is _clearly_
demonstrated. The reasons for this are quite obvious.

It isn't strictly necessary to export a new interface in order to
support checkpoint/restart. **. Hence, I think that the speculation
"we may need it in the future" is too abstract and isn't a good
excuse to commit to a new, currently unneeded, interface. Should the
need arise in the future, it will be easy to design a new interface
(also based on aggregated experience until then).

** In fact, the suggested interface may prove problematic (as noted
earlier in this thread): if you first create the resource with some
arbitrary identifier and then modify the identifier (in our case,
IPC id), then the restart procedure is bound to execute sequentially,
because of lack of atomicity.

That said, I suggest the following method instead (this is the method
we use in Zap to determine the desired resource identifier when a new
resource is allocated; I recall that we had discussed it in the past,
perhaps the mini-summit in september ?):

1) The process/thread tells the kernel that it wishes to pre-determine
the resource identifier of a subsequent call (this can be done via a
new syscall, or by writing to /proc/self/...).

2) Each system call that allocates a resource and assigns an identifier
is modified to check this per-thread field first; if it is set then
it will attempt to allocate that particular value (if already taken,
return an error, eg. EBUSY). Otherwise it will proceed as it is today.

(I left out some details - eg. the kernel will keep the desire value
on a per-thread field, when it will be reset, whether we want to also
tag the field with its type and so on, but the idea is now clear).

The main two advantages are that first, we don't need to devise a new
method for every syscall that allocates said resources (sigh... just
think of clone() nightmare to add a new argument); second, the change
is incremental: first code the mechanism to set the field, then add
support in the IPC subsystem, later in the DEVPTS, then in clone and
so forth.

Oren.

Pierre Peiffer wrote:


Kirill Korotaev wrote:

Why user space can need this API? for checkpointing only?


I would say "at least for checkpointing"... ;) May be someone else may find an
interest about this for something else.
In fact, I'm sure that you have some interest in checkpointing; and thus, you
have probably some ideas in mind; but whatever the solution you will propose,
I'm pretty sure that I could say the same thing for your solution.
And what I finally think is: even if it's for "checkpointing only", if many
people are interested by this, it may be sufficient to push this ?


Then I would not consider it for inclusion until it is clear how to implement 
checkpointing.
As for me personally - I'm against exporting such APIs, since they are not 
needed in real-life user space applications and maintaining it forever for 
compatibility doesn't worth it.


Maintaining these patches is not a big deal, really, but this is not the main
point; the "need in real life" (1) is in fact the main one, and then, the "is
this solution the best one ?" (2) the second one.

About (1), as said in my first mail, as the namespaces and containers are being
integrated into the mainline kernel, checkpoint/restart is (or will be) the next
need.
About (2), my solution propose to do that, as much as possible from userspace,
to minimize the kernel impact. Of course, this is subject to discussion. My
opinion is that doing a full checkpoint/restart from kernel space will need lot
of new specific and intrusive code; I'm not sure that this will be acceptable by
the community. But this is my opinion only. Discusion is opened.


Also such APIs allow creation of non-GPL checkpointing in user-space, which can 
be of concern as well.


Honestly, I don't think this really a concern at all. I mean: I've never seen
"this allows non-GPL binary and thus, this is bad" as an argument to reject a
functionality, but I may be wrong, and thus, it can be discussed as well.
I think the points (1) and (2) as stated above are the key ones.

Pierre


Kirill


Pierre Peiffer wrote:

Hi again,

Thinking more about this, I think I must clarify why I choose this way.
In fact, the idea of these patches is to provide the missing user APIs (or
extend the existing ones) that allow to set or update _all_ properties of all
IPCs, as needed in the case of the checkpoint/restart of an application (the
current user API does not allow to specify an ID for a created IPC, for
example). And this, without changing the existing API of course.

And msgget(), semget() and shmget() does not have any parameter we can 
use to
specify an ID.
That's why I've decided to not change these routines and add a new 
control
command, IP_SETID, with which we can can change the ID of an IPC. 

Re: [PATCH 2.6.24-rc8-mm1 09/15] (RFC) IPC: new kernel API to change an ID

2008-02-05 Thread Oren Laadan


I strongly second Kirill on this matter.

IMHO, we should _avoid_ as much as possible exposing internal kernel
state to applications, unless a _real_ need for it is _clearly_
demonstrated. The reasons for this are quite obvious.

It isn't strictly necessary to export a new interface in order to
support checkpoint/restart. **. Hence, I think that the speculation
we may need it in the future is too abstract and isn't a good
excuse to commit to a new, currently unneeded, interface. Should the
need arise in the future, it will be easy to design a new interface
(also based on aggregated experience until then).

** In fact, the suggested interface may prove problematic (as noted
earlier in this thread): if you first create the resource with some
arbitrary identifier and then modify the identifier (in our case,
IPC id), then the restart procedure is bound to execute sequentially,
because of lack of atomicity.

That said, I suggest the following method instead (this is the method
we use in Zap to determine the desired resource identifier when a new
resource is allocated; I recall that we had discussed it in the past,
perhaps the mini-summit in september ?):

1) The process/thread tells the kernel that it wishes to pre-determine
the resource identifier of a subsequent call (this can be done via a
new syscall, or by writing to /proc/self/...).

2) Each system call that allocates a resource and assigns an identifier
is modified to check this per-thread field first; if it is set then
it will attempt to allocate that particular value (if already taken,
return an error, eg. EBUSY). Otherwise it will proceed as it is today.

(I left out some details - eg. the kernel will keep the desire value
on a per-thread field, when it will be reset, whether we want to also
tag the field with its type and so on, but the idea is now clear).

The main two advantages are that first, we don't need to devise a new
method for every syscall that allocates said resources (sigh... just
think of clone() nightmare to add a new argument); second, the change
is incremental: first code the mechanism to set the field, then add
support in the IPC subsystem, later in the DEVPTS, then in clone and
so forth.

Oren.

Pierre Peiffer wrote:


Kirill Korotaev wrote:

Why user space can need this API? for checkpointing only?


I would say at least for checkpointing... ;) May be someone else may find an
interest about this for something else.
In fact, I'm sure that you have some interest in checkpointing; and thus, you
have probably some ideas in mind; but whatever the solution you will propose,
I'm pretty sure that I could say the same thing for your solution.
And what I finally think is: even if it's for checkpointing only, if many
people are interested by this, it may be sufficient to push this ?


Then I would not consider it for inclusion until it is clear how to implement 
checkpointing.
As for me personally - I'm against exporting such APIs, since they are not 
needed in real-life user space applications and maintaining it forever for 
compatibility doesn't worth it.


Maintaining these patches is not a big deal, really, but this is not the main
point; the need in real life (1) is in fact the main one, and then, the is
this solution the best one ? (2) the second one.

About (1), as said in my first mail, as the namespaces and containers are being
integrated into the mainline kernel, checkpoint/restart is (or will be) the next
need.
About (2), my solution propose to do that, as much as possible from userspace,
to minimize the kernel impact. Of course, this is subject to discussion. My
opinion is that doing a full checkpoint/restart from kernel space will need lot
of new specific and intrusive code; I'm not sure that this will be acceptable by
the community. But this is my opinion only. Discusion is opened.


Also such APIs allow creation of non-GPL checkpointing in user-space, which can 
be of concern as well.


Honestly, I don't think this really a concern at all. I mean: I've never seen
this allows non-GPL binary and thus, this is bad as an argument to reject a
functionality, but I may be wrong, and thus, it can be discussed as well.
I think the points (1) and (2) as stated above are the key ones.

Pierre


Kirill


Pierre Peiffer wrote:

Hi again,

Thinking more about this, I think I must clarify why I choose this way.
In fact, the idea of these patches is to provide the missing user APIs (or
extend the existing ones) that allow to set or update _all_ properties of all
IPCs, as needed in the case of the checkpoint/restart of an application (the
current user API does not allow to specify an ID for a created IPC, for
example). And this, without changing the existing API of course.

And msgget(), semget() and shmget() does not have any parameter we can 
use to
specify an ID.
That's why I've decided to not change these routines and add a new 
control
command, IP_SETID, with which we can can change the ID of an IPC. (that looks 

Re: [PATCH 2.6.24-rc8-mm1 09/15] (RFC) IPC: new kernel API to change an ID

2008-02-05 Thread Oren Laadan



Serge E. Hallyn wrote:

Quoting Oren Laadan ([EMAIL PROTECTED]):

I strongly second Kirill on this matter.

IMHO, we should _avoid_ as much as possible exposing internal kernel
state to applications, unless a _real_ need for it is _clearly_
demonstrated. The reasons for this are quite obvious.


Hmm, sure, but this sentence is designed to make us want to agree.  Yes,
we want to avoid exporting kernel internals, but generally that means
things like the precise layout of the task_struct.  What Pierre is doing
is in fact the opposite, exporting resource information in a kernel
version invariant way.


LOL ... a bit of misunderstanding - let me put some order here:

my response what with respect to the new interface that Pierre
suggested, that is - to add a new IPC call to change an identifier
after it has been allocated (and assigned). This is necessary for the
restart because applications expect to see the same resource id's as
they had at the time of the checkpoint.

What you are referring to is the more recent part of the thread, where
the topic became how data should be saved - in other words, the format
of the checkpoint data. This is entirely orthogonal to my argument.

Now please re-read my email :)

That said, I'd advocate for something in between a raw dump and a pure
parametric representation of the data. Raw data tends to be, well,
too raw, which makes the task of reading data from older version by
newer kernels harder to maintain. On the other hand, it is impossible
to abstract everything into kernel-independent format.



In fact, the very reason not to go the route you and Pavel are
advocating is that if we just dump task state to a file or filesystem
from the kernel in one shot, we'll be much more tempted to lay out data
in a way that exports and ends up depending on kernel internals.  So
we'll just want to read and write the task_struct verbatim.

So, there are two very different approaches we can start with.
Whichever one we follow, we want to avoid having kernel version
dependencies.  They both have their merits to be sure.


You will never be able to avoid that completely, simply because new
kernels will require saving more (or less) data per object, because
of new (or dropped) features.
The best solution in this sense is to provide a filter (hopefully
in user space, utility) that would convert a checkpoint image file
from the old format to a newer format.
And you keep a lot of compatibility code of the kernel, too.



But note that in either case we need to deal with a bunch of locking.
So getting back to Pierre's patchset, IIRC 1-8 are cleanups worth
doing no matter 1.  9-11 sound like they are contentuous until
we decide whether we want to go with a create_with_id() type approach
or a set_id().  12 is IMO a good locking cleanup regardless.  13 and
15 are contentous until we decide whether we want userspace-controlled
checkpoint or a one-shot fs.  14 IMO is useful for both c/r approaches.

Is that pretty accurate?


(context switch back to my original reply)

I prefer not to add a new interface to IPC that will provide a new
functionality that isn't needed, except for the checkpoint - because
there is a better alternative to do the same task; this alternative
is more suitable because (a) it can be applied incrementally, (b) it
provides a consistent method to pre-select identifiers of all syscalls,
(where is the current suggestion suggests one way for IPC and will
suggest other hacks for other resources).

(context switch back to the current reply)

I definitely welcome a cleanup of the (insanely multiplexedd) IPC
code. However I argue that the interface need not be extended.




It isn't strictly necessary to export a new interface in order to
support checkpoint/restart. **. Hence, I think that the speculation
we may need it in the future is too abstract and isn't a good
excuse to commit to a new, currently unneeded, interface.


OTOH it did succeed in starting some conversation :)


Should the
need arise in the future, it will be easy to design a new interface
(also based on aggregated experience until then).


What aggregated experience?  We have to start somewhere...


:)  well, assuming the selection of resource IDs is done as I suggested,
we'll have the restart use it. If someone finds a good reason (other
than checkpoint/restart) to pre-select/modify an identifier, it will
be easy to _then_ add an interface. That (hypothetical) interface is
likely to come out more clever after X months using checkpoint/restart.




** In fact, the suggested interface may prove problematic (as noted
earlier in this thread): if you first create the resource with some
arbitrary identifier and then modify the identifier (in our case,
IPC id), then the restart procedure is bound to execute sequentially,
because of lack of atomicity.


Hmm?  Lack of atomicity wrt what?  All the tasks being restarted were
checkpointed at the same time so there will be no conflict in the
requested IDs, so I don't know

Re: [patch 1/2] [RFC] Simple tamper-proof device filesystem.

2007-12-20 Thread Oren Laadan


Pavel Emelyanov wrote:
> Oren Laadan wrote:
>> Serge E. Hallyn wrote:
>>> Quoting Pavel Emelyanov ([EMAIL PROTECTED]):
>>>> Oren Laadan wrote:
>>>>> Serge E. Hallyn wrote:
>>>>>> Quoting Oren Laadan ([EMAIL PROTECTED]):
>>>>>>> I hate to bring this again, but what if the admin in the container
>>>>>>> mounts an external file system (eg. nfs, usb, loop mount from a file,
>>>>>>> or via fuse), and that file system already has a device that we would
>>>>>>> like to ban inside that container ?
>>>>>> Miklos' user mount patches enforced that if !capable(CAP_MKNOD),
>>>>>> then mnt->mnt_flags |= MNT_NODEV.  So that's no problem.
>>>>> Yes, that works to disallow all device files from a mounted file system.
>>>>>
>>>>> But it's a black and white thing: either they are all banned or allowed;
>>>>> you can't have some devices allowed and others not, depending on type
>>>>> A scenario where this may be useful is, for instance, if we some apps in
>>>>> the container to execute withing a pre-made chroot (sub)tree within that
>>>>> container.
>>>>>
>>>>>> But that's been pulled out of -mm! ?  Crap.
>>>>>>
>>>>>>> Since anyway we will have to keep a white- (or black-) list of devices
>>>>>>> that are permitted in a container, and that list may change even change
>>>>>>> per container -- why not enforce the access control at the VFS layer ?
>>>>>>> It's safer in the long run.
>>>>>> By that you mean more along the lines of Pavel's patch than my whitelist
>>>>>> LSM, or you actually mean Tetsuo's filesystem (i assume you don't mean 
>>>>>> that
>>>>>> by 'vfs layer' :), or something different entirely?
>>>>> :)
>>>>>
>>>>> By 'vfs' I mean at open() time, and not at mount(), or mknod() time.
>>>>> Either yours or Pavel's; I tend to prefer not to use LSM as it may
>>>>> collide with future security modules.
>>>> Oren, AFAIS you've seen my patches for device access controller, right?
>> If you mean this one:
>> http://openvz.org/pipermail/devel/2007-September/007647.html
>> then ack :)
> 
> Great! Thanks.
> 
>>>> Maybe we can revisit the issue then and try to come to agreement on what
>>>> kind of model and implementation we all want?
>>> That would be great, Pavel.  I do prefer your solution over my LSM, so
>>> if we can get an elegant block device control right in the vfs code that
>>> would be my preference.
>> I concur.
>>
>> So it seems to me that we are all in favor of the model where open()
>> of a device will consult a black/white-list. Also, we are all in favor
>> of a non-LSM implementation, Pavel's code being a good example.
> 
> Thank you, Oren and Serge! I will revisit this issue then, but
> I have a vacation the next week and, after this, we have a New
> Year and Christmas holidays in Russia. So I will be able to go
> on with it only after the 7th January :( Hope this is OK for you.
> 
> Besides, Andrew told that he would pay little attention to new
> features till the 2.6.24 release, so I'm afraid we won't have this 
> even in -mm in the nearest months :(

Sounds great !  (as for the delay, it wasn't the highest priority issue
to begin with, so no worries).

Ah.. coincidentally they are celebrated here, too, on the same time :D
Merry Christmas and Happy New Year !

Oren.

> 
> Thanks,
> Pavel
> 
>> Oren.
>>
>>> The only thing that makes me keep wanting to go back to an LSM is the
>>> fact that the code defining the whitelist seems out of place in the vfs.
>>> But I guess that's actually separated into a modular cgroup, with the
>>> actual enforcement built in at the vfs.  So that's really the best
>>> solution.
>>>
>>> -serge
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 1/2] [RFC] Simple tamper-proof device filesystem.

2007-12-20 Thread Oren Laadan


Pavel Emelyanov wrote:
 Oren Laadan wrote:
 Serge E. Hallyn wrote:
 Quoting Pavel Emelyanov ([EMAIL PROTECTED]):
 Oren Laadan wrote:
 Serge E. Hallyn wrote:
 Quoting Oren Laadan ([EMAIL PROTECTED]):
 I hate to bring this again, but what if the admin in the container
 mounts an external file system (eg. nfs, usb, loop mount from a file,
 or via fuse), and that file system already has a device that we would
 like to ban inside that container ?
 Miklos' user mount patches enforced that if !capable(CAP_MKNOD),
 then mnt-mnt_flags |= MNT_NODEV.  So that's no problem.
 Yes, that works to disallow all device files from a mounted file system.

 But it's a black and white thing: either they are all banned or allowed;
 you can't have some devices allowed and others not, depending on type
 A scenario where this may be useful is, for instance, if we some apps in
 the container to execute withing a pre-made chroot (sub)tree within that
 container.

 But that's been pulled out of -mm! ?  Crap.

 Since anyway we will have to keep a white- (or black-) list of devices
 that are permitted in a container, and that list may change even change
 per container -- why not enforce the access control at the VFS layer ?
 It's safer in the long run.
 By that you mean more along the lines of Pavel's patch than my whitelist
 LSM, or you actually mean Tetsuo's filesystem (i assume you don't mean 
 that
 by 'vfs layer' :), or something different entirely?
 :)

 By 'vfs' I mean at open() time, and not at mount(), or mknod() time.
 Either yours or Pavel's; I tend to prefer not to use LSM as it may
 collide with future security modules.
 Oren, AFAIS you've seen my patches for device access controller, right?
 If you mean this one:
 http://openvz.org/pipermail/devel/2007-September/007647.html
 then ack :)
 
 Great! Thanks.
 
 Maybe we can revisit the issue then and try to come to agreement on what
 kind of model and implementation we all want?
 That would be great, Pavel.  I do prefer your solution over my LSM, so
 if we can get an elegant block device control right in the vfs code that
 would be my preference.
 I concur.

 So it seems to me that we are all in favor of the model where open()
 of a device will consult a black/white-list. Also, we are all in favor
 of a non-LSM implementation, Pavel's code being a good example.
 
 Thank you, Oren and Serge! I will revisit this issue then, but
 I have a vacation the next week and, after this, we have a New
 Year and Christmas holidays in Russia. So I will be able to go
 on with it only after the 7th January :( Hope this is OK for you.
 
 Besides, Andrew told that he would pay little attention to new
 features till the 2.6.24 release, so I'm afraid we won't have this 
 even in -mm in the nearest months :(

Sounds great !  (as for the delay, it wasn't the highest priority issue
to begin with, so no worries).

Ah.. coincidentally they are celebrated here, too, on the same time :D
Merry Christmas and Happy New Year !

Oren.

 
 Thanks,
 Pavel
 
 Oren.

 The only thing that makes me keep wanting to go back to an LSM is the
 fact that the code defining the whitelist seems out of place in the vfs.
 But I guess that's actually separated into a modular cgroup, with the
 actual enforcement built in at the vfs.  So that's really the best
 solution.

 -serge
 
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 1/2] [RFC] Simple tamper-proof device filesystem.

2007-12-19 Thread Oren Laadan


Serge E. Hallyn wrote:
> Quoting Pavel Emelyanov ([EMAIL PROTECTED]):
>> Oren Laadan wrote:
>>> Serge E. Hallyn wrote:
>>>> Quoting Oren Laadan ([EMAIL PROTECTED]):
>>>>> I hate to bring this again, but what if the admin in the container
>>>>> mounts an external file system (eg. nfs, usb, loop mount from a file,
>>>>> or via fuse), and that file system already has a device that we would
>>>>> like to ban inside that container ?
>>>> Miklos' user mount patches enforced that if !capable(CAP_MKNOD),
>>>> then mnt->mnt_flags |= MNT_NODEV.  So that's no problem.
>>> Yes, that works to disallow all device files from a mounted file system.
>>>
>>> But it's a black and white thing: either they are all banned or allowed;
>>> you can't have some devices allowed and others not, depending on type
>>> A scenario where this may be useful is, for instance, if we some apps in
>>> the container to execute withing a pre-made chroot (sub)tree within that
>>> container.
>>>
>>>> But that's been pulled out of -mm! ?  Crap.
>>>>
>>>>> Since anyway we will have to keep a white- (or black-) list of devices
>>>>> that are permitted in a container, and that list may change even change
>>>>> per container -- why not enforce the access control at the VFS layer ?
>>>>> It's safer in the long run.
>>>> By that you mean more along the lines of Pavel's patch than my whitelist
>>>> LSM, or you actually mean Tetsuo's filesystem (i assume you don't mean that
>>>> by 'vfs layer' :), or something different entirely?
>>> :)
>>>
>>> By 'vfs' I mean at open() time, and not at mount(), or mknod() time.
>>> Either yours or Pavel's; I tend to prefer not to use LSM as it may
>>> collide with future security modules.
>> Oren, AFAIS you've seen my patches for device access controller, right?

If you mean this one:
http://openvz.org/pipermail/devel/2007-September/007647.html
then ack :)

>>
>> Maybe we can revisit the issue then and try to come to agreement on what
>> kind of model and implementation we all want?
> 
> That would be great, Pavel.  I do prefer your solution over my LSM, so
> if we can get an elegant block device control right in the vfs code that
> would be my preference.

I concur.

So it seems to me that we are all in favor of the model where open()
of a device will consult a black/white-list. Also, we are all in favor
of a non-LSM implementation, Pavel's code being a good example.

Oren.

> The only thing that makes me keep wanting to go back to an LSM is the
> fact that the code defining the whitelist seems out of place in the vfs.
> But I guess that's actually separated into a modular cgroup, with the
> actual enforcement built in at the vfs.  So that's really the best
> solution.
> 
> -serge
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 1/2] [RFC] Simple tamper-proof device filesystem.

2007-12-19 Thread Oren Laadan


Serge E. Hallyn wrote:
 Quoting Pavel Emelyanov ([EMAIL PROTECTED]):
 Oren Laadan wrote:
 Serge E. Hallyn wrote:
 Quoting Oren Laadan ([EMAIL PROTECTED]):
 I hate to bring this again, but what if the admin in the container
 mounts an external file system (eg. nfs, usb, loop mount from a file,
 or via fuse), and that file system already has a device that we would
 like to ban inside that container ?
 Miklos' user mount patches enforced that if !capable(CAP_MKNOD),
 then mnt-mnt_flags |= MNT_NODEV.  So that's no problem.
 Yes, that works to disallow all device files from a mounted file system.

 But it's a black and white thing: either they are all banned or allowed;
 you can't have some devices allowed and others not, depending on type
 A scenario where this may be useful is, for instance, if we some apps in
 the container to execute withing a pre-made chroot (sub)tree within that
 container.

 But that's been pulled out of -mm! ?  Crap.

 Since anyway we will have to keep a white- (or black-) list of devices
 that are permitted in a container, and that list may change even change
 per container -- why not enforce the access control at the VFS layer ?
 It's safer in the long run.
 By that you mean more along the lines of Pavel's patch than my whitelist
 LSM, or you actually mean Tetsuo's filesystem (i assume you don't mean that
 by 'vfs layer' :), or something different entirely?
 :)

 By 'vfs' I mean at open() time, and not at mount(), or mknod() time.
 Either yours or Pavel's; I tend to prefer not to use LSM as it may
 collide with future security modules.
 Oren, AFAIS you've seen my patches for device access controller, right?

If you mean this one:
http://openvz.org/pipermail/devel/2007-September/007647.html
then ack :)


 Maybe we can revisit the issue then and try to come to agreement on what
 kind of model and implementation we all want?
 
 That would be great, Pavel.  I do prefer your solution over my LSM, so
 if we can get an elegant block device control right in the vfs code that
 would be my preference.

I concur.

So it seems to me that we are all in favor of the model where open()
of a device will consult a black/white-list. Also, we are all in favor
of a non-LSM implementation, Pavel's code being a good example.

Oren.

 The only thing that makes me keep wanting to go back to an LSM is the
 fact that the code defining the whitelist seems out of place in the vfs.
 But I guess that's actually separated into a modular cgroup, with the
 actual enforcement built in at the vfs.  So that's really the best
 solution.
 
 -serge
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 1/2] [RFC] Simple tamper-proof device filesystem.

2007-12-17 Thread Oren Laadan

Serge E. Hallyn wrote:
> Quoting Oren Laadan ([EMAIL PROTECTED]):
>> I hate to bring this again, but what if the admin in the container
>> mounts an external file system (eg. nfs, usb, loop mount from a file,
>> or via fuse), and that file system already has a device that we would
>> like to ban inside that container ?
> 
> Miklos' user mount patches enforced that if !capable(CAP_MKNOD),
> then mnt->mnt_flags |= MNT_NODEV.  So that's no problem.

Yes, that works to disallow all device files from a mounted file system.

But it's a black and white thing: either they are all banned or allowed;
you can't have some devices allowed and others not, depending on type
A scenario where this may be useful is, for instance, if we some apps in
the container to execute withing a pre-made chroot (sub)tree within that
container.

> 
> But that's been pulled out of -mm! ?  Crap.
> 
>> Since anyway we will have to keep a white- (or black-) list of devices
>> that are permitted in a container, and that list may change even change
>> per container -- why not enforce the access control at the VFS layer ?
>> It's safer in the long run.
> 
> By that you mean more along the lines of Pavel's patch than my whitelist
> LSM, or you actually mean Tetsuo's filesystem (i assume you don't mean that
> by 'vfs layer' :), or something different entirely?

:)

By 'vfs' I mean at open() time, and not at mount(), or mknod() time.
Either yours or Pavel's; I tend to prefer not to use LSM as it may
collide with future security modules.

Oren.

> 
> thanks,
> -serge
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 1/2] [RFC] Simple tamper-proof device filesystem.

2007-12-17 Thread Oren Laadan

I hate to bring this again, but what if the admin in the container
mounts an external file system (eg. nfs, usb, loop mount from a file,
or via fuse), and that file system already has a device that we would
like to ban inside that container ?

Since anyway we will have to keep a white- (or black-) list of devices
that are permitted in a container, and that list may change even change
per container -- why not enforce the access control at the VFS layer ?
It's safer in the long run.

Oren.

Serge E. Hallyn wrote:
> Quoting Tetsuo Handa ([EMAIL PROTECTED]):
>> Hello.
>>
>> Serge E. Hallyn wrote:
>>> CAP_MKNOD will be removed from its capability
>> I think it is not enough because the root can rename/unlink device files
>> (mv /dev/sda1 /dev/tmp; mv /dev/sda2 /dev/sda1; mv /dev/tmp /dev/sda2).
> 
> Sure but that doesn't bother us :)
> 
> The admin in the container has his own /dev directory and can do what he
> likes with the devices he's allowed to have.  He just shouldn't have
> access to others.  If he wants to rename /dev/sda1 to /dev/sda5 that's
> his choice.
> 
>>> To use your approach, i guess we would have to use selinux (or tomoyo)
>>> to enforce that devices may only be created under /dev?
>> Everyone can use this filesystem alone.
> 
> Sure but it is worthless alone.
> 
> No?
> 
> What will keep the container admin from doing 'mknod /root/hda1 b 3 1'?
> 
>> But use with MAC (or whatever access control mechanisms that prevent
>> attackers from unmounting/overlaying this filesystem) is recomennded.
> 
> -serge
> ___
> Containers mailing list
> [EMAIL PROTECTED]
> https://lists.linux-foundation.org/mailman/listinfo/containers
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 1/2] [RFC] Simple tamper-proof device filesystem.

2007-12-17 Thread Oren Laadan

I hate to bring this again, but what if the admin in the container
mounts an external file system (eg. nfs, usb, loop mount from a file,
or via fuse), and that file system already has a device that we would
like to ban inside that container ?

Since anyway we will have to keep a white- (or black-) list of devices
that are permitted in a container, and that list may change even change
per container -- why not enforce the access control at the VFS layer ?
It's safer in the long run.

Oren.

Serge E. Hallyn wrote:
 Quoting Tetsuo Handa ([EMAIL PROTECTED]):
 Hello.

 Serge E. Hallyn wrote:
 CAP_MKNOD will be removed from its capability
 I think it is not enough because the root can rename/unlink device files
 (mv /dev/sda1 /dev/tmp; mv /dev/sda2 /dev/sda1; mv /dev/tmp /dev/sda2).
 
 Sure but that doesn't bother us :)
 
 The admin in the container has his own /dev directory and can do what he
 likes with the devices he's allowed to have.  He just shouldn't have
 access to others.  If he wants to rename /dev/sda1 to /dev/sda5 that's
 his choice.
 
 To use your approach, i guess we would have to use selinux (or tomoyo)
 to enforce that devices may only be created under /dev?
 Everyone can use this filesystem alone.
 
 Sure but it is worthless alone.
 
 No?
 
 What will keep the container admin from doing 'mknod /root/hda1 b 3 1'?
 
 But use with MAC (or whatever access control mechanisms that prevent
 attackers from unmounting/overlaying this filesystem) is recomennded.
 
 -serge
 ___
 Containers mailing list
 [EMAIL PROTECTED]
 https://lists.linux-foundation.org/mailman/listinfo/containers
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 1/2] [RFC] Simple tamper-proof device filesystem.

2007-12-17 Thread Oren Laadan

Serge E. Hallyn wrote:
 Quoting Oren Laadan ([EMAIL PROTECTED]):
 I hate to bring this again, but what if the admin in the container
 mounts an external file system (eg. nfs, usb, loop mount from a file,
 or via fuse), and that file system already has a device that we would
 like to ban inside that container ?
 
 Miklos' user mount patches enforced that if !capable(CAP_MKNOD),
 then mnt-mnt_flags |= MNT_NODEV.  So that's no problem.

Yes, that works to disallow all device files from a mounted file system.

But it's a black and white thing: either they are all banned or allowed;
you can't have some devices allowed and others not, depending on type
A scenario where this may be useful is, for instance, if we some apps in
the container to execute withing a pre-made chroot (sub)tree within that
container.

 
 But that's been pulled out of -mm! ?  Crap.
 
 Since anyway we will have to keep a white- (or black-) list of devices
 that are permitted in a container, and that list may change even change
 per container -- why not enforce the access control at the VFS layer ?
 It's safer in the long run.
 
 By that you mean more along the lines of Pavel's patch than my whitelist
 LSM, or you actually mean Tetsuo's filesystem (i assume you don't mean that
 by 'vfs layer' :), or something different entirely?

:)

By 'vfs' I mean at open() time, and not at mount(), or mknod() time.
Either yours or Pavel's; I tend to prefer not to use LSM as it may
collide with future security modules.

Oren.

 
 thanks,
 -serge
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch -mm 2/4] mqueue namespace : add unshare support

2007-11-29 Thread Oren Laadan


Cedric Le Goater wrote:
>>> Index: 2.6.24-rc3-mm2/include/linux/sched.h
>>> ===
>>> --- 2.6.24-rc3-mm2.orig/include/linux/sched.h
>>> +++ 2.6.24-rc3-mm2/include/linux/sched.h
>>> @@ -27,6 +27,7 @@
>>>  #define CLONE_NEWUSER  0x1000  /* New user namespace */
>>>  #define CLONE_NEWPID   0x2000  /* New pid namespace */
>>>  #define CLONE_NEWNET   0x4000  /* New network 
>>> namespace */
>>> +#define CLONE_NEWMQ0x8000  /* New posix mqueue 
>>> namespace */
>> That's it :) We've run out of clone flags on 32-bit platforms :(
> 
> yes. 
> 
> I have been giving some thoughts to a clone2() to extend the flags but
> andrew is preparing to recycle CLONE_DETACHED and CLONE_STOPPED for
> 2.6.26. Some we might have some more time in front of us.

Two comments:

1) Does it ever make any sense to clone the IPC namespace *without* doing
so also for the MQ namespace or vice versa ?  Unless there is a good
reason for doing so, a single CLONE_IPCMQ flag would suffice.

2) Before coming up with a new clone2() or other solution, what about the
proposed (and debated) sys_indrect() -- if it gets merged it can provide
the solution.

Oren.

> 
> C.
> ___
> Containers mailing list
> [EMAIL PROTECTED]
> https://lists.linux-foundation.org/mailman/listinfo/containers
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch -mm 2/4] mqueue namespace : add unshare support

2007-11-29 Thread Oren Laadan


Cedric Le Goater wrote:
 Index: 2.6.24-rc3-mm2/include/linux/sched.h
 ===
 --- 2.6.24-rc3-mm2.orig/include/linux/sched.h
 +++ 2.6.24-rc3-mm2/include/linux/sched.h
 @@ -27,6 +27,7 @@
  #define CLONE_NEWUSER  0x1000  /* New user namespace */
  #define CLONE_NEWPID   0x2000  /* New pid namespace */
  #define CLONE_NEWNET   0x4000  /* New network 
 namespace */
 +#define CLONE_NEWMQ0x8000  /* New posix mqueue 
 namespace */
 That's it :) We've run out of clone flags on 32-bit platforms :(
 
 yes. 
 
 I have been giving some thoughts to a clone2() to extend the flags but
 andrew is preparing to recycle CLONE_DETACHED and CLONE_STOPPED for
 2.6.26. Some we might have some more time in front of us.

Two comments:

1) Does it ever make any sense to clone the IPC namespace *without* doing
so also for the MQ namespace or vice versa ?  Unless there is a good
reason for doing so, a single CLONE_IPCMQ flag would suffice.

2) Before coming up with a new clone2() or other solution, what about the
proposed (and debated) sys_indrect() -- if it gets merged it can provide
the solution.

Oren.

 
 C.
 ___
 Containers mailing list
 [EMAIL PROTECTED]
 https://lists.linux-foundation.org/mailman/listinfo/containers
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/