date:20071220

With more and more sub-systems/sub-components leaving their footprint
in task handling functions, it seems reasonable to add notifiers that
these components can use instead of having them all patch themselves
directly into core files.

Patch 1 introduces the base definitions and hooks for task creation
and deletion.
Patch 2 switches delayacct to make use of the notifier.
Patch 3 makes the procevents/connector use the infrastructure and adds
additional notifiers needed there.
Patch 4 makes the security keys handling use this, too.

Signed-off-by: Jan Beulich [EMAIL PROTECTED]


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/4] add task handling notifier: base definitions

This is the base patch, adding notification for task creation and
deletion.

Signed-off-by: Jan Beulich [EMAIL PROTECTED]
---
 include/linux/sched.h |8 +++-
 kernel/fork.c |   11 +++
 2 files changed, 18 insertions(+), 1 deletion(-)

--- 2.6.24-rc5-notify-task.orig/include/linux/sched.h
+++ 2.6.24-rc5-notify-task/include/linux/sched.h
@@ -80,7 +80,7 @@ struct sched_param {
 #include linux/rcupdate.h
 #include linux/futex.h
 #include linux/rtmutex.h
-
+#include linux/notifier.h
 #include linux/time.h
 #include linux/param.h
 #include linux/resource.h
@@ -1700,6 +1700,12 @@ extern int do_execve(char *, char __user
 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned 
long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 
+#define TASK_NEW 1
+#define TASK_DELETE 2
+
+extern struct blocking_notifier_head task_notifier_list;
+extern struct atomic_notifier_head atomic_task_notifier_list;
+
 extern void set_task_comm(struct task_struct *tsk, char *from);
 extern void get_task_comm(char *to, struct task_struct *tsk);
 
--- 2.6.24-rc5-notify-task.orig/kernel/fork.c
+++ 2.6.24-rc5-notify-task/kernel/fork.c
@@ -46,6 +46,7 @@
 #include linux/tsacct_kern.h
 #include linux/cn_proc.h
 #include linux/freezer.h
+#include linux/notifier.h
 #include linux/delayacct.h
 #include linux/taskstats_kern.h
 #include linux/random.h
@@ -71,6 +72,11 @@ DEFINE_PER_CPU(unsigned long, process_co
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
+BLOCKING_NOTIFIER_HEAD(task_notifier_list);
+EXPORT_SYMBOL_GPL(task_notifier_list);
+ATOMIC_NOTIFIER_HEAD(atomic_task_notifier_list);
+EXPORT_SYMBOL_GPL(atomic_task_notifier_list);
+
 int nr_processes(void)
 {
int cpu;
@@ -121,6 +127,9 @@ void __put_task_struct(struct task_struc
WARN_ON(atomic_read(tsk-usage));
WARN_ON(tsk == current);
 
+   atomic_notifier_call_chain(atomic_task_notifier_list,
+  TASK_DELETE, tsk);
+
security_task_free(tsk);
free_uid(tsk-user);
put_group_info(tsk-group_info);
@@ -1450,6 +1459,8 @@ long do_fork(unsigned long clone_flags,
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
 
+   blocking_notifier_call_chain(task_notifier_list, TASK_NEW, p);
+
if (!(clone_flags  CLONE_STOPPED))
wake_up_new_task(p, clone_flags);
else



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/4] add task handling notifier: connector based proc events

This has the additional benefit of allowing the code to now be built
as a module (which made it necessary to add MODULE_xxx declarations).

Signed-off-by: Jan Beulich [EMAIL PROTECTED]
Cc: Matt Helsley [EMAIL PROTECTED]
---
 drivers/connector/Kconfig   |5 +--
 drivers/connector/cn_proc.c |   57 
 fs/exec.c   |5 ++-
 include/linux/cn_proc.h |   21 
 include/linux/sched.h   |4 +++
 kernel/exit.c   |5 ++-
 kernel/fork.c   |2 -
 kernel/sys.c|   27 +---
 8 files changed, 83 insertions(+), 43 deletions(-)

--- 2.6.24-rc5-notify-task.orig/drivers/connector/Kconfig
+++ 2.6.24-rc5-notify-task/drivers/connector/Kconfig
@@ -12,9 +12,8 @@ menuconfig CONNECTOR
 if CONNECTOR
 
 config PROC_EVENTS
-   boolean Report process events to userspace
-   depends on CONNECTOR=y
-   default y
+   tristate Report process events to userspace
+   default CONNECTOR
---help---
  Provide a connector that reports process events to userspace. Send
  events such as fork, exec, id change (uid, gid, suid, etc), and exit.
--- 2.6.24-rc5-notify-task.orig/drivers/connector/cn_proc.c
+++ 2.6.24-rc5-notify-task/drivers/connector/cn_proc.c
@@ -26,12 +26,17 @@
 #include linux/kernel.h
 #include linux/ktime.h
 #include linux/init.h
+#include linux/notifier.h
 #include linux/connector.h
 #include asm/atomic.h
 #include asm/unaligned.h
 
 #include linux/cn_proc.h
 
+MODULE_LICENSE(GPL);
+MODULE_AUTHOR(Matt Helsley [EMAIL PROTECTED]);
+MODULE_DESCRIPTION(Process events - userspace connector);
+
 #define CN_PROC_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct proc_event))
 
 static atomic_t proc_event_num_listeners = ATOMIC_INIT(0);
@@ -47,7 +52,7 @@ static inline void get_seq(__u32 *ts, in
put_cpu_var(proc_event_counts);
 }
 
-void proc_fork_connector(struct task_struct *task)
+static void proc_fork_connector(struct task_struct *task)
 {
struct cn_msg *msg;
struct proc_event *ev;
@@ -75,7 +80,7 @@ void proc_fork_connector(struct task_str
cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
-void proc_exec_connector(struct task_struct *task)
+static void proc_exec_connector(struct task_struct *task)
 {
struct cn_msg *msg;
struct proc_event *ev;
@@ -100,7 +105,7 @@ void proc_exec_connector(struct task_str
cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
-void proc_id_connector(struct task_struct *task, int which_id)
+static void proc_id_connector(struct task_struct *task, int which_id)
 {
struct cn_msg *msg;
struct proc_event *ev;
@@ -133,7 +138,7 @@ void proc_id_connector(struct task_struc
cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
 }
 
-void proc_exit_connector(struct task_struct *task)
+static void proc_exit_connector(struct task_struct *task)
 {
struct cn_msg *msg;
struct proc_event *ev;
@@ -220,6 +225,35 @@ static void cn_proc_mcast_ctl(void *data
cn_proc_ack(err, msg-seq, msg-ack);
 }
 
+static int connector_task_notifier(struct notifier_block *nb,
+  unsigned long action,
+  void *task)
+{
+   switch (action) {
+   case TASK_NEW:
+   proc_fork_connector(task);
+   break;
+   case TASK_EXEC:
+   proc_exec_connector(task);
+   break;
+   case TASK_EXIT:
+   proc_exit_connector(task);
+   break;
+   case TASK_UID_CHANGE:
+   proc_id_connector(task, PROC_EVENT_UID);
+   break;
+   case TASK_GID_CHANGE:
+   proc_id_connector(task, PROC_EVENT_GID);
+   break;
+   }
+
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block connector_task_notifier_block = {
+   .notifier_call = connector_task_notifier
+};
+
 /*
  * cn_proc_init - initialization entry point
  *
@@ -234,7 +268,22 @@ static int __init cn_proc_init(void)
printk(KERN_WARNING cn_proc failed to register\n);
return err;
}
+   blocking_notifier_chain_register(task_notifier_list,
+connector_task_notifier_block);
return 0;
 }
 
+/*
+ * cn_proc_exit - unload entry point
+ *
+ * Removes the connector callback from the connector driver.
+ */
+static void __exit cn_proc_exit(void)
+{
+   blocking_notifier_chain_unregister(task_notifier_list,
+  connector_task_notifier_block);
+   cn_del_callback(cn_proc_event_id);
+}
+
 module_init(cn_proc_init);
+module_exit(cn_proc_exit);
--- 2.6.24-rc5-notify-task.orig/fs/exec.c
+++ 2.6.24-rc5-notify-task/fs/exec.c
@@ -49,7 +49,7 @@
 #include linux/syscalls.h
 #include linux/rmap.h
 #include linux/tsacct_kern.h
-#include linux/cn_proc.h
+#include linux/notifier.h
 #include linux/audit.h
 
 #include

[PATCH 4/4] add task handling notifier: security keys

Signed-off-by: Jan Beulich [EMAIL PROTECTED]
Cc: David Howells [EMAIL PROTECTED]
---
 arch/mips/kernel/kspd.c  |7 +++--
 include/linux/key.h  |4 ---
 kernel/sys.c |8 --
 security/keys/process_keys.c |   55 ++-
 4 files changed, 39 insertions(+), 35 deletions(-)

--- 2.6.24-rc5-notify-task.orig/arch/mips/kernel/kspd.c
+++ 2.6.24-rc5-notify-task/arch/mips/kernel/kspd.c
@@ -25,6 +25,7 @@
 #include linux/workqueue.h
 #include linux/errno.h
 #include linux/list.h
+#include linux/notifier.h
 
 #include asm/vpe.h
 #include asm/rtlx.h
@@ -177,8 +178,10 @@ static void sp_setfsuidgid( uid_t uid, g
current-fsuid = uid;
current-fsgid = gid;
 
-   key_fsuid_changed(current);
-   key_fsgid_changed(current);
+   blocking_notifier_call_chain(task_notifier_list,
+TASK_UID_CHANGE, current);
+   blocking_notifier_call_chain(task_notifier_list,
+TASK_GID_CHANGE, current);
 }
 
 /*
--- 2.6.24-rc5-notify-task.orig/include/linux/key.h
+++ 2.6.24-rc5-notify-task/include/linux/key.h
@@ -272,8 +272,6 @@ extern void exit_keys(struct task_struct
 extern void exit_thread_group_keys(struct signal_struct *tg);
 extern int suid_keys(struct task_struct *tsk);
 extern int exec_keys(struct task_struct *tsk);
-extern void key_fsuid_changed(struct task_struct *tsk);
-extern void key_fsgid_changed(struct task_struct *tsk);
 extern void key_init(void);
 
 #define __install_session_keyring(tsk, keyring)\
@@ -302,8 +300,6 @@ extern void key_init(void);
 #define exit_thread_group_keys(tg) do { } while(0)
 #define suid_keys(t)   do { } while(0)
 #define exec_keys(t)   do { } while(0)
-#define key_fsuid_changed(t)   do { } while(0)
-#define key_fsgid_changed(t)   do { } while(0)
 #define key_init() do { } while(0)
 
 /* Initial keyrings */
--- 2.6.24-rc5-notify-task.orig/kernel/sys.c
+++ 2.6.24-rc5-notify-task/kernel/sys.c
@@ -517,7 +517,6 @@ asmlinkage long sys_setregid(gid_t rgid,
current-fsgid = new_egid;
current-egid = new_egid;
current-gid = new_rgid;
-   key_fsgid_changed(current);
blocking_notifier_call_chain(task_notifier_list,
 TASK_GID_CHANGE, current);
 
@@ -554,7 +553,6 @@ asmlinkage long sys_setgid(gid_t gid)
else
return -EPERM;
 
-   key_fsgid_changed(current);
blocking_notifier_call_chain(task_notifier_list,
 TASK_GID_CHANGE, current);
 
@@ -644,7 +642,6 @@ asmlinkage long sys_setreuid(uid_t ruid,
current-suid = current-euid;
current-fsuid = current-euid;
 
-   key_fsuid_changed(current);
blocking_notifier_call_chain(task_notifier_list,
 TASK_UID_CHANGE, current);
 
@@ -692,7 +689,6 @@ asmlinkage long sys_setuid(uid_t uid)
current-fsuid = current-euid = uid;
current-suid = new_suid;
 
-   key_fsuid_changed(current);
blocking_notifier_call_chain(task_notifier_list,
 TASK_UID_CHANGE, current);
 
@@ -741,7 +737,6 @@ asmlinkage long sys_setresuid(uid_t ruid
if (suid != (uid_t) -1)
current-suid = suid;
 
-   key_fsuid_changed(current);
blocking_notifier_call_chain(task_notifier_list,
 TASK_UID_CHANGE, current);
 
@@ -794,7 +789,6 @@ asmlinkage long sys_setresgid(gid_t rgid
if (sgid != (gid_t) -1)
current-sgid = sgid;
 
-   key_fsgid_changed(current);
blocking_notifier_call_chain(task_notifier_list,
 TASK_GID_CHANGE, current);
return 0;
@@ -836,7 +830,6 @@ asmlinkage long sys_setfsuid(uid_t uid)
current-fsuid = uid;
}
 
-   key_fsuid_changed(current);
blocking_notifier_call_chain(task_notifier_list,
 TASK_UID_CHANGE, current);
 
@@ -864,7 +857,6 @@ asmlinkage long sys_setfsgid(gid_t gid)
smp_wmb();
}
current-fsgid = gid;
-   key_fsgid_changed(current);
blocking_notifier_call_chain(task_notifier_list,
 TASK_GID_CHANGE, current);
}
--- 2.6.24-rc5-notify-task.orig/security/keys/process_keys.c
+++ 2.6.24-rc5-notify-task/security/keys/process_keys.c
@@ -16,6 +16,7 @@
 #include linux/keyctl.h
 #include linux/fs.h
 #include linux/err.h
+#include linux/notifier.h
 #include linux/mutex.h
 #include asm/uaccess.h
 #include internal.h
@@ -354,35 +355,47 @@ int suid_keys(struct task_struct *tsk)
 
 } /* end suid_keys() */
 
-/*/
-/*
- * the filesystem user ID changed
- */

[patch 1/5] x86, ptrace: rlimit BTS buffer allocation

Check the rlimit of the tracing task for total and locked memory when 
allocating the BTS buffer.

Signed-off-by: Markus Metzger [EMAIL PROTECTED]
---

Index: linux-2.6-x86/arch/x86/kernel/ptrace.c
===
--- linux-2.6-x86.orig/arch/x86/kernel/ptrace.c 2007-12-20 13:51:21.%N +0100
+++ linux-2.6-x86/arch/x86/kernel/ptrace.c  2007-12-20 13:51:45.%N +0100
@@ -620,12 +620,80 @@
return i;
 }
 
+static int ptrace_bts_realloc(struct task_struct *child,
+ int size, int reduce_size)
+{
+   unsigned long rlim, vm;
+   int ret, old_size;
+
+   if (size  0)
+   return -EINVAL;
+
+   old_size = ds_get_bts_size((void *)child-thread.ds_area_msr);
+   if (old_size  0)
+   return old_size;
+
+   ret = ds_free((void **)child-thread.ds_area_msr);
+   if (ret  0)
+   goto out;
+
+   size = PAGE_SHIFT;
+   old_size = PAGE_SHIFT;
+
+   current-mm-total_vm  -= old_size;
+   current-mm-locked_vm -= old_size;
+
+   if (size == 0)
+   goto out;
+
+   rlim = current-signal-rlim[RLIMIT_AS].rlim_cur  PAGE_SHIFT;
+   vm = current-mm-total_vm  + size;
+   if (rlim  vm) {
+   ret = -ENOMEM;
+
+   if (!reduce_size)
+   goto out;
+
+   size = rlim - current-mm-total_vm;
+   if (size = 0)
+   goto out;
+   }
+
+   rlim = current-signal-rlim[RLIMIT_MEMLOCK].rlim_cur  PAGE_SHIFT;
+   vm = current-mm-locked_vm  + size;
+   if (rlim  vm) {
+   ret = -ENOMEM;
+
+   if (!reduce_size)
+   goto out;
+
+   size = rlim - current-mm-locked_vm;
+   if (size = 0)
+   goto out;
+   }
+
+   ret = ds_allocate((void **)child-thread.ds_area_msr,
+ size  PAGE_SHIFT);
+   if (ret  0)
+   goto out;
+
+   current-mm-total_vm  += size;
+   current-mm-locked_vm += size;
+
+out:
+   if (child-thread.ds_area_msr)
+   set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
+   else
+   clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
+
+   return ret;
+}
+
 static int ptrace_bts_config(struct task_struct *child,
 const struct ptrace_bts_config __user *ucfg)
 {
struct ptrace_bts_config cfg;
-   unsigned long debugctl_mask;
-   int bts_size, ret;
+   int bts_size, ret = 0;
void *ds;
 
if (copy_from_user(cfg, ucfg, sizeof(cfg)))
@@ -638,59 +706,46 @@
if (bts_size  0)
return bts_size;
}
+   cfg.size = PAGE_ALIGN(cfg.size);
 
if (bts_size != cfg.size) {
-   ret = ds_free((void **)child-thread.ds_area_msr);
+   ret = ptrace_bts_realloc(child, cfg.size,
+cfg.flags  PTRACE_BTS_O_CUT_SIZE);
if (ret  0)
-   return ret;
+   goto errout;
 
-   if (cfg.size  0)
-   ret = ds_allocate((void **)child-thread.ds_area_msr,
- cfg.size);
ds = (void *)child-thread.ds_area_msr;
-   if (ds)
-   set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
-   else
-   clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
-
-   if (ret  0)
-   return ret;
-
-   bts_size = ds_get_bts_size(ds);
-   if (bts_size = 0)
-   return bts_size;
}
 
-   if (ds) {
-   if (cfg.flags  PTRACE_BTS_O_SIGNAL) {
-   ret = ds_set_overflow(ds, DS_O_SIGNAL);
-   } else {
-   ret = ds_set_overflow(ds, DS_O_WRAP);
-   }
-   if (ret  0)
-   return ret;
-   }
-
-   debugctl_mask = ds_debugctl_mask();
-   if (ds  (cfg.flags  PTRACE_BTS_O_TRACE)) {
-   child-thread.debugctlmsr |= debugctl_mask;
-   set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-   } else {
-   /* there is no way for us to check whether we 'own'
-* the respective bits in the DEBUGCTL MSR, we're
-* about to clear */
-   child-thread.debugctlmsr = ~debugctl_mask;
+   if (cfg.flags  PTRACE_BTS_O_SIGNAL)
+   ret = ds_set_overflow(ds, DS_O_SIGNAL);
+   else
+   ret = ds_set_overflow(ds, DS_O_WRAP);
+   if (ret  0)
+   goto errout;
 
-   if (!child-thread.debugctlmsr)
-   clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-   }
+   if (cfg.flags  PTRACE_BTS_O_TRACE)
+   child-thread.debugctlmsr |= ds_debugctl_mask();
+   else
+

[patch 2/5] x86, ptrace: support 32bit-cross-64bit BTS recording

Support BTS recording of 32bit and 64bit tasks from 32bit or 64bit tasks.


Signed-off-by: Markus Metzger [EMAIL PROTECTED]
---

Index: linux-2.6-x86/arch/x86/kernel/ds.c
===
--- linux-2.6-x86.orig/arch/x86/kernel/ds.c 2007-12-20 13:51:20.%N +0100
+++ linux-2.6-x86/arch/x86/kernel/ds.c  2007-12-20 13:52:01.%N +0100
@@ -111,53 +111,53 @@
  * Accessor functions for some DS and BTS fields using the above
  * global ptrace_bts_cfg.
  */
-static inline void *get_bts_buffer_base(char *base)
+static inline unsigned long get_bts_buffer_base(char *base)
 {
-   return *(void **)(base + ds_cfg.bts_buffer_base.offset);
+   return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
 }
-static inline void set_bts_buffer_base(char *base, void *value)
+static inline void set_bts_buffer_base(char *base, unsigned long value)
 {
-   (*(void **)(base + ds_cfg.bts_buffer_base.offset)) = value;
+   (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
 }
-static inline void *get_bts_index(char *base)
+static inline unsigned long get_bts_index(char *base)
 {
-   return *(void **)(base + ds_cfg.bts_index.offset);
+   return *(unsigned long *)(base + ds_cfg.bts_index.offset);
 }
-static inline void set_bts_index(char *base, void *value)
+static inline void set_bts_index(char *base, unsigned long value)
 {
-   (*(void **)(base + ds_cfg.bts_index.offset)) = value;
+   (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
 }
-static inline void *get_bts_absolute_maximum(char *base)
+static inline unsigned long get_bts_absolute_maximum(char *base)
 {
-   return *(void **)(base + ds_cfg.bts_absolute_maximum.offset);
+   return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
 }
-static inline void set_bts_absolute_maximum(char *base, void *value)
+static inline void set_bts_absolute_maximum(char *base, unsigned long value)
 {
-   (*(void **)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
+   (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
 }
-static inline void *get_bts_interrupt_threshold(char *base)
+static inline unsigned long get_bts_interrupt_threshold(char *base)
 {
-   return *(void **)(base + ds_cfg.bts_interrupt_threshold.offset);
+   return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
 }
-static inline void set_bts_interrupt_threshold(char *base, void *value)
+static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
 {
-   (*(void **)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
+   (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = 
value;
 }
-static inline long get_from_ip(char *base)
+static inline unsigned long get_from_ip(char *base)
 {
-   return *(long *)(base + ds_cfg.from_ip.offset);
+   return *(unsigned long *)(base + ds_cfg.from_ip.offset);
 }
-static inline void set_from_ip(char *base, long value)
+static inline void set_from_ip(char *base, unsigned long value)
 {
-   (*(long *)(base + ds_cfg.from_ip.offset)) = value;
+   (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
 }
-static inline long get_to_ip(char *base)
+static inline unsigned long get_to_ip(char *base)
 {
-   return *(long *)(base + ds_cfg.to_ip.offset);
+   return *(unsigned long *)(base + ds_cfg.to_ip.offset);
 }
-static inline void set_to_ip(char *base, long value)
+static inline void set_to_ip(char *base, unsigned long value)
 {
-   (*(long *)(base + ds_cfg.to_ip.offset)) = value;
+   (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
 }
 static inline unsigned char get_info_type(char *base)
 {
@@ -180,7 +180,7 @@
 int ds_allocate(void **dsp, size_t bts_size_in_bytes)
 {
size_t bts_size_in_records;
-   void *bts;
+   unsigned long bts;
void *ds;
 
if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
@@ -197,7 +197,7 @@
if (bts_size_in_bytes = 0)
return -EINVAL;
 
-   bts = kzalloc(bts_size_in_bytes, GFP_KERNEL);
+   bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
 
if (!bts)
return -ENOMEM;
@@ -205,7 +205,7 @@
ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
 
if (!ds) {
-   kfree(bts);
+   kfree((void *)bts);
return -ENOMEM;
}
 
@@ -221,7 +221,7 @@
 int ds_free(void **dsp)
 {
if (*dsp)
-   kfree(get_bts_buffer_base(*dsp));
+   kfree((void *)get_bts_buffer_base(*dsp));
kfree(*dsp);
*dsp = 0;
 
@@ -230,7 +230,7 @@
 
 int ds_get_bts_size(void *ds)
 {
-   size_t size_in_bytes;
+   int size_in_bytes;
 
if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
return -EOPNOTSUPP;
@@ -246,7 +246,7 @@
 
 int ds_get_bts_end(void *ds)
 {
-   size_t size_in_bytes = ds_get_bts_size(ds);
+   int size_in_bytes =

Re: [patch 3/3] Enable setting of IRQ-thread priorities from kernel cmdline. (repost:CC to LKML)

2007-12-20 Thread Juergen Beisert

On Thursday 20 December 2007 13:45, Jaswinder Singh wrote:
 On 12/20/07, Remy Bohmer [EMAIL PROTECTED] wrote:
  So, Is this a serious requirement? Should this be possible?

 I have noticed this problem:
 [EMAIL PROTECTED]:~# cat /proc/loadavgrt
 1.00 1.00 1.00 0/52 1158
 [EMAIL PROTECTED]:~# cat /proc/loadavg
 0.00 0.00 0.02 1/52 1159
 [EMAIL PROTECTED]:~#

 So I am curious, if possible, user can switch softirq-threads or IRQs
 RT tasks to non-RT tasks for slow hardware or least important hardware
 for NON-RT tasks. So this will improve RT behaviour.
^^
Why?

IMHO: Simply decrease the RT priority of less important IRQs or increase all 
other more important IRQs. IRQs are always more important than other 
processes in a system, also in non RT systems.

Juergen
-- 
Dipl.-Ing. Juergen Beisert | http://www.pengutronix.de
 Pengutronix - Linux Solutions for Science and Industry
    Handelsregister: Amtsgericht Hildesheim, HRA 2686
     Vertretung Sued/Muenchen, Germany
   Phone: +49-8766-939 228 |  Fax: +49-5121-206917-9
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 3/5] x86, ptrace: add buffer size checks

Pass the buffer size for (most) ptrace commands that pass user-allocated 
buffers and check that size before accessing the buffer. Unfortunately, 
PTRACE_BTS_GET already uses all 4 parameters.
Commands that access user buffers return the number of bytes or records read or 
written.


Signed-off-by: Markus Metzger [EMAIL PROTECTED]
---

Index: linux-2.6-x86/arch/x86/kernel/ptrace.c
===
--- linux-2.6-x86.orig/arch/x86/kernel/ptrace.c 2007-12-20 13:52:01.%N +0100
+++ linux-2.6-x86/arch/x86/kernel/ptrace.c  2007-12-20 13:52:09.%N +0100
@@ -591,6 +591,7 @@
 }
 
 static int ptrace_bts_drain(struct task_struct *child,
+   long size,
struct bts_struct __user *out)
 {
int end, i;
@@ -603,6 +604,9 @@
if (end = 0)
return end;
 
+   if (size  (end * sizeof(struct bts_struct)))
+   return -EIO;
+
for (i = 0; i  end; i++, out++) {
struct bts_struct ret;
int retval;
@@ -617,7 +621,7 @@
 
ds_clear(ds);
 
-   return i;
+   return end;
 }
 
 static int ptrace_bts_realloc(struct task_struct *child,
@@ -690,15 +694,22 @@
 }
 
 static int ptrace_bts_config(struct task_struct *child,
+long cfg_size,
 const struct ptrace_bts_config __user *ucfg)
 {
struct ptrace_bts_config cfg;
int bts_size, ret = 0;
void *ds;
 
+   if (cfg_size  sizeof(cfg))
+   return -EIO;
+
if (copy_from_user(cfg, ucfg, sizeof(cfg)))
return -EFAULT;
 
+   if ((int)cfg.size  0)
+   return -EINVAL;
+
bts_size = 0;
ds = (void *)child-thread.ds_area_msr;
if (ds) {
@@ -734,6 +745,8 @@
else
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
 
+   ret = sizeof(cfg);
+
 out:
if (child-thread.debugctlmsr)
set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
@@ -749,11 +762,15 @@
 }
 
 static int ptrace_bts_status(struct task_struct *child,
+long cfg_size,
 struct ptrace_bts_config __user *ucfg)
 {
void *ds = (void *)child-thread.ds_area_msr;
struct ptrace_bts_config cfg;
 
+   if (cfg_size  sizeof(cfg))
+   return -EIO;
+
memset(cfg, 0, sizeof(cfg));
 
if (ds) {
@@ -935,12 +952,12 @@
 
case PTRACE_BTS_CONFIG:
ret = ptrace_bts_config
-   (child, (struct ptrace_bts_config __user *)addr);
+   (child, data, (struct ptrace_bts_config __user *)addr);
break;
 
case PTRACE_BTS_STATUS:
ret = ptrace_bts_status
-   (child, (struct ptrace_bts_config __user *)addr);
+   (child, data, (struct ptrace_bts_config __user *)addr);
break;
 
case PTRACE_BTS_SIZE:
@@ -958,7 +975,7 @@
 
case PTRACE_BTS_DRAIN:
ret = ptrace_bts_drain
-   (child, (struct bts_struct __user *) addr);
+   (child, data, (struct bts_struct __user *) addr);
break;
 
default:
Index: linux-2.6-x86/include/asm-x86/ptrace-abi.h
===
--- linux-2.6-x86.orig/include/asm-x86/ptrace-abi.h 2007-12-20 13:52:01.%N 
+0100
+++ linux-2.6-x86/include/asm-x86/ptrace-abi.h  2007-12-20 13:52:09.%N +0100
@@ -99,13 +99,15 @@
 
 #define PTRACE_BTS_CONFIG  40
 /* Configure branch trace recording.
-   DATA is ignored, ADDR points to a struct ptrace_bts_config.
+   ADDR points to a struct ptrace_bts_config.
+   DATA gives the size of that buffer.
A new buffer is allocated, iff the size changes.
+   Returns the number of bytes read.
 */
 #define PTRACE_BTS_STATUS  41
-/* Return the current configuration.
-   DATA is ignored, ADDR points to a struct ptrace_bts_config
-   that will contain the result.
+/* Return the current configuration in a struct ptrace_bts_config
+   pointed to by ADDR; DATA gives the size of that buffer.
+   Returns the number of bytes written.
 */
 #define PTRACE_BTS_SIZE42
 /* Return the number of available BTS records.
@@ -123,8 +125,8 @@
 */
 #define PTRACE_BTS_DRAIN   45
 /* Read all available BTS records and clear the buffer.
-   DATA is ignored. ADDR points to an array of struct bts_struct of
-   suitable size.
+   ADDR points to an array of struct bts_struct.
+   DATA gives the size of that buffer.
BTS records are read from oldest to newest.
Returns number of BTS records drained.
 */
-
Intel GmbH
Dornacher Strasse 1
85622 Feldkirchen/Muenchen Germany
Sitz der Gesellschaft: Feldkirchen bei Muenchen
Geschaeftsfuehrer: Douglas Lusk, Peter Gleissner, Hannes Schwaderer

Re: [PATCH] Move page_assign_page_cgroup to VM_BUG_ON in free_hot_cold_page

2007-12-20 Thread Hugh Dickins

On Wed, 19 Dec 2007, Dave Hansen wrote:
  --- 
  linux-2.6.24-rc5/mm/page_alloc.c~memory-controller-move-to-bug-on-in-free_hot_cold_page
   2007-12-19 11:31:46.0 +0530
  +++ linux-2.6.24-rc5-balbir/mm/page_alloc.c 2007-12-19 
  11:33:45.0 +0530
  @@ -995,7 +995,7 @@ static void fastcall free_hot_cold_page(
   
  if (!PageHighMem(page))
  debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
  -   page_assign_page_cgroup(page, NULL);
  +   VM_BUG_ON(page_get_page_cgroup(page));
  arch_free_page(page, 0);
  kernel_map_pages(page, 1, 0); 
 
 Hi Balbir,
 
 You generally want to do these like:
 
   foo = page_assign_page_cgroup(page, NULL);
   VM_BUG_ON(foo);
 
 Some embedded people have been known to optimize kernel size like this:
 
   #define VM_BUG_ON(x) do{}while(0)

Balbir's patch looks fine to me: I don't get your point there, Dave.

Hugh
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Fix crash with FLAT_MEMORY and ARCH_PFN_OFFSET != 0

2007-12-20 Thread Mel Gorman

On (20/12/07 13:43), Thomas Bogendoerfer didst pronounce:
 On Thu, Dec 20, 2007 at 11:44:06AM +, Mel Gorman wrote:
  --- a/include/asm-mips/page.h
  +++ b/include/asm-mips/page.h
  @@ -37,13 +37,6 @@
   #include linux/pfn.h
   #include asm/io.h
   
  -/*
  - * It's normally defined only for FLATMEM config but it's
  - * used in our early mem init code for all memory models.
  - * So always define it.
  - */
  -#define ARCH_PFN_OFFSETPFN_UP(PHYS_OFFSET)
  -
 
 hmm, doesn't this break what I've fixed ? Without this #define
 ARCH_PFN_OFFSET gets defined to 0 and the bug is back. Or did
 I miss anything ?
 

ARCH_PFN_OFFSET goes to 0, so page_to_pfn() is no longer adjusting by
PFN_UP(PHYS_OFFSET) like it was when your problem occurred. I am guessing
that the nature of the crash was that page_to_pfn() was returning bogus
values early in boot and trying to initialise memmap that didn't exist.

-- 
Mel Gorman
Part-time Phd Student  Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/6] udf: fix signedness issue

2007-12-20 Thread Jan Kara

On Wed 19-12-07 20:27:20, Marcin Slusarz wrote:
 On Mon, Dec 17, 2007 at 05:32:17PM +0100, Jan Kara wrote:
   sparse generated:
   fs/udf/namei.c:896:15: originally declared here
   fs/udf/namei.c:1147:41: warning: incorrect type in argument 3 (different 
   signedness)
   fs/udf/namei.c:1147:41:expected int *offset
   fs/udf/namei.c:1147:41:got unsigned int *noident
   fs/udf/namei.c:1152:78: warning: incorrect type in argument 3 (different 
   signedness)
   fs/udf/namei.c:1152:78:expected int *offset
   fs/udf/namei.c:1152:78:got unsigned int *noident
  
   Signed-off-by: Marcin Slusarz [EMAIL PROTECTED]
I don't think this is right. udf_get_fileident() should take unsigned
  int * as an offset, not just int. This means changing struct
  udf_fileident_bh to use unsigned int too but that is better anyway.
And BTW the type shouldn't be uint32_t but really unsigned int in
  udf_rename (int needn't have 32 bits on all archs (although I think it
  has currently)).
 That would be hard. Look what is happening with soffset and eoffset
 eg in udf_fileident_read() - these fields are used as signed ints.
  Doh, you're right. soffset can go below 0. OK, then your patch is fine.
You can add there
  Acked-by: Jan Kara [EMAIL PROTECTED]

  BTW: If you haven't got an email from Andrew about accepting the patches
we agreed upon, then please resend the patches to him (with additional
Acked-by: Jan Kara [EMAIL PROTECTED] and CC me) so that they don't get lost.
Thanks.

Honza
-- 
Jan Kara [EMAIL PROTECTED]
SUSE Labs, CR
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/16] lguest: introduce vcpu structure

this patch makes room for the vcpu structure in lguest, already used in
this very same way at lguest64. It's the first part of our plan to
have lguest and lguest64 unified too. 

When two dogs hang out, you don't have new puppies the very next day.
Some time has to elapse. They have to grow first. In this same spirit,
having these patches does _not_ mean smp guests can be launched (yet).
Much more work is to come, but this is the basic infrastructure.

Enjoy


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 01/16] introduce vcpu struct

this patch introduces a vcpu struct for lguest. In upcoming patches,
more and more fields will be moved from the lguest struct to the vcpu struct.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/lg.h |   15 +++
 1 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 8692489..9723732 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -38,6 +38,13 @@ struct lguest_pages
 #define CHANGED_GDT_TLS4 /* Actually a subset of CHANGED_GDT */
 #define CHANGED_ALL3
 
+struct lguest;
+
+struct lguest_vcpu {
+   int vcpu_id;
+   struct lguest *lg;
+};
+
 /* The private info the thread maintains about the guest. */
 struct lguest
 {
@@ -47,6 +54,9 @@ struct lguest
struct lguest_data __user *lguest_data;
struct task_struct *tsk;
struct mm_struct *mm;   /* == tsk-mm, but that becomes NULL on exit */
+   struct lguest_vcpu vcpus[NR_CPUS];
+   unsigned int nr_vcpus;
+
u32 pfn_limit;
/* This provides the offset to the base of guest-physical
 * memory in the Launcher. */
@@ -92,6 +102,11 @@ struct lguest
DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
 };
 
+static inline struct lguest *lg_of_vcpu(struct lguest_vcpu *vcpu)
+{
+   return container_of((vcpu - vcpu-vcpu_id), struct lguest, vcpus[0]);
+}
+
 extern struct mutex lguest_lock;
 
 /* core.c: */
-- 
1.5.0.6

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 03/16] initialize vcpu

This patch initializes the first vcpu in the initialize() routine,
which is responsible for starting the process of bringing the guest up.
Right now, as most of the fields are still not per-vcpu, it does not
do much.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/lguest_user.c |   17 +
 1 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 3b92a61..d1b1c26 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -88,6 +88,17 @@ static ssize_t read(struct file *file, char __user *user, 
size_t size,loff_t*o)
return run_guest(lg, (unsigned long __user *)user);
 }
 
+static int vcpu_start(struct lguest_vcpu *vcpu, int vcpu_id,
+ unsigned long start_ip)
+{
+   vcpu-vcpu_id = vcpu_id;
+
+   vcpu-lg = container_of((vcpu - vcpu_id), struct lguest, vcpus[0]);
+   vcpu-lg-nr_vcpus++;
+
+   return 0;
+}
+
 /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
  * values (in addition to the LHREQ_INITIALIZE value).  These are:
  *
@@ -134,6 +145,12 @@ static int initialize(struct file *file, const unsigned 
long __user *input)
lg-mem_base = (void __user *)(long)args[0];
lg-pfn_limit = args[1];
 
+   /* This is the first cpu */
+   lg-nr_vcpus = 0;
+   err = vcpu_start(lg-vcpus[0], 0, args[3]);
+   if (err)
+   goto release_guest;
+
/* We need a complete page for the Guest registers: they are accessible
 * to the Guest and we can only grant it access to whole pages. */
lg-regs_page = get_zeroed_page(GFP_KERNEL);
-- 
1.5.0.6

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 02/16] adapt lguest launcher to per-cpuness

This patch makes use of pread() and pwrite() in the lguest launcher
to communicate the vcpu id to the lguest driver. The id is kept in
a thread-local variable, which means that in the future we'll spawn
vcpus as threads. But right now, only the infrastructure is in place.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 Documentation/lguest/lguest.c |   24 +---
 1 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 9b0e322..c406ba9 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -79,6 +79,9 @@ static void *guest_base;
 /* The maximum guest physical address allowed, and maximum possible. */
 static unsigned long guest_limit, guest_max;
 
+/* a per-cpu variable indicating whose vcpu is currently running */
+static unsigned int __thread vcpu_id;
+
 /* This is our list of devices. */
 struct device_list
 {
@@ -554,7 +557,7 @@ static void wake_parent(int pipefd, int lguest_fd)
else
FD_CLR(-fd - 1, devices.infds);
} else /* Send LHREQ_BREAK command. */
-   write(lguest_fd, args, sizeof(args));
+   pwrite(lguest_fd, args, sizeof(args), 0);
}
 }
 
@@ -1511,7 +1514,8 @@ static void __attribute__((noreturn)) run_guest(int 
lguest_fd)
int readval;
 
/* We read from the /dev/lguest device to run the Guest. */
-   readval = read(lguest_fd, notify_addr, sizeof(notify_addr));
+   readval = pread(lguest_fd, notify_addr,
+   sizeof(notify_addr), vcpu_id);
 
/* One unsigned long means the Guest did HCALL_NOTIFY */
if (readval == sizeof(notify_addr)) {
@@ -1521,17 +1525,22 @@ static void __attribute__((noreturn)) run_guest(int 
lguest_fd)
/* ENOENT means the Guest died.  Reading tells us why. */
} else if (errno == ENOENT) {
char reason[1024] = { 0 };
-   read(lguest_fd, reason, sizeof(reason)-1);
+   pread(lguest_fd, reason, sizeof(reason)-1, vcpu_id);
errx(1, %s, reason);
/* EAGAIN means the Waker wanted us to look at some input.
 * Anything else means a bug or incompatible change. */
} else if (errno != EAGAIN)
err(1, Running guest failed);
 
-   /* Service input, then unset the BREAK to release the Waker. */
-   handle_input(lguest_fd);
-   if (write(lguest_fd, args, sizeof(args))  0)
-   err(1, Resetting break);
+   if (!vcpu_id) {
+   /*
+* Service input, then unset the BREAK to
+* release the Waker.
+*/
+   handle_input(lguest_fd);
+   if (pwrite(lguest_fd, args, sizeof(args), 0)  0)
+   err(1, Resetting break);
+   }
}
 }
 /*
@@ -1582,6 +1591,7 @@ int main(int argc, char *argv[])
devices.lastdev = devices.dev;
devices.next_irq = 1;
 
+   vcpu_id = 0;
/* We need to know how much memory so we can set up the device
 * descriptor and memory pages for the devices as we parse the command
 * line.  So we quickly look through the arguments to find the amount
-- 
1.5.0.6

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 04/16] per-cpu run guest

This patch makes the run_guest() routine use the vcpu struct.
This is required since, in an smp guest environment, there is no
longer the notion of running the guest; rather, we run a vcpu.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/core.c|6 --
 drivers/lguest/lg.h  |4 ++--
 drivers/lguest/lguest_user.c |6 +-
 drivers/lguest/x86/core.c|   16 +++-
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cb4c670..70fc65e 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -174,8 +174,10 @@ void __lgwrite(struct lguest *lg, unsigned long addr, 
const void *b,
 /*H:030 Let's jump straight to the the main loop which runs the Guest.
  * Remember, this is called by the Launcher reading /dev/lguest, and we keep
  * going around and around until something interesting happens. */
-int run_guest(struct lguest *lg, unsigned long __user *user)
+int run_guest(struct lguest_vcpu *vcpu, unsigned long __user *user)
 {
+   struct lguest *lg = vcpu-lg;
+
/* We stop running once the Guest is dead. */
while (!lg-dead) {
/* First we run any hypercalls the Guest wants done. */
@@ -226,7 +228,7 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
local_irq_disable();
 
/* Actually run the Guest until something happens. */
-   lguest_arch_run_guest(lg);
+   lguest_arch_run_guest(vcpu);
 
/* Now we're ready to be interrupted or moved to other CPUs */
local_irq_enable();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 9723732..c4a0a97 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -131,7 +131,7 @@ void __lgwrite(struct lguest *, unsigned long, const void 
*, unsigned);
} while(0)
 /* (end of memory access helper routines) :*/
 
-int run_guest(struct lguest *lg, unsigned long __user *user);
+int run_guest(struct lguest_vcpu *vcpu, unsigned long __user *user);
 
 /* Helper macros to obtain the first 12 or the last 20 bits, this is only the
  * first step in the migration to the kernel types.  pte_pfn is already defined
@@ -182,7 +182,7 @@ void page_table_guest_data_init(struct lguest *lg);
 /* arch/core.c: */
 void lguest_arch_host_init(void);
 void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lguest *lg);
+void lguest_arch_run_guest(struct lguest_vcpu *vcpu);
 void lguest_arch_handle_trap(struct lguest *lg);
 int lguest_arch_init_hypercalls(struct lguest *lg);
 int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index d1b1c26..894d530 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -55,11 +55,15 @@ static int user_send_irq(struct lguest *lg, const unsigned 
long __user *input)
 static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 {
struct lguest *lg = file-private_data;
+   struct lguest_vcpu *vcpu = NULL;
+   unsigned int vcpu_id = *o;
 
/* You must write LHREQ_INITIALIZE first! */
if (!lg)
return -EINVAL;
 
+   vcpu = lg-vcpus[vcpu_id];
+
/* If you're not the task which owns the Guest, go away. */
if (current != lg-tsk)
return -EPERM;
@@ -85,7 +89,7 @@ static ssize_t read(struct file *file, char __user *user, 
size_t size,loff_t*o)
lg-pending_notify = 0;
 
/* Run the Guest until something interesting happens. */
-   return run_guest(lg, (unsigned long __user *)user);
+   return run_guest(vcpu, (unsigned long __user *)user);
 }
 
 static int vcpu_start(struct lguest_vcpu *vcpu, int vcpu_id,
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 482aec2..0530ef3 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -73,8 +73,10 @@ static DEFINE_PER_CPU(struct lguest *, last_guest);
  * since it last ran.  We saw this set in interrupts_and_traps.c and
  * segments.c.
  */
-static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
+static void copy_in_guest_info(struct lguest_vcpu *vcpu,
+  struct lguest_pages *pages)
 {
+   struct lguest *lg = vcpu-lg;
/* Copying all this data can be quite expensive.  We usually run the
 * same Guest we ran last time (and that Guest hasn't run anywhere else
 * meanwhile).  If that's not the case, we pretend everything in the
@@ -113,14 +115,16 @@ static void copy_in_guest_info(struct lguest *lg, struct 
lguest_pages *pages)
 }
 
 /* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
+static void run_guest_once(struct lguest_vcpu *vcpu,
+  struct

[PATCH 05/16] make write() operation smp aware

This patch makes the write() file operation smp aware, which means receiving
the vcpu_id value through the offset parameter and knowing which
vcpu we're talking to.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/lguest_user.c |   11 +--
 1 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 894d530..ae5bf4c 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -223,14 +223,21 @@ static ssize_t write(struct file *file, const char __user 
*in,
struct lguest *lg = file-private_data;
const unsigned long __user *input = (const unsigned long __user *)in;
unsigned long req;
+   struct lguest_vcpu *vcpu = NULL;
+   int vcpu_id = *off;
 
if (get_user(req, input) != 0)
return -EFAULT;
input++;
 
/* If you haven't initialized, you must do that first. */
-   if (req != LHREQ_INITIALIZE  !lg)
-   return -EINVAL;
+   if (req != LHREQ_INITIALIZE) {
+   if (!lg)
+   return -EINVAL;
+   vcpu = lg-vcpus[vcpu_id];
+   if (!vcpu)
+   return -EINVAL;
+   }
 
/* Once the Guest is dead, all you can do is read() why it died. */
if (lg  lg-dead)
-- 
1.5.0.6

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 06/16] make hypercalls use the vcpu struct

This patch changes the do_hcall() and do_async_hcalls() interfaces (and
obviously their callers) to take a vcpu struct. Again, a vcpu services the
hypercall, not the whole guest.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/core.c   |6 +++---
 drivers/lguest/hypercalls.c |   42 +++---
 drivers/lguest/lg.h |   16 
 drivers/lguest/x86/core.c   |   16 ++--
 4 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 70fc65e..ef35e02 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -181,8 +181,8 @@ int run_guest(struct lguest_vcpu *vcpu, unsigned long 
__user *user)
/* We stop running once the Guest is dead. */
while (!lg-dead) {
/* First we run any hypercalls the Guest wants done. */
-   if (lg-hcall)
-   do_hypercalls(lg);
+   if (vcpu-hcall)
+   do_hypercalls(vcpu);
 
/* It's possible the Guest did a NOTIFY hypercall to the
 * Launcher, in which case we return from the read() now. */
@@ -234,7 +234,7 @@ int run_guest(struct lguest_vcpu *vcpu, unsigned long 
__user *user)
local_irq_enable();
 
/* Now we deal with whatever happened to the Guest. */
-   lguest_arch_handle_trap(lg);
+   lguest_arch_handle_trap(vcpu);
}
 
/* The Guest is dead = No such file or directory */
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index b478aff..62da355 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -29,8 +29,10 @@
 
 /*H:120 This is the core hypercall routine: where the Guest gets what it wants.
  * Or gets killed.  Or, in the case of LHCALL_CRASH, both. */
-static void do_hcall(struct lguest *lg, struct hcall_args *args)
+static void do_hcall(struct lguest_vcpu *vcpu, struct hcall_args *args)
 {
+   struct lguest *lg = vcpu-lg;
+
switch (args-arg0) {
case LHCALL_FLUSH_ASYNC:
/* This call does nothing, except by breaking out of the Guest
@@ -91,7 +93,7 @@ static void do_hcall(struct lguest *lg, struct hcall_args 
*args)
break;
default:
/* It should be an architecture-specific hypercall. */
-   if (lguest_arch_do_hcall(lg, args))
+   if (lguest_arch_do_hcall(vcpu, args))
kill_guest(lg, Bad hypercall %li\n, args-arg0);
}
 }
@@ -104,10 +106,11 @@ static void do_hcall(struct lguest *lg, struct hcall_args 
*args)
  * Guest put them in the ring, but we also promise the Guest that they will
  * happen before any normal hypercall (which is why we check this before
  * checking for a normal hcall). */
-static void do_async_hcalls(struct lguest *lg)
+static void do_async_hcalls(struct lguest_vcpu *vcpu)
 {
unsigned int i;
u8 st[LHCALL_RING_SIZE];
+   struct lguest *lg = vcpu-lg;
 
/* For simplicity, we copy the entire call status array in at once. */
if (copy_from_user(st, lg-lguest_data-hcall_status, sizeof(st)))
@@ -119,7 +122,7 @@ static void do_async_hcalls(struct lguest *lg)
/* We remember where we were up to from last time.  This makes
 * sure that the hypercalls are done in the order the Guest
 * places them in the ring. */
-   unsigned int n = lg-next_hcall;
+   unsigned int n = vcpu-next_hcall;
 
/* 0xFF means there's no call here (yet). */
if (st[n] == 0xFF)
@@ -127,8 +130,8 @@ static void do_async_hcalls(struct lguest *lg)
 
/* OK, we have hypercall.  Increment the next_hcall cursor,
 * and wrap back to 0 if we reach the end. */
-   if (++lg-next_hcall == LHCALL_RING_SIZE)
-   lg-next_hcall = 0;
+   if (++vcpu-next_hcall == LHCALL_RING_SIZE)
+   vcpu-next_hcall = 0;
 
/* Copy the hypercall arguments into a local copy of
 * the hcall_args struct. */
@@ -139,7 +142,7 @@ static void do_async_hcalls(struct lguest *lg)
}
 
/* Do the hypercall, same as a normal one. */
-   do_hcall(lg, args);
+   do_hcall(vcpu, args);
 
/* Mark the hypercall done. */
if (put_user(0xFF, lg-lguest_data-hcall_status[n])) {
@@ -156,16 +159,17 @@ static void do_async_hcalls(struct lguest *lg)
 
 /* Last of all, we look at what happens first of all.  The very first time the
  * Guest makes a hypercall, we end up here to set things up: */
-static void initialize(struct lguest *lg)
+static void initialize(struct lguest_vcpu *vcpu)
 {
+   struct lguest *lg = vcpu-lg;
/* You can't do anything until you're initialized.  The Guest

[PATCH 07/16] per-vcpu lguest timers

Here, I introduce per-vcpu timers. With this, we can have
local expiries, needed for accounting time in smp guests

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/hypercalls.c   |2 +-
 drivers/lguest/interrupts_and_traps.c |   20 ++--
 drivers/lguest/lg.h   |   10 +-
 drivers/lguest/lguest_user.c  |   12 +++-
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 62da355..4364bc2 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -78,7 +78,7 @@ static void do_hcall(struct lguest_vcpu *vcpu, struct 
hcall_args *args)
guest_set_pmd(lg, args-arg1, args-arg2);
break;
case LHCALL_SET_CLOCKEVENT:
-   guest_set_clockevent(lg, args-arg1);
+   guest_set_clockevent(vcpu, args-arg1);
break;
case LHCALL_TS:
/* This sets the TS flag, as we saw used in run_guest(). */
diff --git a/drivers/lguest/interrupts_and_traps.c 
b/drivers/lguest/interrupts_and_traps.c
index 2b66f79..189d66e 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -470,13 +470,13 @@ void copy_traps(const struct lguest *lg, struct 
desc_struct *idt,
  * infrastructure to set a callback at that time.
  *
  * 0 means turn off the clock. */
-void guest_set_clockevent(struct lguest *lg, unsigned long delta)
+void guest_set_clockevent(struct lguest_vcpu *vcpu, unsigned long delta)
 {
ktime_t expires;
 
if (unlikely(delta == 0)) {
/* Clock event device is shutting down. */
-   hrtimer_cancel(lg-hrt);
+   hrtimer_cancel(vcpu-hrt);
return;
}
 
@@ -484,25 +484,25 @@ void guest_set_clockevent(struct lguest *lg, unsigned 
long delta)
 * all the time between now and the timer interrupt it asked for.  This
 * is almost always the right thing to do. */
expires = ktime_add_ns(ktime_get_real(), delta);
-   hrtimer_start(lg-hrt, expires, HRTIMER_MODE_ABS);
+   hrtimer_start(vcpu-hrt, expires, HRTIMER_MODE_ABS);
 }
 
 /* This is the function called when the Guest's timer expires. */
 static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
 {
-   struct lguest *lg = container_of(timer, struct lguest, hrt);
+   struct lguest_vcpu *vcpu = container_of(timer, struct lguest_vcpu, hrt);
 
/* Remember the first interrupt is the timer interrupt. */
-   set_bit(0, lg-irqs_pending);
+   set_bit(0, vcpu-lg-irqs_pending);
/* If the Guest is actually stopped, we need to wake it up. */
-   if (lg-halted)
-   wake_up_process(lg-tsk);
+   if (vcpu-lg-halted)
+   wake_up_process(vcpu-lg-tsk);
return HRTIMER_NORESTART;
 }
 
 /* This sets up the timer for this Guest. */
-void init_clockdev(struct lguest *lg)
+void init_clockdev(struct lguest_vcpu *vcpu)
 {
-   hrtimer_init(lg-hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
-   lg-hrt.function = clockdev_fn;
+   hrtimer_init(vcpu-hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+   vcpu-hrt.function = clockdev_fn;
 }
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 696cdf1..0205409 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -47,6 +47,9 @@ struct lguest_vcpu {
/* If a hypercall was asked for, this points to the arguments. */
struct hcall_args *hcall;
u32 next_hcall;
+
+   /* Virtual clock device */
+   struct hrtimer hrt;
 };
 
 /* The private info the thread maintains about the guest. */
@@ -95,9 +98,6 @@ struct lguest
 
struct lguest_arch arch;
 
-   /* Virtual clock device */
-   struct hrtimer hrt;
-
/* Pending virtual interrupts */
DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
 };
@@ -150,8 +150,8 @@ void setup_default_idt_entries(struct lguest_ro_state 
*state,
   const unsigned long *def);
 void copy_traps(const struct lguest *lg, struct desc_struct *idt,
const unsigned long *def);
-void guest_set_clockevent(struct lguest *lg, unsigned long delta);
-void init_clockdev(struct lguest *lg);
+void guest_set_clockevent(struct lguest_vcpu *vcpu, unsigned long delta);
+void init_clockdev(struct lguest_vcpu *vcpu);
 bool check_syscall_vector(struct lguest *lg);
 int init_interrupts(void);
 void free_interrupts(void);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ae5bf4c..7481e82 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -97,6 +97,9 @@ static int vcpu_start(struct lguest_vcpu *vcpu, int vcpu_id,
 {
vcpu-vcpu_id = vcpu_id;
 
+   /* The timer for lguest's clock needs initialization. */
+   init_clockdev(vcpu);
+
vcpu-lg = container_of((vcpu - vcpu_id), struct lguest, vcpus[0]);

[PATCH 08/16] per-vcpu interrupt processing.

This patch adapts interrupt processing for using the vcpu struct.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/core.c |2 +-
 drivers/lguest/interrupts_and_traps.c |   25 ++---
 drivers/lguest/lg.h   |   10 +-
 drivers/lguest/lguest_user.c  |7 ---
 drivers/lguest/x86/core.c |2 +-
 5 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index ef35e02..4d0102d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -203,7 +203,7 @@ int run_guest(struct lguest_vcpu *vcpu, unsigned long 
__user *user)
/* Check if there are any interrupts which can be delivered
 * now: if so, this sets up the hander to be executed when we
 * next run the Guest. */
-   maybe_do_interrupt(lg);
+   maybe_do_interrupt(vcpu);
 
/* All long-lived kernel loops need to check with this horrible
 * thing called the freezer.  If the Host is trying to suspend,
diff --git a/drivers/lguest/interrupts_and_traps.c 
b/drivers/lguest/interrupts_and_traps.c
index 189d66e..db440cb 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -60,11 +60,13 @@ static void push_guest_stack(struct lguest *lg, unsigned 
long *gstack, u32 val)
  * We set up the stack just like the CPU does for a real interrupt, so it's
  * identical for the Guest (and the standard iret instruction will undo
  * it). */
-static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
+static void set_guest_interrupt(struct lguest_vcpu *vcpu, u32 lo, u32 hi,
+   int has_err)
 {
unsigned long gstack, origstack;
u32 eflags, ss, irq_enable;
unsigned long virtstack;
+   struct lguest *lg = vcpu-lg;
 
/* There are two cases for interrupts: one where the Guest is already
 * in the kernel, and a more complex one where the Guest is in
@@ -129,9 +131,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, 
u32 hi, int has_err)
  *
  * maybe_do_interrupt() gets called before every entry to the Guest, to see if
  * we should divert the Guest to running an interrupt handler. */
-void maybe_do_interrupt(struct lguest *lg)
+void maybe_do_interrupt(struct lguest_vcpu *vcpu)
 {
unsigned int irq;
+   struct lguest *lg = vcpu-lg;
DECLARE_BITMAP(blk, LGUEST_IRQS);
struct desc_struct *idt;
 
@@ -145,7 +148,7 @@ void maybe_do_interrupt(struct lguest *lg)
   sizeof(blk)))
return;
 
-   bitmap_andnot(blk, lg-irqs_pending, blk, LGUEST_IRQS);
+   bitmap_andnot(blk, vcpu-irqs_pending, blk, LGUEST_IRQS);
 
/* Find the first interrupt. */
irq = find_first_bit(blk, LGUEST_IRQS);
@@ -180,11 +183,11 @@ void maybe_do_interrupt(struct lguest *lg)
/* If they don't have a handler (yet?), we just ignore it */
if (idt_present(idt-a, idt-b)) {
/* OK, mark it no longer pending and deliver it. */
-   clear_bit(irq, lg-irqs_pending);
+   clear_bit(irq, vcpu-irqs_pending);
/* set_guest_interrupt() takes the interrupt descriptor and a
 * flag to say whether this interrupt pushes an error code onto
 * the stack as well: virtual interrupts never do. */
-   set_guest_interrupt(lg, idt-a, idt-b, 0);
+   set_guest_interrupt(vcpu, idt-a, idt-b, 0);
}
 
/* Every time we deliver an interrupt, we update the timestamp in the
@@ -245,19 +248,19 @@ static int has_err(unsigned int trap)
 }
 
 /* deliver_trap() returns true if it could deliver the trap. */
-int deliver_trap(struct lguest *lg, unsigned int num)
+int deliver_trap(struct lguest_vcpu *vcpu, unsigned int num)
 {
/* Trap numbers are always 8 bit, but we set an impossible trap number
 * for traps inside the Switcher, so check that here. */
-   if (num = ARRAY_SIZE(lg-arch.idt))
+   if (num = ARRAY_SIZE(vcpu-lg-arch.idt))
return 0;
 
/* Early on the Guest hasn't set the IDT entries (or maybe it put a
 * bogus one in): if we fail here, the Guest will be killed. */
-   if (!idt_present(lg-arch.idt[num].a, lg-arch.idt[num].b))
+   if (!idt_present(vcpu-lg-arch.idt[num].a, vcpu-lg-arch.idt[num].b))
return 0;
-   set_guest_interrupt(lg, lg-arch.idt[num].a, lg-arch.idt[num].b,
-   has_err(num));
+   set_guest_interrupt(vcpu, vcpu-lg-arch.idt[num].a,
+   vcpu-lg-arch.idt[num].b, has_err(num));
return 1;
 }
 
@@ -493,7 +496,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer 
*timer)
struct lguest_vcpu *vcpu = container_of(timer, struct lguest_vcpu, hrt);

[PATCH 09/16] map_switcher_in_guest() per-vcpu

The switcher needs to be mapped per-vcpu, because different vcpus
will potentially have different page tables (they don't have to,
because threads will share the same).

So our first step is to make the function receive a vcpu struct.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/lg.h  |3 ++-
 drivers/lguest/page_tables.c |4 +++-
 drivers/lguest/x86/core.c|2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index db2edd6..f6e9020 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -173,7 +173,8 @@ void guest_pagetable_clear_all(struct lguest *lg);
 void guest_pagetable_flush_user(struct lguest *lg);
 void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
   unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
+void map_switcher_in_guest(struct lguest_vcpu *vcpu,
+  struct lguest_pages *pages);
 int demand_page(struct lguest *info, unsigned long cr2, int errcode);
 void pin_page(struct lguest *lg, unsigned long vaddr);
 unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index fffabb3..7fb8627 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -634,8 +634,10 @@ void free_guest_pagetable(struct lguest *lg)
  * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
  * for each CPU already set up, we just need to hook them in now we know which
  * Guest is about to run on this CPU. */
-void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
+void map_switcher_in_guest(struct lguest_vcpu *vcpu,
+  struct lguest_pages *pages)
 {
+   struct lguest *lg = vcpu-lg;
pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
pgd_t switcher_pgd;
pte_t regs_pte;
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 3d21c6d..9bf2213 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -92,7 +92,7 @@ static void copy_in_guest_info(struct lguest_vcpu *vcpu,
pages-state.host_cr3 = __pa(current-mm-pgd);
/* Set up the Guest's page tables to see this CPU's pages (and no
 * other CPU's pages). */
-   map_switcher_in_guest(lg, pages);
+   map_switcher_in_guest(vcpu, pages);
/* Set up the two TSS members which tell the CPU what stack to use
 * for traps which do directly into the Guest (ie. traps at privilege
 * level 1). */
-- 
1.5.0.6

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 10/16] make emulate_insn receive a vcpu struct.

emulate_insn() needs to know about current eip, which will be,
in the future, a per-vcpu thing. So in this patch, the function
prototype is modified to receive a vcpu struct

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/x86/core.c |5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 9bf2213..2fb9cd3 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -220,8 +220,9 @@ void lguest_arch_run_guest(struct lguest_vcpu *vcpu)
  * When the Guest uses one of these instructions, we get a trap (General
  * Protection Fault) and come here.  We see if it's one of those troublesome
  * instructions and skip over it.  We return true if we did. */
-static int emulate_insn(struct lguest *lg)
+static int emulate_insn(struct lguest_vcpu *vcpu)
 {
+   struct lguest *lg = vcpu-lg;
u8 insn;
unsigned int insnlen = 0, in = 0, shift = 0;
/* The eip contains the *virtual* address of the Guest's instruction:
@@ -294,7 +295,7 @@ void lguest_arch_handle_trap(struct lguest_vcpu *vcpu)
 * instructions which we need to emulate.  If so, we just go
 * back into the Guest after we've done it. */
if (lg-regs-errcode == 0) {
-   if (emulate_insn(lg))
+   if (emulate_insn(vcpu))
return;
}
break;
-- 
1.5.0.6

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 12/16] replace lguest_arch with lguest_vcpu_arch.

The fields found in lguest_arch are not really per-guest,
but per-cpu (gdt, idt, etc). So this patch turns lguest_arch
into lguest_vcpu_arch.

It makes sense to have a per-guest per-arch struct, but this
can be addressed later, when the need arises.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/interrupts_and_traps.c |   29 +++--
 drivers/lguest/lg.h   |   19 +++---
 drivers/lguest/segments.c |   43 +---
 drivers/lguest/x86/core.c |   24 --
 include/asm-x86/lguest.h  |2 +-
 5 files changed, 60 insertions(+), 57 deletions(-)

diff --git a/drivers/lguest/interrupts_and_traps.c 
b/drivers/lguest/interrupts_and_traps.c
index 1ceff5f..b3d444a 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -180,7 +180,7 @@ void maybe_do_interrupt(struct lguest_vcpu *vcpu)
/* Look at the IDT entry the Guest gave us for this interrupt.  The
 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
 * over them. */
-   idt = lg-arch.idt[FIRST_EXTERNAL_VECTOR+irq];
+   idt = vcpu-arch.idt[FIRST_EXTERNAL_VECTOR+irq];
/* If they don't have a handler (yet?), we just ignore it */
if (idt_present(idt-a, idt-b)) {
/* OK, mark it no longer pending and deliver it. */
@@ -253,15 +253,15 @@ int deliver_trap(struct lguest_vcpu *vcpu, unsigned int 
num)
 {
/* Trap numbers are always 8 bit, but we set an impossible trap number
 * for traps inside the Switcher, so check that here. */
-   if (num = ARRAY_SIZE(vcpu-lg-arch.idt))
+   if (num = ARRAY_SIZE(vcpu-arch.idt))
return 0;
 
/* Early on the Guest hasn't set the IDT entries (or maybe it put a
 * bogus one in): if we fail here, the Guest will be killed. */
-   if (!idt_present(vcpu-lg-arch.idt[num].a, vcpu-lg-arch.idt[num].b))
+   if (!idt_present(vcpu-arch.idt[num].a, vcpu-arch.idt[num].b))
return 0;
-   set_guest_interrupt(vcpu, vcpu-lg-arch.idt[num].a,
-   vcpu-lg-arch.idt[num].b, has_err(num));
+   set_guest_interrupt(vcpu, vcpu-arch.idt[num].a,
+   vcpu-arch.idt[num].b, has_err(num));
return 1;
 }
 
@@ -387,7 +387,8 @@ static void set_trap(struct lguest *lg, struct desc_struct 
*trap,
  *
  * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
  * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */
-void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
+void load_guest_idt_entry(struct lguest_vcpu *vcpu,
+ unsigned int num, u32 lo, u32 hi)
 {
/* Guest never handles: NMI, doublefault, spurious interrupt or
 * hypercall.  We ignore when it tries to set them. */
@@ -396,13 +397,13 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int 
num, u32 lo, u32 hi)
 
/* Mark the IDT as changed: next time the Guest runs we'll know we have
 * to copy this again. */
-   lg-changed |= CHANGED_IDT;
+   vcpu-lg-changed |= CHANGED_IDT;
 
/* Check that the Guest doesn't try to step outside the bounds. */
-   if (num = ARRAY_SIZE(lg-arch.idt))
-   kill_guest(lg, Setting idt entry %u, num);
+   if (num = ARRAY_SIZE(vcpu-arch.idt))
+   kill_guest(vcpu-lg, Setting idt entry %u, num);
else
-   set_trap(lg, lg-arch.idt[num], num, lo, hi);
+   set_trap(vcpu-lg, vcpu-arch.idt[num], num, lo, hi);
 }
 
 /* The default entry for each interrupt points into the Switcher routines which
@@ -438,14 +439,14 @@ void setup_default_idt_entries(struct lguest_ro_state 
*state,
 /*H:240 We don't use the IDT entries in the struct lguest directly, instead
  * we copy them into the IDT which we've set up for Guests on this CPU, just
  * before we run the Guest.  This routine does that copy. */
-void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+void copy_traps(const struct lguest_vcpu *vcpu, struct desc_struct *idt,
const unsigned long *def)
 {
unsigned int i;
 
/* We can simply copy the direct traps, otherwise we use the default
 * ones in the Switcher: they will return to the Host. */
-   for (i = 0; i  ARRAY_SIZE(lg-arch.idt); i++) {
+   for (i = 0; i  ARRAY_SIZE(vcpu-arch.idt); i++) {
/* If no Guest can ever override this trap, leave it alone. */
if (!direct_trap(i))
continue;
@@ -454,8 +455,8 @@ void copy_traps(const struct lguest *lg, struct desc_struct 
*idt,
 * Interrupt gates (type 14) disable interrupts as they are
 * entered, which we never let the Guest do.  Not present
 * entries (type 0x0) also can't go direct, of course. */
-

neigh: timer & !nud_in_timer

2007-12-20 Thread John Sigler


Hello,

I noticed the following message in my kernel log.
kernel: neigh: timer & !nud_in_timer
(Might be due to a race condition.)

I'm running a UP Linux version 2.6.22.1-rt9
( http://rt.wiki.kernel.org/index.php )

The following /proc entries might be relevant.

/proc/sys/net/ipv4/conf/all/arp_accept
0
/proc/sys/net/ipv4/conf/all/arp_announce
2
/proc/sys/net/ipv4/conf/all/arp_filter
0
/proc/sys/net/ipv4/conf/all/arp_ignore
1

I also lowered the priority of softirq-timer/0 to 10 which means
it can be interrupted by other IRQ handlers.

Regards.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 11/16] make registers per-vcpu

This is the most obvious per-vcpu field: registers.

So this patch moves them from struct lguest to struct lguest_vcpu,
and updates the places in which they are used accordingly.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/interrupts_and_traps.c |   29 ---
 drivers/lguest/lg.h   |9 ---
 drivers/lguest/lguest_user.c  |   36 +++---
 drivers/lguest/page_tables.c  |4 ++-
 drivers/lguest/x86/core.c |   39 +
 5 files changed, 61 insertions(+), 56 deletions(-)

diff --git a/drivers/lguest/interrupts_and_traps.c 
b/drivers/lguest/interrupts_and_traps.c
index db440cb..1ceff5f 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -71,7 +71,7 @@ static void set_guest_interrupt(struct lguest_vcpu *vcpu, u32 
lo, u32 hi,
/* There are two cases for interrupts: one where the Guest is already
 * in the kernel, and a more complex one where the Guest is in
 * userspace.  We check the privilege level to find out. */
-   if ((lg-regs-ss0x3) != GUEST_PL) {
+   if ((vcpu-regs-ss0x3) != GUEST_PL) {
/* The Guest told us their kernel stack with the SET_STACK
 * hypercall: both the virtual address and the segment */
virtstack = lg-esp1;
@@ -82,12 +82,12 @@ static void set_guest_interrupt(struct lguest_vcpu *vcpu, 
u32 lo, u32 hi,
 * stack: when the Guest does an iret back from the interrupt
 * handler the CPU will notice they're dropping privilege
 * levels and expect these here. */
-   push_guest_stack(lg, gstack, lg-regs-ss);
-   push_guest_stack(lg, gstack, lg-regs-esp);
+   push_guest_stack(lg, gstack, vcpu-regs-ss);
+   push_guest_stack(lg, gstack, vcpu-regs-esp);
} else {
/* We're staying on the same Guest (kernel) stack. */
-   virtstack = lg-regs-esp;
-   ss = lg-regs-ss;
+   virtstack = vcpu-regs-esp;
+   ss = vcpu-regs-ss;
 
origstack = gstack = guest_pa(lg, virtstack);
}
@@ -96,7 +96,7 @@ static void set_guest_interrupt(struct lguest_vcpu *vcpu, u32 
lo, u32 hi,
 * the Interrupt Flag bit is always set.  We copy that bit from the
 * Guest's irq_enabled field into the eflags word: we saw the Guest
 * copy it back in lguest_iret. */
-   eflags = lg-regs-eflags;
+   eflags = vcpu-regs-eflags;
if (get_user(irq_enable, lg-lguest_data-irq_enabled) == 0
 !(irq_enable  X86_EFLAGS_IF))
eflags = ~X86_EFLAGS_IF;
@@ -105,19 +105,19 @@ static void set_guest_interrupt(struct lguest_vcpu *vcpu, 
u32 lo, u32 hi,
 * eflags word, the old code segment, and the old instruction
 * pointer. */
push_guest_stack(lg, gstack, eflags);
-   push_guest_stack(lg, gstack, lg-regs-cs);
-   push_guest_stack(lg, gstack, lg-regs-eip);
+   push_guest_stack(lg, gstack, vcpu-regs-cs);
+   push_guest_stack(lg, gstack, vcpu-regs-eip);
 
/* For the six traps which supply an error code, we push that, too. */
if (has_err)
-   push_guest_stack(lg, gstack, lg-regs-errcode);
+   push_guest_stack(lg, gstack, vcpu-regs-errcode);
 
/* Now we've pushed all the old state, we change the stack, the code
 * segment and the address to execute. */
-   lg-regs-ss = ss;
-   lg-regs-esp = virtstack + (gstack - origstack);
-   lg-regs-cs = (__KERNEL_CS|GUEST_PL);
-   lg-regs-eip = idt_address(lo, hi);
+   vcpu-regs-ss = ss;
+   vcpu-regs-esp = virtstack + (gstack - origstack);
+   vcpu-regs-cs = (__KERNEL_CS|GUEST_PL);
+   vcpu-regs-eip = idt_address(lo, hi);
 
/* There are two kinds of interrupt handlers: 0xE is an interrupt
 * gate which expects interrupts to be disabled on entry. */
@@ -158,7 +158,8 @@ void maybe_do_interrupt(struct lguest_vcpu *vcpu)
 
/* They may be in the middle of an iret, where they asked us never to
 * deliver interrupts. */
-   if (lg-regs-eip = lg-noirq_start  lg-regs-eip  lg-noirq_end)
+   if ((vcpu-regs-eip = lg-noirq_start) 
+   (vcpu-regs-eip  lg-noirq_end))
return;
 
/* If they're halted, interrupts restart them. */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index f6e9020..d05fe38 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -44,6 +44,10 @@ struct lguest_vcpu {
int vcpu_id;
struct lguest *lg;
 
+   /* At end of a page shared mapped over lguest_pages in guest.  */
+   unsigned long regs_page;
+   struct lguest_regs *regs;
+
/* If a hypercall was asked for, this points to the arguments. */
struct hcall_args *hcall;
u32 next_hcall;
@@ -58,9

[PATCH 14/16] makes special fields be per-vcpu

struct lguest has room for some fields, namely cr2, ts, esp1
and ss1, that are not really guest-wide but rather vcpu-wide.

This patch moves them into the vcpu struct.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/hypercalls.c   |   10 +-
 drivers/lguest/interrupts_and_traps.c |   24 +---
 drivers/lguest/lg.h   |   18 ++
 drivers/lguest/page_tables.c  |   11 ++-
 drivers/lguest/x86/core.c |   10 --
 5 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 41ea2e2..c6b87ef 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -58,7 +58,7 @@ static void do_hcall(struct lguest_vcpu *vcpu, struct 
hcall_args *args)
/* FLUSH_TLB comes in two flavors, depending on the
 * argument: */
if (args-arg1)
-   guest_pagetable_clear_all(lg);
+   guest_pagetable_clear_all(vcpu);
else
guest_pagetable_flush_user(lg);
break;
@@ -66,10 +66,10 @@ static void do_hcall(struct lguest_vcpu *vcpu, struct 
hcall_args *args)
/* All these calls simply pass the arguments through to the right
 * routines. */
case LHCALL_NEW_PGTABLE:
-   guest_new_pagetable(lg, args-arg1);
+   guest_new_pagetable(vcpu, args-arg1);
break;
case LHCALL_SET_STACK:
-   guest_set_stack(lg, args-arg1, args-arg2, args-arg3);
+   guest_set_stack(vcpu, args-arg1, args-arg2, args-arg3);
break;
case LHCALL_SET_PTE:
guest_set_pte(lg, args-arg1, args-arg2, __pte(args-arg3));
@@ -82,7 +82,7 @@ static void do_hcall(struct lguest_vcpu *vcpu, struct 
hcall_args *args)
break;
case LHCALL_TS:
/* This sets the TS flag, as we saw used in run_guest(). */
-   lg-ts = args-arg1;
+   vcpu-ts = args-arg1;
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
@@ -189,7 +189,7 @@ static void initialize(struct lguest_vcpu *vcpu)
 * first write to a Guest page.  This may have caused a copy-on-write
 * fault, but the old page might be (read-only) in the Guest
 * pagetable. */
-   guest_pagetable_clear_all(lg);
+   guest_pagetable_clear_all(vcpu);
 }
 
 /*H:100
diff --git a/drivers/lguest/interrupts_and_traps.c 
b/drivers/lguest/interrupts_and_traps.c
index 10c9aea..78f6210 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -74,8 +74,8 @@ static void set_guest_interrupt(struct lguest_vcpu *vcpu, u32 
lo, u32 hi,
if ((vcpu-regs-ss0x3) != GUEST_PL) {
/* The Guest told us their kernel stack with the SET_STACK
 * hypercall: both the virtual address and the segment */
-   virtstack = lg-esp1;
-   ss = lg-ss1;
+   virtstack = vcpu-esp1;
+   ss = vcpu-ss1;
 
origstack = gstack = guest_pa(lg, virtstack);
/* We push the old stack segment and pointer onto the new
@@ -313,10 +313,11 @@ static int direct_trap(unsigned int num)
  * the Guest.
  *
  * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */
-void pin_stack_pages(struct lguest *lg)
+void pin_stack_pages(struct lguest_vcpu *vcpu)
 {
unsigned int i;
 
+   struct lguest *lg = vcpu-lg;
/* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
 * two pages of stack space. */
for (i = 0; i  lg-stack_pages; i++)
@@ -324,7 +325,7 @@ void pin_stack_pages(struct lguest *lg)
 * start of the page after the kernel stack.  Subtract one to
 * get back onto the first stack page, and keep subtracting to
 * get to the rest of the stack pages. */
-   pin_page(lg, lg-esp1 - 1 - i * PAGE_SIZE);
+   pin_page(lg, vcpu-esp1 - 1 - i * PAGE_SIZE);
 }
 
 /* Direct traps also mean that we need to know whenever the Guest wants to use
@@ -335,21 +336,22 @@ void pin_stack_pages(struct lguest *lg)
  *
  * In Linux each process has its own kernel stack, so this happens a lot: we
  * change stacks on each context switch. */
-void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
+void guest_set_stack(struct lguest_vcpu *vcpu, u32 seg, u32 esp,
+unsigned int pages)
 {
/* You are not allowed have a stack segment with privilege level 0: bad
 * Guest! */
if ((seg  0x3) != GUEST_PL)
-   kill_guest(lg, bad stack segment %i, seg);
+   kill_guest(vcpu-lg, bad stack segment %i, seg);
/* We only expect one or two

[PATCH 13/16] per-vcpu lguest task management

lguest uses tasks to control its running behaviour (like sending
breaks, controlling halted state, etc). In a per-vcpu environment,
each vcpu will have its own underlying task, so this patch
puts the infrastructure for that in place.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/core.c |4 +-
 drivers/lguest/hypercalls.c   |2 +-
 drivers/lguest/interrupts_and_traps.c |8 ++--
 drivers/lguest/lg.h   |   14 
 drivers/lguest/lguest_user.c  |   56 ++--
 5 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4d0102d..285a465 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -197,7 +197,7 @@ int run_guest(struct lguest_vcpu *vcpu, unsigned long 
__user *user)
return -ERESTARTSYS;
 
/* If Waker set break_out, return to Launcher. */
-   if (lg-break_out)
+   if (vcpu-break_out)
return -EAGAIN;
 
/* Check if there are any interrupts which can be delivered
@@ -217,7 +217,7 @@ int run_guest(struct lguest_vcpu *vcpu, unsigned long 
__user *user)
 
/* If the Guest asked to be stopped, we sleep.  The Guest's
 * clock timer or LHCALL_BREAK from the Waker will wake us. */
-   if (lg-halted) {
+   if (vcpu-halted) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
continue;
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 4364bc2..41ea2e2 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -86,7 +86,7 @@ static void do_hcall(struct lguest_vcpu *vcpu, struct 
hcall_args *args)
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
-   lg-halted = 1;
+   vcpu-halted = 1;
break;
case LHCALL_NOTIFY:
lg-pending_notify = args-arg1;
diff --git a/drivers/lguest/interrupts_and_traps.c 
b/drivers/lguest/interrupts_and_traps.c
index b3d444a..10c9aea 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -163,11 +163,11 @@ void maybe_do_interrupt(struct lguest_vcpu *vcpu)
return;
 
/* If they're halted, interrupts restart them. */
-   if (lg-halted) {
+   if (vcpu-halted) {
/* Re-enable interrupts. */
if (put_user(X86_EFLAGS_IF, lg-lguest_data-irq_enabled))
kill_guest(lg, Re-enabling interrupts);
-   lg-halted = 0;
+   vcpu-halted = 0;
} else {
/* Otherwise we check if they have interrupts disabled. */
u32 irq_enabled;
@@ -500,8 +500,8 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer 
*timer)
/* Remember the first interrupt is the timer interrupt. */
set_bit(0, vcpu-irqs_pending);
/* If the Guest is actually stopped, we need to wake it up. */
-   if (vcpu-lg-halted)
-   wake_up_process(vcpu-lg-tsk);
+   if (vcpu-halted)
+   wake_up_process(vcpu-tsk);
return HRTIMER_NORESTART;
 }
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index f9429ff..b23694e 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -43,6 +43,8 @@ struct lguest;
 struct lguest_vcpu {
int vcpu_id;
struct lguest *lg;
+   struct task_struct *tsk;
+   struct mm_struct *mm;   /* == tsk-mm, but that becomes NULL on exit */
 
/* At end of a page shared mapped over lguest_pages in guest.  */
unsigned long regs_page;
@@ -55,6 +57,11 @@ struct lguest_vcpu {
/* Virtual clock device */
struct hrtimer hrt;
 
+   /* Do we need to stop what we're doing and return to userspace? */
+   int break_out;
+   wait_queue_head_t break_wq;
+   int halted;
+
/* Pending virtual interrupts */
DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
 
@@ -65,8 +72,6 @@ struct lguest_vcpu {
 struct lguest
 {
struct lguest_data __user *lguest_data;
-   struct task_struct *tsk;
-   struct mm_struct *mm;   /* == tsk-mm, but that becomes NULL on exit */
struct lguest_vcpu vcpus[NR_CPUS];
unsigned int nr_vcpus;
 
@@ -76,15 +81,10 @@ struct lguest
void __user *mem_base;
unsigned long kernel_address;
u32 cr2;
-   int halted;
int ts;
u32 esp1;
u8 ss1;
 
-   /* Do we need to stop what we're doing and return to userspace? */
-   int break_out;
-   wait_queue_head_t break_wq;
-
/* Bitmap of what has changed: see CHANGED_* above. */
int changed;
struct lguest_pages *last_pages;
diff --git a/drivers/lguest/lguest_user.c

[PATCH 15/16] make pending notifications per-vcpu

This patch makes the pending_notify field, used to control
pending notifications, per-vcpu instead of per-guest.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/core.c|6 +++---
 drivers/lguest/hypercalls.c  |6 +++---
 drivers/lguest/lg.h  |3 ++-
 drivers/lguest/lguest_user.c |4 ++--
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 285a465..d628515 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -186,10 +186,10 @@ int run_guest(struct lguest_vcpu *vcpu, unsigned long 
__user *user)
 
/* It's possible the Guest did a NOTIFY hypercall to the
 * Launcher, in which case we return from the read() now. */
-   if (lg-pending_notify) {
-   if (put_user(lg-pending_notify, user))
+   if (vcpu-pending_notify) {
+   if (put_user(vcpu-pending_notify, user))
return -EFAULT;
-   return sizeof(lg-pending_notify);
+   return sizeof(vcpu-pending_notify);
}
 
/* Check for signals */
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index c6b87ef..95e1062 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -89,7 +89,7 @@ static void do_hcall(struct lguest_vcpu *vcpu, struct 
hcall_args *args)
vcpu-halted = 1;
break;
case LHCALL_NOTIFY:
-   lg-pending_notify = args-arg1;
+   vcpu-pending_notify = args-arg1;
break;
default:
/* It should be an architecture-specific hypercall. */
@@ -152,7 +152,7 @@ static void do_async_hcalls(struct lguest_vcpu *vcpu)
 
/* Stop doing hypercalls if they want to notify the Launcher:
 * it needs to service this first. */
-   if (lg-pending_notify)
+   if (vcpu-pending_notify)
break;
}
 }
@@ -217,7 +217,7 @@ void do_hypercalls(struct lguest_vcpu *vcpu)
/* If we stopped reading the hypercall ring because the Guest did a
 * NOTIFY to the Launcher, we want to return now.  Otherwise we do
 * the hypercall. */
-   if (!vcpu-lg-pending_notify) {
+   if (!vcpu-pending_notify) {
do_hcall(vcpu, vcpu-hcall);
/* Tricky point: we reset the hcall pointer to mark the
 * hypercall as done.  We use the hcall pointer rather than
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index dbf70c6..6faf90d 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -51,6 +51,8 @@ struct lguest_vcpu {
u32 esp1;
u8 ss1;
 
+   unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
+
/* At end of a page shared mapped over lguest_pages in guest.  */
unsigned long regs_page;
struct lguest_regs *regs;
@@ -95,7 +97,6 @@ struct lguest
struct pgdir pgdirs[4];
 
unsigned long noirq_start, noirq_end;
-   unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
 
unsigned int stack_pages;
u32 tsc_khz;
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index d081db4..349d69d 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -88,8 +88,8 @@ static ssize_t read(struct file *file, char __user *user, 
size_t size,loff_t*o)
 
/* If we returned from read() last time because the Guest notified,
 * clear the flag. */
-   if (lg-pending_notify)
-   lg-pending_notify = 0;
+   if (vcpu-pending_notify)
+   vcpu-pending_notify = 0;
 
/* Run the Guest until something interesting happens. */
return run_guest(vcpu, (unsigned long __user *)user);
-- 
1.5.0.6

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 16/16] per-vcpu lguest pgdir management

This patch makes the pgdir management per-vcpu. The pgdirs pool
is still guest-wide (although it will probably need to grow once we
are really running more vcpus), but the pgdidx index is gone,
since it no longer makes sense. Instead, we use a per-vcpu
index.

Signed-off-by: Glauber de Oliveira Costa [EMAIL PROTECTED]
---
 drivers/lguest/hypercalls.c   |2 +-
 drivers/lguest/interrupts_and_traps.c |6 ++--
 drivers/lguest/lg.h   |   12 +++---
 drivers/lguest/page_tables.c  |   60 +
 drivers/lguest/x86/core.c |6 ++--
 5 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 95e1062..f379475 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -60,7 +60,7 @@ static void do_hcall(struct lguest_vcpu *vcpu, struct 
hcall_args *args)
if (args-arg1)
guest_pagetable_clear_all(vcpu);
else
-   guest_pagetable_flush_user(lg);
+   guest_pagetable_flush_user(vcpu);
break;
 
/* All these calls simply pass the arguments through to the right
diff --git a/drivers/lguest/interrupts_and_traps.c 
b/drivers/lguest/interrupts_and_traps.c
index 78f6210..a0ac77e 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -77,7 +77,7 @@ static void set_guest_interrupt(struct lguest_vcpu *vcpu, u32 
lo, u32 hi,
virtstack = vcpu-esp1;
ss = vcpu-ss1;
 
-   origstack = gstack = guest_pa(lg, virtstack);
+   origstack = gstack = guest_pa(vcpu, virtstack);
/* We push the old stack segment and pointer onto the new
 * stack: when the Guest does an iret back from the interrupt
 * handler the CPU will notice they're dropping privilege
@@ -89,7 +89,7 @@ static void set_guest_interrupt(struct lguest_vcpu *vcpu, u32 
lo, u32 hi,
virtstack = vcpu-regs-esp;
ss = vcpu-regs-ss;
 
-   origstack = gstack = guest_pa(lg, virtstack);
+   origstack = gstack = guest_pa(vcpu, virtstack);
}
 
/* Remember that we never let the Guest actually disable interrupts, so
@@ -325,7 +325,7 @@ void pin_stack_pages(struct lguest_vcpu *vcpu)
 * start of the page after the kernel stack.  Subtract one to
 * get back onto the first stack page, and keep subtracting to
 * get to the rest of the stack pages. */
-   pin_page(lg, vcpu-esp1 - 1 - i * PAGE_SIZE);
+   pin_page(vcpu, vcpu-esp1 - 1 - i * PAGE_SIZE);
 }
 
 /* Direct traps also mean that we need to know whenever the Guest wants to use
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 6faf90d..e700408 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -57,6 +57,8 @@ struct lguest_vcpu {
unsigned long regs_page;
struct lguest_regs *regs;
 
+   int vcpu_pgd; /* which pgd this vcpu is currently using */
+
/* If a hypercall was asked for, this points to the arguments. */
struct hcall_args *hcall;
u32 next_hcall;
@@ -92,8 +94,6 @@ struct lguest
int changed;
struct lguest_pages *last_pages;
 
-   /* We keep a small number of these. */
-   u32 pgdidx;
struct pgdir pgdirs[4];
 
unsigned long noirq_start, noirq_end;
@@ -175,14 +175,14 @@ void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lguest_vcpu *vcpu, unsigned long pgtable);
 void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
 void guest_pagetable_clear_all(struct lguest_vcpu *vcpu);
-void guest_pagetable_flush_user(struct lguest *lg);
+void guest_pagetable_flush_user(struct lguest_vcpu *vcpu);
 void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
   unsigned long vaddr, pte_t val);
 void map_switcher_in_guest(struct lguest_vcpu *vcpu,
   struct lguest_pages *pages);
-int demand_page(struct lguest *info, unsigned long cr2, int errcode);
-void pin_page(struct lguest *lg, unsigned long vaddr);
-unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
+int demand_page(struct lguest_vcpu *vcpu, unsigned long cr2, int errcode);
+void pin_page(struct lguest_vcpu *vcpu, unsigned long vaddr);
+unsigned long guest_pa(struct lguest_vcpu *vcpu, unsigned long vaddr);
 void page_table_guest_data_init(struct lguest *lg);
 
 /* arch/core.c: */
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index f0f271d..84c22d7 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -94,10 +94,10 @@ static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, 
unsigned long vaddr)
 
 /* These two functions just like the above two, except they access the Guest
  * page tables.

Re: OOPS: 2.6.24-rc5-mm1 -- EIP is at r_show+0x2a/0x70 -- (triggered by cat /proc/iomem AFTER suspend-to-disk/resume)

2007-12-20 Thread Miles Lane

On further investigation, cat /proc/iomem does not trigger the stack 
trace until after a suspend-to-disk/resume cycle has occurred.


I am removing Ingo and Russell from the TO list (as they are apparently 
the wrong people) and adding the suspend folks, as suspend is implicated.
My .config file can be found here:  
http://marc.info/?l=linux-kernel&m=119812903001296&w=2


Miles Lane wrote:

.config attached in order to not trip spam filters.

Miles Lane wrote:
[  252.868386] BUG: unable to handle kernel NULL pointer dereference 
at virtual address 0018

[  252.868393] printing ip: c012d527 *pde = 
[  252.868399] Oops:  [#1] SMP
[  252.868403] last sysfs file: 
/sys/devices/pci:00/:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda3/stat 

[  252.868407] Modules linked in: aes_i586 aes_generic i915 drm 
rfcomm l2cap bluetooth acpi_cpufreq cpufreq_stats 
cpufreq_conservative sbs sbshc dm_crypt sbp2 parport_pc lp parport 
arc4 ecb crypto_blkcipher cryptomgr crypto_algapi snd_hda_intel 
snd_pcm_oss snd_mixer_oss pcmcia snd_pcm iTCO_wdt iTCO_vendor_support 
snd_seq_dummy watchdog_core watchdog_dev snd_seq_oss snd_seq_midi 
tifm_7xx1 snd_rawmidi iwl3945 snd_seq_midi_event rng_core tifm_core 
mac80211 snd_seq snd_timer snd_seq_device cfg80211 sky2 battery 
yenta_socket rsrc_nonstatic pcmcia_core ac snd soundcore 
snd_page_alloc button shpchp pci_hotplug sr_mod cdrom pata_acpi piix 
ide_core firewire_ohci firewire_core crc_itu_t thermal processor fan

[  252.868469]
[  252.868472] Pid: 7088, comm: head Not tainted (2.6.24-rc5-mm1 #9)
[  252.868476] EIP: 0060:[c012d527] EFLAGS: 00010297 CPU: 0
[  252.868481] EIP is at r_show+0x2a/0x70
[  252.868483] EAX:  EBX: 0001 ECX: c07e3224 EDX: c04bb034
[  252.868486] ESI: 0008 EDI: ed1f52c0 EBP: f5320f10 ESP: f5320f04
[  252.868489]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[  252.868493] Process head (pid: 7088, ti=f532 task=f532e000 
task.ti=f532)
[  252.868495] Stack: c03a6cac ed1f52c0 c07e3224 f5320f50 c0199a7e 
2000 bf930807 e1007800
[  252.868504]ed1f52e0   01d3 000e 
 000d 
[  252.868512]fffb f7d39370 c01998e4 f5320f74 c01af4f5 
f5320f9c 2000 bf930807

[  252.868521] Call Trace:
[  252.868523]  [c0107d55] show_trace_log_lvl+0x12/0x25
[  252.868529]  [c0107df2] show_stack_log_lvl+0x8a/0x95
[  252.868534]  [c0107e89] show_registers+0x8c/0x154
[  252.868538]  [c010805f] die+0x10e/0x1d2
[  252.868542]  [c039c8c9] do_page_fault+0x52b/0x600
[  252.868547]  [c039af9a] error_code+0x72/0x78
[  252.868552]  [c0199a7e] seq_read+0x19a/0x26c
[  252.868557]  [c01af4f5] proc_reg_read+0x60/0x74
[  252.868562]  [c018390d] vfs_read+0xa2/0x11e
[  252.868567]  [c0183d02] sys_read+0x3b/0x60
[  252.868571]  [c0106bae] sysenter_past_esp+0x6b/0xc1
[  252.868575]  ===
[  252.868577] Code: c3 55 89 d1 89 e5 57 89 c7 56 53 8b 50 64 83 7a 
0c 00 77 0e 81 7a 08 ff ff 00 00 be 04 00 00 00 76 05 be 08 00 00 00 
89 c8 31 db 8b 40 18 39 d0 74 06 43 83 fb 05 75 f3 8b 41 10 ba 2f 
1b 45 c0

[  252.868623] EIP: [c012d527] r_show+0x2a/0x70 SS:ESP 0068:f5320f04

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 3/3] Enable setting of IRQ-thread priorities from kernel cmdline. (repost:CC to LKML)

2007-12-20 Thread Jaswinder Singh

hello Juergen,

On 12/20/07, Juergen Beisert [EMAIL PROTECTED] wrote:
 On Thursday 20 December 2007 13:45, Jaswinder Singh wrote:
  So I am curious, if possible, user can switch softirq-threads or IRQs
  RT tasks to non-RT tasks for slow hardware or least important hardware
  for NON-RT tasks. So this will improve RT behaviour.
 ^^
 Why?

 IMHO: Simply decrease the RT priority of less important IRQs or increase all
 other more important IRQs. IRQs are always more important than other
 processes in a system, also in non RT systems.


In RT Kernel we have :-
RT Tasks and Non-RT tasks.

So we can also support :-
RT softirq-threads/IRQs and non-RT softirq-threads/IRQs

So RT softirq-threads/IRQs for RT Tasks
and non-RT softirq-threads/IRQs for non-RT Tasks.

OR also remove Non-RT tasks from RT Kernel and simply decrease the RT
priority of less important task as per your suggestions.

Thank you,

Jaswinder Singh.

 Juergen
 --
 Dipl.-Ing. Juergen Beisert | http://www.pengutronix.de
  Pengutronix - Linux Solutions for Science and Industry
 Handelsregister: Amtsgericht Hildesheim, HRA 2686
  Vertretung Sued/Muenchen, Germany
Phone: +49-8766-939 228 |  Fax: +49-5121-206917-9

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch 4/5] x86, ptrace: overflow signal API

Establish the user API for sending a user-defined signal to the traced task on 
a BTS buffer overflow.

This should complete the user API for the BTS ptrace extension.
The patches so far implement wrap-around overflow handling as is needed for 
debugging.

The remaining open item is another overflow handling mechanism that sends a
signal to the traced task on a buffer overflow.
This will take some more time from my side.

Since, from a user perspective, this occurs behind the scenes, the patch set 
should already be useful. More features may/will be added on top of it 
(overflow signal, pageable back-up buffers, kernel tracing, core file support, 
profiling, ...).


Signed-off-by: Markus Metzger [EMAIL PROTECTED]
 ---

Index: linux-2.6-x86/include/asm-x86/ptrace-abi.h
===
--- linux-2.6-x86.orig/include/asm-x86/ptrace-abi.h 2007-12-20 13:52:09.%N 
+0100
+++ linux-2.6-x86/include/asm-x86/ptrace-abi.h  2007-12-20 13:52:14.%N +0100
@@ -88,11 +88,13 @@
unsigned int size;
/* bitmask of below flags */
unsigned int flags;
+   /* buffer overflow signal */
+   unsigned int signal;
 };
 
 #define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
 #define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
-#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG? on buffer overflow
+#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIGsignal on buffer overflow
   instead of wrapping around */
 #define PTRACE_BTS_O_CUT_SIZE  0x8 /* cut requested size to max available
   instead of failing */
-
Intel GmbH
Dornacher Strasse 1
85622 Feldkirchen/Muenchen Germany
Sitz der Gesellschaft: Feldkirchen bei Muenchen
Geschaeftsfuehrer: Douglas Lusk, Peter Gleissner, Hannes Schwaderer
Registergericht: Muenchen HRB 47456 Ust.-IdNr.
VAT Registration No.: DE129385895
Citibank Frankfurt (BLZ 502 109 00) 600119052

This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH] msi: set 'En' bit of MSI Mapping Capability

2007-12-20 Thread Peer Chen

The quirk is for our Intel platform, we don't want HT MSI mapping
enabled in any of our devices.

BRs
Peer Chen

-Original Message-
From: Eric W. Biederman [mailto:[EMAIL PROTECTED] 
Sent: Wednesday, December 19, 2007 5:59 AM
To: peerchen
Cc: linux-kernel; akpm; Andy Currid; Peer Chen
Subject: Re: [PATCH] msi: set 'En' bit of MSI Mapping Capability

peerchen [EMAIL PROTECTED] writes:

 According to the HyperTransport spec, 'En' indicates whether the MSI Mapping
 is active.
 Set the 'En' bit when setting up PCI and add the quirk for some nvidia
 devices.

 The patch is based on kernel 2.6.24-rc5

Ok.  This is starting to look good.

 Signed-off-by: Andy Currid [EMAIL PROTECTED]
 Signed-off-by: Peer Chen [EMAIL PROTECTED]

 ---
 diff -uprN -X linux-2.6.24-rc5-vanilla/Documentation/dontdiff
 linux-2.6.24-rc5-vanilla/drivers/pci/probe.c
 linux-2.6.24-rc5/drivers/pci/probe.c
 --- linux-2.6.24-rc5-vanilla/drivers/pci/probe.c 2007-12-18
14:35:46.0
 -0500
 +++ linux-2.6.24-rc5/drivers/pci/probe.c 2007-12-18 16:28:29.0
-0500
 @@ -721,6 +721,9 @@ static int pci_setup_device(struct pci_d
  
   /* Unknown power state */
   dev-current_state = PCI_UNKNOWN;
 + 
 + /* Enable HT MSI mapping */
 + ht_enable_msi_mapping(dev);
  
   /* Early fixups, before probing the BARs */
   pci_fixup_device(pci_fixup_early, dev);
 diff -uprN -X linux-2.6.24-rc5-vanilla/Documentation/dontdiff
 linux-2.6.24-rc5-vanilla/drivers/pci/quirks.c
 linux-2.6.24-rc5/drivers/pci/quirks.c
 --- linux-2.6.24-rc5-vanilla/drivers/pci/quirks.c 2007-12-18
14:35:46.0
 -0500
 +++ linux-2.6.24-rc5/drivers/pci/quirks.c 2007-12-18
16:28:41.0 -0500
 @@ -1705,6 +1705,45 @@ static void __devinit quirk_nvidia_ck804
  DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA,
PCI_DEVICE_ID_NVIDIA_CK804_PCIE,
   quirk_nvidia_ck804_msi_ht_cap);
  
 +static void __devinit quirk_msi_ht_cap_disable(struct pci_dev *dev) {
 + struct pci_dev *host_bridge;
 + int pos, ttl = 48;
 +
 + /* HT MSI mapping should be disabled on devices that are below
 +  * a non-Hypertransport host bridge. Locate the host bridge...
 +  */
 +
 + if ((host_bridge = pci_get_bus_and_slot(0, PCI_DEVFN(0,0))) ==
NULL) {
 + printk(KERN_WARNING
 + PCI: quirk_msi_ht_cap_disable didn't locate host bridge\n);
 + return;
 + }
 +
 + if ((pos = pci_find_ht_capability(host_bridge, HT_CAPTYPE_SLAVE)) !=
0) {
 + /* Host bridge is to HT */
 + return;
 + }
 +
 + /* Host bridge is not to HT, disable HT MSI mapping on this
device */
 +
 + pos = pci_find_ht_capability(dev, HT_CAPTYPE_MSI_MAPPING);
 + while (pos  ttl--) {
 + u8 flags;
 +
 + if (pci_read_config_byte(dev, pos + HT_MSI_FLAGS, flags) == 0) {
 + printk(KERN_INFO PCI: Quirk disabling HT MSI mapping on %s\n,
 +pci_name(dev));
 +
 + pci_write_config_byte(dev, pos + HT_MSI_FLAGS,
 +   flags 
~HT_MSI_FLAGS_ENABLE);
 + }
 + pos = pci_find_next_ht_capability(dev, pos,
 +
HT_CAPTYPE_MSI_MAPPING);
 + }
 +}
 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
 + quirk_msi_ht_cap_disable);

Could you explain the need for this quirk?

My expectation would be that if we turned an MSI interrupt into a
hypertransport interrupt then if we hit a non-hypertransport bridge
upstream it would just turn the hypertransport interrupts into
whatever makes sense for the upstream bridge.  I can see it being
excess work and adding some latency but I don't see it being a
correctness problem. 

Or are my expectations off, and do the NVIDIA chipsets not handle
hypertransport interrupts the way I would expect, dropping them
instead of converting when going to a non-hypertransport host bridge?


  static void __devinit quirk_msi_intx_disable_bug(struct pci_dev *dev)
  {
   dev-dev_flags |= PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG;
 diff -uprN -X linux-2.6.24-rc5-vanilla/Documentation/dontdiff
 linux-2.6.24-rc5-vanilla/include/asm-generic/pci.h
 linux-2.6.24-rc5/include/asm-generic/pci.h
 --- linux-2.6.24-rc5-vanilla/include/asm-generic/pci.h 2007-12-18
 14:35:52.0 -0500
 +++ linux-2.6.24-rc5/include/asm-generic/pci.h 2007-12-18
16:29:12.0
 -0500
 @@ -45,6 +45,10 @@ pcibios_select_root(struct pci_dev *pdev
  
  #define pcibios_scan_all_fns(a, b)   0
  
 +#ifndef HAVE_ARCH_HT_ENABLE_MSI_MAPPING
 +#define ht_enable_msi_mapping(a) 0
 +#endif /* HAVE_ARCH_HT_ENABLE_MSI_MAPPING */
 +
  #ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
  static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int
channel)
  {
 diff -uprN -X linux-2.6.24-rc5-vanilla/Documentation/dontdiff
 linux-2.6.24-rc5-vanilla/include/asm-x86/pci.h
 linux-2.6.24-rc5/include/asm-x86/pci.h
 --- linux-2.6.24-rc5-vanilla/include/asm-x86/pci.h 2007-12-18
14:35:51.0
 -0500
 +++ linux-2.6.24-rc5/include/asm-x86/pci.h 2007-12-18

[patch 5/5] x86, ptrace, man: man pages for ptrace BTS extensions

Document changes for this patch set.

Signed-off-by: Markus Metzger [EMAIL PROTECTED]
---

Index: man/man2/ptrace.2
===
--- man.orig/man2/ptrace.2  2007-12-14 17:45:33.%N +0100
+++ man/man2/ptrace.2   2007-12-20 13:20:07.%N +0100
@@ -40,6 +40,9 @@
 .\PTRACE_SETSIGINFO, PTRACE_SYSEMU, PTRACE_SYSEMU_SINGLESTEP
 .\(Thanks to Blaisorblade, Daniel Jacobowitz and others who helped.)
 .\
+.\ Modified Nov 2007, Markus Metzger [EMAIL PROTECTED]
+.\ Added PTRACE_BTS_* commands
+.\
 .TH PTRACE 2 2007-11-15 Linux Linux Programmer's Manual
 .SH NAME
 ptrace \- process trace
@@ -378,6 +381,131 @@
 detached in this way regardless of which method was used to initiate
 tracing.
 (\fIaddr\fP is ignored.)
+.LP
+The following ptrace commands provide access to the hardware's last
+branch recording. They may not be available on all architectures.
+.LP
+Last branch recording stores an execution trace of the traced
+process. For every (conditional) control flow change, the source and
+destination address are stored. On some architectures, control flow
+changes inside the kernel are recorded, as well. On later
+architectures, these are automatically filtered out.
+.LP
+The buffer (called Branch Trace Store) can be configured to be either
+circular, or to send a signal to the traced task when it is about to
+overflow. Not all methods may be available on all architectures.
+.LP
+The buffer can be accessed in two ways matching the above
+configurations: either as an array of BTS records from newest
+record to older records, one record at a time; or all records at once,
+from oldest to newest.
+.LP
+The former is mostly used for circular buffers to capture a tail of
+the execution trace (e.g. for debugging); the latter is mostly used to
+collect a continuous trace (e.g. for profiling) where the user drains
+the hardware buffer into a larger private buffer or into a file.
+.LP
+In addition to branches, timestamps (in jiffies) may optionally be
+recorded when the traced process arrives and departs,
+respectively. This information can be used to obtain a qualitative
+execution order, if more than one process is traced.
+.LP
+A BTS record is defined as:
+.LP
+.nf
+enum ptrace_bts_qualifier {
+   PTRACE_BTS_INVALID = 0,
+   PTRACE_BTS_BRANCH,
+   PTRACE_BTS_TASK_ARRIVES,
+   PTRACE_BTS_TASK_DEPARTS
+};
+.sp
+struct ptrace_bts_record {
+   u64 qualifier;
+   union {
+   /* PTRACE_BTS_BRANCH */
+   struct {
+   u64 from_ip;
+   u64 to_ip;
+   } lbr;
+   /* PTRACE_BTS_TASK_ARRIVES or
+  PTRACE_BTS_TASK_DEPARTS */
+   u64 timestamp;
+   } variant;
+};
+.fi
+.LP
+For configuring last branch recording and for querying its status, the
+following struct is used:
+.LP
+.nf
+struct ptrace_bts_config {
+   unsigned int size;
+   unsigned int flags;
+   unsigned int signal;
+};
+.fi
+.LP
+\fISize\fP is either the requested or the actual size of the kernel
+BTS buffer in bytes.
+\fIFlags\fP is a bitmask of options, which are specified by the
+following flags:
+.RS
+.TP
+.BR PTRACE_BTS_O_TRACE
+Collect branch trace records
+.TP
+.BR PTRACE_BTS_O_SCHED
+Collect scheduling timing information
+.TP
+.BR PTRACE_BTS_O_SIGNAL
+Send \fIsignal\fP to the traced task in case of a buffer overflow
+.TP
+.BR PTRACE_BTS_O_CUT_SIZE
+Reduce the requested buffer size if it is bigger than the available
+buffer size.
+.RE
+\fISignal\fP is the signal to send to the traced task in case of a
+buffer overflow.
+.TP
+.BR PTRACE_BTS_CONFIG
+Configure last branch recording. \fIaddr\fP points to a
+\fIptrace_bts_config\fP structure (see above); \fIdata\fP specifies
+the size of that structure.
+Returns the number of bytes read.
+.TP
+.BR PTRACE_BTS_STATUS
+Writes the actual configuration into a \fIptrace_bts_config\fP
+structure pointed to by \fIaddr\fP. The caller is responsible for
+allocating memory at \fIaddr\fP to hold a \fIptrace_bts_config\fP
+structure. \fIData\fP specifies the size of that structure.
+Returns the number of bytes written.
+.TP
+.BR PTRACE_BTS_SIZE
+Returns the number of BTS records available for draining. For a
+circular buffer, this number is meaningless.
+(\fIaddr\fP and \fIdata\fP are ignored.)
+.TP
+.BR PTRACE_BTS_GET
+Reads a single BTS record at index \fIdata\fP into \fIaddr\fP. The
+caller is responsible for allocating memory at \fIaddr\fP to hold one
+\fIptrace_bts_record\fP structure.
+The bigger the index, the older the record; the latest record can
+always be found at index 0.
+Returns the number of bytes written.
+.TP
+.BR PTRACE_BTS_CLEAR
+Clears the BTS buffer. This command can be used after a manual
+draining using PTRACE_BTS_GET commands.
+(\fIaddr\fP and \fIdata\fP are ignored.)
+.TP
+.BR PTRACE_BTS_DRAIN
+Reads all available BTS records into the buffer pointed to by
+\fIaddr\fP and clears the buffer.

Re: 2.6.22.14 oops msg with commvault galaxy ?

2007-12-20 Thread Vincent Fortier

Le vendredi 14 décembre 2007 à 09:28 -0800, Greg KH a écrit :
On Fri, Dec 14, 2007 at 10:37:39PM +0530, Dhaval Giani wrote:
On Fri, Dec 14, 2007 at 08:26:42AM -0800, Greg KH wrote:
On Thu, Dec 13, 2007 at 09:21:26PM +0100, Ingo Molnar wrote:

* Kay Sievers [EMAIL PROTECTED] wrote:

This one also fails to apply properly at the exact same place
as Ingo's previously posted patch. Would need to backport his
one.

It depends on a completely reworked sysfs logic, I don't think it
makes any sense to backport that.

well, if it fixes a live bug in a still supported stable kernel
release...

Vincent, could you try to just get rid of all actual uses of
se-attr.owner, within fs/sysfs/*.c? Something like the patch
below.
(totally untested - might be fatally broken as well)

How can you think that this is not needed? You can not remove it with
sysfs you are patching. Hope this explains it:
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=0ab66088c855eca68513bdd7442a426c4b374ced

yeah - as i said it might be fatally broken (in fact it is). Do we
understand why Vincent got the crashes with vanilla 2.6.22.14 ?

No, and I can't seem to duplicate them here at all.

Does anyone have a test case for this that I can work on trying to
duplicate?

If you apply CFS without my fix, and try to constantly check cpu_shares
for a user who is logging in and logging out, you should hit it. (That's
what I was doing).

Hm, how about a vanilla 2.6.22.14 kernel _without_ any patches.
That's what I am most worried about :)

Since I was getting the problem with both vanilla and CFS-patched kernels
and since, sadly, I don't have the time to do a git bisect at the moment, I
decided to go ahead and prepare a full migration to 2.6.23 (I was hoping
to skip directly to 2.6.24 but...).

I can confirm at the moment that 2.6.23 works properly with Galaxy (just
as 2.6.20 and 2.6.21 used to...).

Thanks very much, everyone, for the help, but sadly this bug will have to
remain unresolved.

thanks,

- vin
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Re: [PATCH] Move page_assign_page_cgroup to VM_BUG_ON in free_hot_cold_page

2007-12-20 Thread Peter Zijlstra


On Thu, 2007-12-20 at 13:14 +, Hugh Dickins wrote:
 On Wed, 19 Dec 2007, Dave Hansen wrote:
   --- 
   linux-2.6.24-rc5/mm/page_alloc.c~memory-controller-move-to-bug-on-in-free_hot_cold_page
2007-12-19 11:31:46.0 +0530
   +++ linux-2.6.24-rc5-balbir/mm/page_alloc.c 2007-12-19 
   11:33:45.0 +0530
   @@ -995,7 +995,7 @@ static void fastcall free_hot_cold_page(

   if (!PageHighMem(page))
   debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
   -   page_assign_page_cgroup(page, NULL);
   +   VM_BUG_ON(page_get_page_cgroup(page));
   arch_free_page(page, 0);
   kernel_map_pages(page, 1, 0); 
  
  Hi Balbir,
  
  You generally want to do these like:
  
  foo = page_assign_page_cgroup(page, NULL);
  VM_BUG_ON(foo);
  
  Some embedded people have been known to optimize kernel size like this:
  
  #define VM_BUG_ON(x) do{}while(0)
 
 Balbir's patch looks fine to me: I don't get your point there, Dave.

There was a lengthy discussion here:
  http://lkml.org/lkml/2007/12/14/131

on the merit of debug statements with side effects. But looking at our
definition:

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond) BUG_ON(cond)
#else
#define VM_BUG_ON(condition) do { } while(0)
#endif

disabling CONFIG_DEBUG_VM breaks the code as proposed by Balbir in that
it will no longer acquire the reference.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/5] sg_ring for scsi

On Thu, Dec 20 2007 at 9:58 +0200, Jens Axboe [EMAIL PROTECTED] wrote:
 On Thu, Dec 20 2007, Rusty Russell wrote:
 On Thursday 20 December 2007 18:07:41 FUJITA Tomonori wrote:
 On Thu, 20 Dec 2007 16:45:18 +1100

 Rusty Russell [EMAIL PROTECTED] wrote:
 OK, some fixes since last time, as I wade through more SCSI drivers. 
 Some drivers use use_sg as a flag to know whether the request_buffer is
 a scatterlist: I don't need the counter, but I still need the flag, so I
 fixed that in a more intuitive way (an explicit -sg pointer in the cmd).
 use_sg and the request_buffer will be removed shortly.

 http://marc.info/?l=linux-scsim=119754650614813w=2
 Thanks!  Is there a git tree somewhere with these changes?

 I think that we tried the similar idea before, scsi_sgtable, but we
 seem to settle in the current simple approach.
 Yes, a scsi-specific solution is a bad idea: other people use sg.  
 Manipulating the magic chains is horrible; it looks simple to the places 
 which simply want to iterate through it, but it's awful for code which wants 
 to create them.
 
 The current code looks like that to minimize impact on 2.6.24, see this
 branch:
 
 http://git.kernel.dk/?p=linux-2.6-block.git;a=shortlog;h=sg
 
 for how it folds into lib/sg.c and the magic disappears from SCSI.
 Rusty, nobody claimed the sg code in 2.6.24 is perfect. I like to get
 things incrementally there.
 
Dear Jens.

Is this code scheduled for 2.6.25? I cannot find it in mm tree.

I ask because the above code conflicts with the scsi_data_buffer patch,
which is in the mm tree and which I hope will get accepted into 2.6.25.
The concepts do not conflict at all; it is only that they
both bang on the same places.
(And by the way it does not apply on scsi-misc either.
And it did not compile in your tree, a missing bit in 
ide-scsi.c)

I have rebased and fixed your code on top of the scsi_data_buffer patchset.
Please review. (Patchset posted as reply to this mail)

They are totally untested, based on the -mm tree.
We should decide the order of these patches and rebase
accordingly.

AND ...
Please send to-be-included-in-next-kernel patches to Morton.
This way we can account for them. Also, I do not see an Acked-by:
from the scsi maintainer on the scsi bits of your patches.
Is it not the custom to get Acks on bits that belong to other maintainers,
even for maintainers?

Boaz
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/3] SG: Move functions to lib/scatterlist.c and add sg chaining allocator helpers


Manually doing chained sg lists is not trivial, so add some helpers
to make sure that drivers get it right.

Signed-off-by: Jens Axboe [EMAIL PROTECTED]
---
 include/linux/scatterlist.h |  125 ---
 lib/Makefile|2 +-
 lib/scatterlist.c   |  281 +++
 3 files changed, 307 insertions(+), 101 deletions(-)
 create mode 100644 lib/scatterlist.c

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 416e000..c3ca848 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -7,6 +7,12 @@
 #include linux/string.h
 #include asm/io.h
 
+struct sg_table {
+   struct scatterlist *sgl;/* the list */
+   unsigned int nents; /* number of mapped entries */
+   unsigned int orig_nents;/* original size of list */
+};
+
 /*
  * Notes on SG table design.
  *
@@ -106,31 +112,6 @@ static inline void sg_set_buf(struct scatterlist *sg, 
const void *buf,
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
 }
 
-/**
- * sg_next - return the next scatterlist entry in a list
- * @sg:The current sg entry
- *
- * Description:
- *   Usually the next entry will be @sg@ + 1, but if this sg element is part
- *   of a chained scatterlist, it could jump to the start of a new
- *   scatterlist array.
- *
- **/
-static inline struct scatterlist *sg_next(struct scatterlist *sg)
-{
-#ifdef CONFIG_DEBUG_SG
-   BUG_ON(sg-sg_magic != SG_MAGIC);
-#endif
-   if (sg_is_last(sg))
-   return NULL;
-
-   sg++;
-   if (unlikely(sg_is_chain(sg)))
-   sg = sg_chain_ptr(sg);
-
-   return sg;
-}
-
 /*
  * Loop over each sg element, following the pointer to a new list if necessary
  */
@@ -138,40 +119,6 @@ static inline struct scatterlist *sg_next(struct 
scatterlist *sg)
for (__i = 0, sg = (sglist); __i  (nr); __i++, sg = sg_next(sg))
 
 /**
- * sg_last - return the last scatterlist entry in a list
- * @sgl:   First entry in the scatterlist
- * @nents: Number of entries in the scatterlist
- *
- * Description:
- *   Should only be used casually, it (currently) scan the entire list
- *   to get the last entry.
- *
- *   Note that the @sgl@ pointer passed in need not be the first one,
- *   the important bit is that @nents@ denotes the number of entries that
- *   exist from @sgl@.
- *
- **/
-static inline struct scatterlist *sg_last(struct scatterlist *sgl,
- unsigned int nents)
-{
-#ifndef ARCH_HAS_SG_CHAIN
-   struct scatterlist *ret = sgl[nents - 1];
-#else
-   struct scatterlist *sg, *ret = NULL;
-   unsigned int i;
-
-   for_each_sg(sgl, sg, nents, i)
-   ret = sg;
-
-#endif
-#ifdef CONFIG_DEBUG_SG
-   BUG_ON(sgl[0].sg_magic != SG_MAGIC);
-   BUG_ON(!sg_is_last(ret));
-#endif
-   return ret;
-}
-
-/**
  * sg_chain - Chain two sglists together
  * @prv:   First scatterlist
  * @prv_nents: Number of entries in prv
@@ -223,47 +170,6 @@ static inline void sg_mark_end(struct scatterlist *sg)
 }
 
 /**
- * sg_init_table - Initialize SG table
- * @sgl:  The SG table
- * @nents:Number of entries in table
- *
- * Notes:
- *   If this is part of a chained sg table, sg_mark_end() should be
- *   used only on the last table part.
- *
- **/
-static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
-{
-   memset(sgl, 0, sizeof(*sgl) * nents);
-#ifdef CONFIG_DEBUG_SG
-   {
-   unsigned int i;
-   for (i = 0; i  nents; i++)
-   sgl[i].sg_magic = SG_MAGIC;
-   }
-#endif
-   sg_mark_end(sgl[nents - 1]);
-}
-
-/**
- * sg_init_one - Initialize a single entry sg list
- * @sg: SG entry
- * @buf:Virtual address for IO
- * @buflen: IO length
- *
- * Notes:
- *   This should not be used on a single entry that is part of a larger
- *   table. Use sg_init_table() for that.
- *
- **/
-static inline void sg_init_one(struct scatterlist *sg, const void *buf,
-  unsigned int buflen)
-{
-   sg_init_table(sg, 1);
-   sg_set_buf(sg, buf, buflen);
-}
-
-/**
  * sg_phys - Return physical address of an sg entry
  * @sg: SG entry
  *
@@ -293,4 +199,23 @@ static inline void *sg_virt(struct scatterlist *sg)
return page_address(sg_page(sg)) + sg-offset;
 }
 
+struct scatterlist *sg_next(struct scatterlist *);
+struct scatterlist *sg_last(struct scatterlist *s, unsigned int);
+void sg_init_table(struct scatterlist *, unsigned int);
+void sg_init_one(struct scatterlist *, const void *, unsigned int);
+
+typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t);
+typedef void (sg_free_fn)(struct scatterlist *, unsigned int);
+
+void __sg_free_table(struct sg_table *, sg_free_fn *);
+void sg_free_table(struct sg_table *);
+int __sg_alloc_table(struct sg_table *,

[PATCH 2/3] SG: Convert SCSI to use scatterlist helpers for sg chaining

From: Jens Axboe [EMAIL PROTECTED]

Signed-off-by: Jens Axboe [EMAIL PROTECTED]
---
 drivers/scsi/libsrp.c|2 +-
 drivers/scsi/scsi_error.c|4 +-
 drivers/scsi/scsi_lib.c  |  150 +-
 drivers/usb/storage/isd200.c |4 +-
 include/scsi/scsi_cmnd.h |9 +--
 5 files changed, 24 insertions(+), 145 deletions(-)

diff --git a/drivers/scsi/libsrp.c b/drivers/scsi/libsrp.c
index 8a8562a..b81350d 100644
--- a/drivers/scsi/libsrp.c
+++ b/drivers/scsi/libsrp.c
@@ -427,7 +427,7 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd 
*cmd, void *info,
sc-SCp.ptr = info;
memcpy(sc-cmnd, cmd-cdb, MAX_COMMAND_SIZE);
sc-sdb.length = len;
-   sc-sdb.sglist = (void *) (unsigned long) addr;
+   sc-sdb.sgt.sgl = (void *) (unsigned long) addr;
sc-tag = tag;
err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)cmd-lun,
 cmd-tag);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 5c8ba6a..1fd2a8c 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -629,9 +629,9 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct 
scsi_eh_save *ses,
   sizeof(scmd-sense_buffer), sense_bytes);
sg_init_one(ses-sense_sgl, scmd-sense_buffer,
  scmd-sdb.length);
-   scmd-sdb.sglist = ses-sense_sgl;
+   scmd-sdb.sgt.sgl = ses-sense_sgl;
scmd-sc_data_direction = DMA_FROM_DEVICE;
-   scmd-sdb.sg_count = 1;
+   scmd-sdb.sgt.nents = 1;
memset(scmd-cmnd, 0, sizeof(scmd-cmnd));
scmd-cmnd[0] = REQUEST_SENSE;
scmd-cmnd[4] = scmd-sdb.length;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a6aae56..c7107f1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -750,136 +750,15 @@ static inline unsigned int scsi_sgtable_index(unsigned 
short nents)
return index;
 }
 
-static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb,
-   unsigned short sg_count, gfp_t gfp_mask)
+static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask)
 {
-   struct scsi_host_sg_pool *sgp;
-   struct scatterlist *sgl, *prev, *ret;
-   unsigned int index;
-   int this, left;
-
-   left = sg_count;
-   ret = prev = NULL;
-   do {
-   this = left;
-   if (this  SCSI_MAX_SG_SEGMENTS) {
-   this = SCSI_MAX_SG_SEGMENTS - 1;
-   index = SG_MEMPOOL_NR - 1;
-   } else
-   index = scsi_sgtable_index(this);
-
-   left -= this;
-
-   sgp = scsi_sg_pools + index;
-
-   sgl = mempool_alloc(sgp-pool, gfp_mask);
-   if (unlikely(!sgl))
-   goto enomem;
-
-   sg_init_table(sgl, sgp-size);
-
-   /*
-* first loop through, set initial index and return value
-*/
-   if (!ret)
-   ret = sgl;
-
-   /*
-* chain previous sglist, if any. we know the previous
-* sglist must be the biggest one, or we would not have
-* ended up doing another loop.
-*/
-   if (prev)
-   sg_chain(prev, SCSI_MAX_SG_SEGMENTS, sgl);
-
-   /*
-* if we have nothing left, mark the last segment as
-* end-of-list
-*/
-   if (!left)
-   sg_mark_end(sgl[this - 1]);
-
-   /*
-* don't allow subsequent mempool allocs to sleep, it would
-* violate the mempool principle.
-*/
-   gfp_mask = ~__GFP_WAIT;
-   gfp_mask |= __GFP_HIGH;
-   prev = sgl;
-   } while (left);
-
-   /*
-* -use_sg may get modified after dma mapping has potentially
-* shrunk the number of segments, so keep a copy of it for free.
-*/
-   sdb-alloc_sg_count = sdb-sg_count = sg_count;
-   sdb-sglist = ret;
-   return 0;
-enomem:
-   if (ret) {
-   /*
-* Free entries chained off ret. Since we were trying to
-* allocate another sglist, we know that all entries are of
-* the max size.
-*/
-   sgp = scsi_sg_pools + SG_MEMPOOL_NR - 1;
-   prev = ret;
-   ret = ret[SCSI_MAX_SG_SEGMENTS - 1];
-
-   while ((sgl = sg_chain_ptr(ret)) != NULL) {
-   ret = sgl[SCSI_MAX_SG_SEGMENTS - 1];
-   mempool_free(sgl, sgp-pool);
-   }
-
-   mempool_free(prev, sgp-pool);
-

[PATCH 3/3] SG: Update ide/ to use sg_table