I have attached the diff

On Mon, May 11, 2009 at 8:16 PM, Mulyadi Santosa
<[email protected]> wrote:
> Hi...
>
> On Mon, May 11, 2009 at 4:22 PM, Sukanto Ghosh
> <[email protected]> wrote:
>> Hi,
>>
>> I was adding a field to the 3 trace_mark() calls in kernel/sched.c.
>> The calls are at: i) context_switch(prev, next, ... )  ii)
>> try_to_wake_up(p, ...)  iii) wake_up_new_task(p, ...)  functions
>>
>> The field is 'task'->mm->pfrate. (I have added the pfrate field in
>> mm_struct), where 'task' is a placeholder for prev/next/p/rq->curr in
>> those trace_mark() calls.  I found that always either the next/p
>> pointers are NULL or task->mm is NULL at that particular point.  Is it
>> supposed to be so ? Why ?
>>
>> PS: The trace_mark() in kernel/sched.c calls are for adding entries to
>> the trace-file of sched_switch tracer.
>
> Always? hmm strange. Are you sure you did the dereferencing to task_struct correctly?
>
> Perhaps you could share with us the change you made, in the form of a diff -u?
>
> regards,
>
> Mulyadi.
>



-- 
Regards,
Sukanto Ghosh
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -224,10 +225,25 @@ struct mm_struct {
         * it has been since this task got the token.
         * Look at mm/thrash.c
         */
-       unsigned int faultstamp;
-       unsigned int token_priority;
-       unsigned int last_interval;
+       u64 last_logical_faultstamp;    /* global faultstamp value at last major fault */
+       u64 last_interval;              /* in jiffies */
+       unsigned long long last_interval_ns;            /* in nanoseconds (for intervals < 1 jiffie) */
+       u64 last_logical_interval;      /* in global faults count */
+       unsigned int mm_maj_flts;
+       u64 page_flt_rate;              /* unit: (ns)^(-1)  */
+       u64 last_faultstamp_jiffies64;  /* jiffies of last major fault */
+       cycles_t last_faultstamp_cycles;        /* clock_value of last major fault */
+       u64 token_expiry_jiffies64;     /* expiry time of the swap_token */
+       u64 creation_jiffies64;         /* time at which the mm_struct was created */
+       
+       
+       /* boolean value: whether this process has a swap token or not */
+       unsigned int has_swap_token;            
+       
+       spinlock_t swap_token_lock;
+       struct rb_node token_tree_node;
 
+       unsigned int token_priority;
        unsigned long flags; /* Must use atomic bitops to access the bits */
 
        struct core_state *core_state; /* coredumping support */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de40f16..d742a0e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -254,24 +254,28 @@ extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
 
 /* linux/mm/thrash.c */
+extern int honor_swap_token;
 extern struct mm_struct * swap_token_mm;
-extern void grab_swap_token(void);
+extern void grab_swap_token(struct mm_struct *);
 extern void __put_swap_token(struct mm_struct *);
 
 static inline int has_swap_token(struct mm_struct *mm)
 {
-       return (mm == swap_token_mm);
+       return ((mm->has_swap_token)==1);
 }
 
 static inline void put_swap_token(struct mm_struct *mm)
 {
+       spin_lock(&(mm->swap_token_lock));
        if (has_swap_token(mm))
                __put_swap_token(mm);
+       spin_unlock(&(mm->swap_token_lock));
 }
 
 static inline void disable_swap_token(void)
 {
-       put_swap_token(swap_token_mm);
+//     put_swap_token(swap_token_mm);
+//     honor_swap_token = 0;   
 }
 
 #else /* CONFIG_SWAP */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe..8aed6dd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -567,8 +567,19 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        memcpy(mm, oldmm, sizeof(*mm));
 
        /* Initializing for Swap token stuff */
-       mm->token_priority = 0;
+//     mm->token_priority = 0;
        mm->last_interval = 0;
+       mm->last_logical_interval = 0;
+       mm->has_swap_token = 0;
+       mm->last_logical_faultstamp = 0;
+       mm->mm_maj_flts = 0;
+       mm->page_flt_rate = 0;
+       mm->creation_jiffies64 = get_jiffies_64();
+       mm->last_faultstamp_jiffies64 = mm->creation_jiffies64;
+       mm->last_faultstamp_cycles = get_cycles();
+       mm->last_interval_ns = 0;
+
+       spin_lock_init(&(mm->swap_token_lock));
 
        if (!mm_init(mm, tsk))
                goto fail_nomem;
diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962d..00731d4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2283,8 +2283,8 @@ out_activate:
 
 out_running:
        trace_mark(kernel_sched_wakeup,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
+               "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu",
+               p->pid, p->state, rq, p, rq->curr, rq->load.weight, p->mm->page_flt_rate);
        check_preempt_curr(rq, p);
 
        p->state = TASK_RUNNING;
@@ -2418,8 +2418,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                inc_nr_running(rq);
        }
        trace_mark(kernel_sched_wakeup_new,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
+               "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu",
+               p->pid, p->state, rq, p, rq->curr, rq->load.weight, p->mm->page_flt_rate);
        check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@ -2594,9 +2594,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
        prepare_task_switch(rq, prev, next);
        trace_mark(kernel_sched_schedule,
                "prev_pid %d next_pid %d prev_state %ld "
-               "## rq %p prev %p next %p",
+               "## rq %p prev %p next %p rq_load %lu page_flt_rate %llu",
                prev->pid, next->pid, prev->state,
-               rq, prev, next);
+               rq, prev, next, rq->load.weight, next->mm->page_flt_rate);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50ec088..e187896 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -83,6 +83,9 @@ extern int compat_log;
 extern int maps_protect;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
+/* Swap-token related */
+extern int vm_token_validity_period_ms; 
+extern int max_swap_token_frac;
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -855,6 +858,23 @@ static struct ctl_table kern_table[] = {
 };
 
 static struct ctl_table vm_table[] = {
+       /* swap token memory management controls */
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "swap_token_validity_ms",
+               .data           = &vm_token_validity_period_ms,
+               .maxlen         = sizeof(vm_token_validity_period_ms),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "max_swap_token_holder_frac",
+               .data           = &max_swap_token_frac,
+               .maxlen         = sizeof(max_swap_token_frac),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec
+       },
        {
                .ctl_name       = VM_OVERCOMMIT_MEMORY,
                .procname       = "overcommit_memory",
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f3fb3d..2311486 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -948,7 +948,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
                           struct trace_array_cpu *data,
                           struct task_struct *prev,
                           struct task_struct *next,
-                          unsigned long flags)
+                          unsigned long flags,
+                          unsigned long rq_load, 
+                          u64 page_flt_rate)
 {
        struct trace_entry *entry;
        unsigned long irq_flags;
@@ -964,6 +966,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
        entry->ctx.next_pid     = next->pid;
        entry->ctx.next_prio    = next->prio;
        entry->ctx.next_state   = next->state;
+       entry->ctx.rq_load      = rq_load;
+       entry->ctx.page_flt_rate = page_flt_rate;
        __trace_stack(tr, data, flags, 5);
        __raw_spin_unlock(&data->lock);
        raw_local_irq_restore(irq_flags);
@@ -974,7 +978,9 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
                           struct trace_array_cpu *data,
                           struct task_struct *wakee,
                           struct task_struct *curr,
-                          unsigned long flags)
+                          unsigned long flags,
+                          unsigned long rq_load,
+                          u64 page_flt_rate)
 {
        struct trace_entry *entry;
        unsigned long irq_flags;
@@ -990,6 +996,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
        entry->ctx.next_pid     = wakee->pid;
        entry->ctx.next_prio    = wakee->prio;
        entry->ctx.next_state   = wakee->state;
+       entry->ctx.rq_load      = rq_load;
+       entry->ctx.page_flt_rate = page_flt_rate;
        __trace_stack(tr, data, flags, 6);
        __raw_spin_unlock(&data->lock);
        raw_local_irq_restore(irq_flags);
@@ -1524,13 +1532,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
                state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
                S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
                comm = trace_find_cmdline(entry->ctx.next_pid);
-               trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
+               trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %lu %llu %s\n",
                                 entry->ctx.prev_pid,
                                 entry->ctx.prev_prio,
                                 S, entry->type == TRACE_CTX ? "==>" : "  +",
                                 entry->ctx.next_pid,
                                 entry->ctx.next_prio,
-                                T, comm);
+                                T,
+                                entry->ctx.rq_load,
+                                entry->ctx.page_flt_rate,
+                                comm);
                break;
        case TRACE_SPECIAL:
                trace_seq_printf(s, "# %ld %ld %ld\n",
@@ -1611,14 +1622,16 @@ static int print_trace_fmt(struct trace_iterator *iter)
                        state_to_char[entry->ctx.prev_state] : 'X';
                T = entry->ctx.next_state < sizeof(state_to_char) ?
                        state_to_char[entry->ctx.next_state] : 'X';
-               ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
+               ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %lu %llu\n",
                                       entry->ctx.prev_pid,
                                       entry->ctx.prev_prio,
                                       S,
                                       entry->type == TRACE_CTX ? "==>" : "  +",
                                       entry->ctx.next_pid,
                                       entry->ctx.next_prio,
-                                      T);
+                                      T,
+                                      entry->ctx.rq_load,
+                                      entry->ctx.page_flt_rate);
                if (!ret)
                        return 0;
                break;
@@ -1679,13 +1692,15 @@ static int print_raw_fmt(struct trace_iterator *iter)
                        state_to_char[entry->ctx.next_state] : 'X';
                if (entry->type == TRACE_WAKE)
                        S = '+';
-               ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+               ret = trace_seq_printf(s, "%d %d %c %d %d %c %lu %llu\n",
                                       entry->ctx.prev_pid,
                                       entry->ctx.prev_prio,
                                       S,
                                       entry->ctx.next_pid,
                                       entry->ctx.next_prio,
-                                      T);
+                                      T,
+                                      entry->ctx.rq_load,
+                                      entry->ctx.page_flt_rate);
                if (!ret)
                        return 0;
                break;
@@ -1783,6 +1798,8 @@ static int print_bin_fmt(struct trace_iterator *iter)
                SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
                SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
                SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+               SEQ_PUT_FIELD_RET(s, entry->ctx.rq_load);
+               SEQ_PUT_FIELD_RET(s, entry->ctx.page_flt_rate);
                break;
        case TRACE_SPECIAL:
        case TRACE_STACK:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f69f867..aafa40e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -39,6 +39,8 @@ struct ctx_switch_entry {
        unsigned int            next_pid;
        unsigned char           next_prio;
        unsigned char           next_state;
+       unsigned long           rq_load;
+       u64                     page_flt_rate;
 };
 
 /*
@@ -204,14 +206,18 @@ void tracing_sched_switch_trace(struct trace_array *tr,
                                struct trace_array_cpu *data,
                                struct task_struct *prev,
                                struct task_struct *next,
-                               unsigned long flags);
+                               unsigned long flags,
+                               unsigned long rq_load,
+                               u64 page_flt_rate);
 void tracing_record_cmdline(struct task_struct *tsk);
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
                                struct trace_array_cpu *data,
                                struct task_struct *wakee,
                                struct task_struct *cur,
-                               unsigned long flags);
+                               unsigned long flags,
+                               unsigned long rq_load,
+                               u64 page_flt_rate);
 void trace_special(struct trace_array *tr,
                   struct trace_array_cpu *data,
                   unsigned long arg1,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb817a2..93b0412 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,7 +20,7 @@ static atomic_t                       sched_ref;
 
 static void
 sched_switch_func(void *private, void *__rq, struct task_struct *prev,
-                       struct task_struct *next)
+                       struct task_struct *next, unsigned long rq_load, u64 page_flt_rate)
 {
        struct trace_array **ptr = private;
        struct trace_array *tr = *ptr;
@@ -40,8 +40,10 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev,
        data = tr->data[cpu];
        disabled = atomic_inc_return(&data->disabled);
 
+//     rq_load = ((struct rq*)__rq)->load.weight;
+
        if (likely(disabled == 1))
-               tracing_sched_switch_trace(tr, data, prev, next, flags);
+               tracing_sched_switch_trace(tr, data, prev, next, flags, rq_load, page_flt_rate);
 
        atomic_dec(&data->disabled);
        local_irq_restore(flags);
@@ -54,6 +56,8 @@ sched_switch_callback(void *probe_data, void *call_data,
        struct task_struct *prev;
        struct task_struct *next;
        struct rq *__rq;
+       unsigned long rq_load;
+       u64 page_flt_rate;
 
        if (!atomic_read(&sched_ref))
                return;
@@ -65,17 +69,19 @@ sched_switch_callback(void *probe_data, void *call_data,
        __rq = va_arg(*args, typeof(__rq));
        prev = va_arg(*args, typeof(prev));
        next = va_arg(*args, typeof(next));
+       rq_load = va_arg(*args, typeof(rq_load));
+       page_flt_rate = va_arg(*args, typeof(page_flt_rate));
 
        /*
         * If tracer_switch_func only points to the local
         * switch func, it still needs the ptr passed to it.
         */
-       sched_switch_func(probe_data, __rq, prev, next);
+       sched_switch_func(probe_data, __rq, prev, next, rq_load, page_flt_rate);
 }
 
 static void
 wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
-                       task_struct *curr)
+                       task_struct *curr, unsigned long rq_load, u64 page_flt_rate)
 {
        struct trace_array **ptr = private;
        struct trace_array *tr = *ptr;
@@ -94,8 +100,10 @@ wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
        data = tr->data[cpu];
        disabled = atomic_inc_return(&data->disabled);
 
+//     rq_load = ((struct rq*)__rq)->load.weight;
+
        if (likely(disabled == 1))
-               tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+               tracing_sched_wakeup_trace(tr, data, wakee, curr, flags, rq_load, page_flt_rate);
 
        atomic_dec(&data->disabled);
        local_irq_restore(flags);
@@ -108,6 +116,8 @@ wake_up_callback(void *probe_data, void *call_data,
        struct task_struct *curr;
        struct task_struct *task;
        struct rq *__rq;
+       unsigned long rq_load;
+       u64 page_flt_rate;
 
        if (likely(!tracer_enabled))
                return;
@@ -119,11 +129,13 @@ wake_up_callback(void *probe_data, void *call_data,
        __rq = va_arg(*args, typeof(__rq));
        task = va_arg(*args, typeof(task));
        curr = va_arg(*args, typeof(curr));
+       rq_load = va_arg(*args, typeof(rq_load));
+       page_flt_rate = va_arg(*args, typeof(page_flt_rate));
 
        tracing_record_cmdline(task);
        tracing_record_cmdline(curr);
 
-       wakeup_func(probe_data, __rq, task, curr);
+       wakeup_func(probe_data, __rq, task, curr, rq_load, page_flt_rate);
 }
 
 static void sched_switch_reset(struct trace_array *tr)
@@ -141,7 +153,7 @@ static int tracing_sched_register(void)
        int ret;
 
        ret = marker_probe_register("kernel_sched_wakeup",
-                       "pid %d state %ld ## rq %p task %p rq->curr %p",
+                       "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu",
                        wake_up_callback,
                        &ctx_trace);
        if (ret) {
@@ -151,7 +163,7 @@ static int tracing_sched_register(void)
        }
 
        ret = marker_probe_register("kernel_sched_wakeup_new",
-                       "pid %d state %ld ## rq %p task %p rq->curr %p",
+                       "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu",
                        wake_up_callback,
                        &ctx_trace);
        if (ret) {
@@ -162,7 +174,7 @@ static int tracing_sched_register(void)
 
        ret = marker_probe_register("kernel_sched_schedule",
                "prev_pid %d next_pid %d prev_state %ld "
-               "## rq %p prev %p next %p",
+               "## rq %p prev %p next %p rq_load %lu page_flt_rate %llu",
                sched_switch_callback,
                &ctx_trace);
        if (ret) {
diff --git a/mm/memory.c b/mm/memory.c
index 1002f47..ccc6eee 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2268,7 +2268,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
        page = lookup_swap_cache(entry);
        if (!page) {
-               grab_swap_token(); /* Contend for token _before_ read-in */
+               /* Contend for token _before_ read-in */
+               if(honor_swap_token)
+                       grab_swap_token(mm); 
                page = swapin_readahead(entry,
                                        GFP_HIGHUSER_MOVABLE, vma, address);
                if (!page) {
diff --git a/mm/rmap.c b/mm/rmap.c
index e8d639b..8fb0ed5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -317,11 +317,16 @@ static int page_referenced_one(struct page *page,
        } else if (ptep_clear_flush_young_notify(vma, address, pte))
                referenced++;
 
-       /* Pretend the page is referenced if the task has the
-          swap token and is in the middle of a page fault. */
-       if (mm != current->mm && has_swap_token(mm) &&
-                       rwsem_is_locked(&mm->mmap_sem))
-               referenced++;
+       /* If the task has an invalid swap token, revoke it. 
+          Otherwise, pretend the page is referenced */
+       spin_lock(&(mm->swap_token_lock));
+       if ((mm != current->mm) && has_swap_token(mm)) {
+               if(time_after64(get_jiffies_64(), mm->token_expiry_jiffies64))
+                       __put_swap_token(mm);
+               else
+                       referenced++;
+       }
+       spin_unlock(&(mm->swap_token_lock));
 
        (*mapcount)--;
        pte_unmap_unlock(pte, ptl);
diff --git a/mm/thrash.c b/mm/thrash.c
index c4c5205..222ce54 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,59 +21,244 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
+#include <linux/rbtree.h>
+#include <linux/timex.h>
+#include <linux/cpufreq.h>
+#include <linux/clocksource.h>
 
 static DEFINE_SPINLOCK(swap_token_lock);
 struct mm_struct *swap_token_mm;
 static unsigned int global_faults;
 
-void grab_swap_token(void)
+int honor_swap_token = 1;
+
+struct rb_root token_tree_root = RB_ROOT;
+struct rb_node *token_tree_leftmost = NULL;
+int num_swap_token_holders = 0;
+
+/* maximum allowed fraction of swap-token holders */
+int max_swap_token_frac = 50;
+/* number of msec for which the token is valid */
+int vm_token_validity_period_ms = 10;
+
+
+void token_tree_insert(struct mm_struct *mm) {
+
+       struct rb_node **new = &(token_tree_root.rb_node), *parent = NULL;
+       int leftmost = 1;
+
+       while(*new) {
+               struct mm_struct *this = container_of(*new, struct mm_struct, token_tree_node);
+
+               parent = *new;
+               if(mm->page_flt_rate < this->page_flt_rate)
+                       new = &((*new)->rb_left);
+//             else if(!((mm->page_flt_rate == this->page_flt_rate) && (mm == this))) {
+               else if(mm != this) {
+                       new = &((*new)->rb_right);
+                       leftmost = 0;
+               }
+               else
+                       return;
+       }
+
+       if(leftmost)
+               token_tree_leftmost = &(mm->token_tree_node);
+
+       rb_link_node(&(mm->token_tree_node), parent, new);
+       rb_insert_color(&(mm->token_tree_node), &token_tree_root);
+}
+
+void token_tree_delete(struct mm_struct* mm) {
+
+       if(&(mm->token_tree_node) == token_tree_leftmost) {
+               struct rb_node* next_node = rb_next(token_tree_leftmost);
+               token_tree_leftmost = next_node;
+       }
+       rb_erase(&(mm->token_tree_node), &token_tree_root);
+}
+/* 64-bit division [copied from x86/include/div64.h]
+ *
+ * modifies the dividend to store the quotient */
+#define my_do_div64(n, base)                                         \
+       ({                                                              \
+                unsigned long __upper, __low, __high, __mod, __base;    \
+                __base = (base);                                        \
+                asm("":"=a" (__low), "=d" (__high) : "A" (n));          \
+                __upper = __high;                                       \
+                if (__high) {                                           \
+                        __upper = __high % (__base);                    \
+                        __high = __high / (__base);                     \
+                }                                                       \
+                asm("divl %2":"=a" (__low), "=d" (__mod)                \
+                                    : "rm" (__base), "0" (__low), "1" (__upper));       \
+                asm("":"=A" (n) : "a" (__low), "d" (__high));           \
+                __mod;                                                  \
+        })
+
+#define sg_jiffies_2_ns(x)  (x * 1000000LLU  * (1000/HZ))
+
+/*TODO: to add clock_t get_cycles(void)  for timing measurements when time elapsed 
+ * between successive page faults is < 1 jiffies               
+ *
+ * use 'unsigned int cpufreq_quick_get(unsigned int cpu)' for getting the cpu freq
+ * in KHz unit
+ *
+ * conversion from number of clocks to nanosecond:
+ * 
+ * x clocks  = ((x * 10^6)/cpufreq) ns
+ *
+ */
+
+void grab_swap_token(struct mm_struct *cur_mm)
 {
-       int current_interval;
+//     struct mm_struct *cur_mm;
+       struct mm_struct *leftmost_mm = NULL;
+       int current_logical_interval;
+       u64 current_interval;
+       unsigned long long current_interval_ns = 0;
+       int cur_swap_token_frac;
+
+       int nr_total;
+//     u64 duration;   
+       u64 a,b,c,d,e,f;
+       u64 prev_page_flt_rate;
+
+//     cur_mm = current->mm;
 
+
+       // global lock
+       spin_lock(&(swap_token_lock));
+       
        global_faults++;
+       current_logical_interval = global_faults - cur_mm->last_logical_faultstamp;
+
+       spin_lock(&(cur_mm->swap_token_lock));
+
+       cur_mm->mm_maj_flts++;
+       /* calculate the page fault rate */
+
+       current_interval = (get_jiffies_64() -  cur_mm->last_faultstamp_jiffies64);
+       if(current_interval == 0LLU) {  /* interval less than 1 jiffie */
+               current_interval_ns = cyc2ns(clock, (long)get_cycles() - (long)cur_mm->last_faultstamp_cycles); 
+       }
+       
+       cur_mm->last_faultstamp_jiffies64 = get_jiffies_64();
+       cur_mm->last_faultstamp_cycles = get_cycles();
+
+       prev_page_flt_rate = cur_mm->page_flt_rate;
+
+#define MULTIPLIER 1000000LLU
+       if(cur_mm->last_interval == 0) {
+               cur_mm->page_flt_rate = MULTIPLIER;
+               if(current_interval) {
+                       a = sg_jiffies_2_ns(current_interval);
+                       my_do_div64(cur_mm->page_flt_rate, a);
+               }
+               else 
+                       my_do_div64(cur_mm->page_flt_rate, current_interval_ns);
+               
+       }
+       else {
+               a = 25LLU*cur_mm->page_flt_rate;
+               b = 100LLU;
+               c = 35*MULTIPLIER;
 
-       current_interval = global_faults - current->mm->faultstamp;
+               if(cur_mm->last_interval)
+                       d = 100LLU * sg_jiffies_2_ns(cur_mm->last_interval);
+               else
+                       d = 100LLU * cur_mm->last_interval_ns;
 
-       if (!spin_trylock(&swap_token_lock))
-               return;
+               e = 40*MULTIPLIER;
 
-       /* First come first served */
-       if (swap_token_mm == NULL) {
-               current->mm->token_priority = current->mm->token_priority + 2;
-               swap_token_mm = current->mm;
-               goto out;
+               if(current_interval)
+                       f = 100LLU * sg_jiffies_2_ns(current_interval);
+               else
+                       f = 100LLU * current_interval_ns;
+                       
+               printk(KERN_NOTICE "SGDEBUG: f becomes %llu\n", f);
+               my_do_div64(a,b);
+               my_do_div64(c,d);
+               my_do_div64(e,f);
+               cur_mm->page_flt_rate = (a+c+e);
        }
+#undef MULTIPLIER
 
-       if (current->mm != swap_token_mm) {
-               if (current_interval < current->mm->last_interval)
-                       current->mm->token_priority++;
+       
+       printk(KERN_NOTICE "SGDEBUG: [pid:%u] # page-faults: %u; current_interval %llu; pg-flt rate: %llu \n", current->pid, cur_mm->mm_maj_flts, current_interval, cur_mm->page_flt_rate); 
+
+       /* calculate the current fraction of token_holders */
+       nr_total = nr_running();
+       cur_swap_token_frac = (num_swap_token_holders *100) / nr_total;
+
+       printk(KERN_NOTICE "SGDEBUG: [pid:%u] Total proc: %d ; # Holders: %d ; Holder-frac: %d \n", current->pid, nr_total, num_swap_token_holders, cur_swap_token_frac); 
+
+       /* the leftmost node in the rb-tree, with the least page-fault rate */ 
+       if( token_tree_leftmost != NULL)
+               leftmost_mm = container_of(token_tree_leftmost, struct mm_struct, token_tree_node);
+       
+       /* check if 'eligible' to get the token */
+       if( (prev_page_flt_rate < cur_mm->page_flt_rate) && 
+               (((token_tree_leftmost != NULL) &&  (cur_mm->page_flt_rate > leftmost_mm->page_flt_rate)) || 
+                (cur_swap_token_frac < max_swap_token_frac)) )  {
+
+               printk(KERN_NOTICE "SGDEBUG: [pid:%u] Eligible for token\n", current->pid); 
+               if(cur_mm->has_swap_token) {
+                       /* update the position in the tree */
+                       token_tree_delete(cur_mm);
+                       token_tree_insert(cur_mm);
+               }
                else {
-                       if (likely(current->mm->token_priority > 0))
-                               current->mm->token_priority--;
+                       /* give the token */
+                       cur_mm->has_swap_token = 1;
+                       token_tree_insert(cur_mm);
+                       num_swap_token_holders++;
                }
-               /* Check if we deserve the token */
-               if (current->mm->token_priority >
-                               swap_token_mm->token_priority) {
-                       current->mm->token_priority += 2;
-                       swap_token_mm = current->mm;
+               /* set the expiry time of the token */
+               cur_mm->token_expiry_jiffies64 = get_jiffies_64() + 
+                       (HZ * vm_token_validity_period_ms) / 1000;
+
+               /* re-calculate the fraction of token holders */
+               cur_swap_token_frac = (num_swap_token_holders *100) / nr_running();
+               
+               /* the leftmost_token_tree may get updated, so recompute leftmost_mm  */
+               leftmost_mm = container_of(token_tree_leftmost, struct mm_struct, token_tree_node);
+
+               /* if max allowed fraction exceeded take the token from the leftmost */
+               if(cur_swap_token_frac > max_swap_token_frac) {
+//                     printk(KERN_NOTICE "SGDEBUG: (grab_swap_token) mm = %x\n", cur_mm); 
+                       __put_swap_token(leftmost_mm);
                }
-       } else {
-               /* Token holder came in again! */
-               current->mm->token_priority += 2;
        }
+       /* if not eligible for page-faults and token has expired*/
+       else if(cur_mm->has_swap_token && 
+                       time_after64(get_jiffies_64(), cur_mm->token_expiry_jiffies64) ) {              
+//             printk(KERN_NOTICE "SGDEBUG: (grab_swap_token) mm = %x\n", cur_mm); 
+               __put_swap_token(cur_mm);
+       }
+       cur_mm->last_logical_faultstamp = global_faults;
+       cur_mm->last_logical_interval = current_logical_interval;
+       cur_mm->last_interval_ns = current_interval_ns;
+       cur_mm->last_interval = current_interval;
+       spin_unlock(&(cur_mm->swap_token_lock));
+       spin_unlock(&(swap_token_lock));
 
-out:
-       current->mm->faultstamp = global_faults;
-       current->mm->last_interval = current_interval;
-       spin_unlock(&swap_token_lock);
-return;
+       return;
 }
 
-/* Called on process exit. */
+/* Revokes the swap token */
+
+/* Expects that the mm->swap_token_lock is held before it is called */
 void __put_swap_token(struct mm_struct *mm)
 {
-       spin_lock(&swap_token_lock);
-       if (likely(mm == swap_token_mm))
-               swap_token_mm = NULL;
-       spin_unlock(&swap_token_lock);
+//     printk(KERN_NOTICE "SGDEBUG: (put_swap_token) mm = %x\n", mm); 
+//     spin_lock(&(mm->swap_token_lock));
+       if (likely(mm->has_swap_token)) {
+               mm->has_swap_token = 0;
+               token_tree_delete(mm);
+               num_swap_token_holders--;
+               printk(KERN_NOTICE "SGDEBUG: Token revoked\n"); 
+       }
+//     spin_unlock(&(mm->swap_token_lock));
 }
+

Reply via email to