I have attached the diff.
On Mon, May 11, 2009 at 8:16 PM, Mulyadi Santosa
<[email protected]> wrote:
> Hi...
>
> On Mon, May 11, 2009 at 4:22 PM, Sukanto Ghosh
> <[email protected]> wrote:
>> Hi,
>>
>> I was adding a field to the 3 trace_mark() calls in kernel/sched.c.
>> The calls are at: i) context_switch(prev, next, ... ) ii)
>> try_to_wake_up(p, ...) iii) wake_up_new_task(p, ...) functions
>>
>> The field is 'task'->mm->pfrate. (I have added the pfrate field in
>> mm_struct), where 'task' is a placeholder for prev/next/p/rq->curr in
>> those trace_mark() calls. I found that always either the next/p
>> pointers are NULL or task->mm is NULL at that particular point. Is it
>> supposed to be so ? Why ?
>>
>> PS: The trace_mark() in kernel/sched.c calls are for adding entries to
>> the trace-file of sched_switch tracer.
>
> Always? Hmm, strange. Are you sure you did the dereferencing to task_struct correctly?
>
> Perhaps you could share the change you made with us, in the form of a
> diff -u?
>
> regards,
>
> Mulyadi.
>
--
Regards,
Sukanto Ghosh
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -224,10 +225,25 @@ struct mm_struct {
* it has been since this task got the token.
* Look at mm/thrash.c
*/
- unsigned int faultstamp;
- unsigned int token_priority;
- unsigned int last_interval;
+ u64 last_logical_faultstamp; /* global faultstamp value at last
major fault */
+ u64 last_interval; /* in jiffies */
+ unsigned long long last_interval_ns; /* in nanoseconds (for
intervals < 1 jiffie) */
+ u64 last_logical_interval; /* in global faults count */
+ unsigned int mm_maj_flts;
+ u64 page_flt_rate; /* unit: (ns)^(-1) */
+ u64 last_faultstamp_jiffies64; /* jiffies of last major fault */
+ cycles_t last_faultstamp_cycles; /* clock_value of last major
fault */
+ u64 token_expiry_jiffies64; /* expiry time of the swap_token */
+ u64 creation_jiffies64; /* time at which the mm_struct was
created */
+
+
+ /* boolean value: whether this process has a swap token or not */
+ unsigned int has_swap_token;
+
+ spinlock_t swap_token_lock;
+ struct rb_node token_tree_node;
+ unsigned int token_priority;
unsigned long flags; /* Must use atomic bitops to access the bits */
struct core_state *core_state; /* coredumping support */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de40f16..d742a0e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -254,24 +254,28 @@ extern int remove_exclusive_swap_page(struct page *);
struct backing_dev_info;
/* linux/mm/thrash.c */
+extern int honor_swap_token;
extern struct mm_struct * swap_token_mm;
-extern void grab_swap_token(void);
+extern void grab_swap_token(struct mm_struct *);
extern void __put_swap_token(struct mm_struct *);
static inline int has_swap_token(struct mm_struct *mm)
{
- return (mm == swap_token_mm);
+ return ((mm->has_swap_token)==1);
}
static inline void put_swap_token(struct mm_struct *mm)
{
+ spin_lock(&(mm->swap_token_lock));
if (has_swap_token(mm))
__put_swap_token(mm);
+ spin_unlock(&(mm->swap_token_lock));
}
static inline void disable_swap_token(void)
{
- put_swap_token(swap_token_mm);
+// put_swap_token(swap_token_mm);
+// honor_swap_token = 0;
}
#else /* CONFIG_SWAP */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe..8aed6dd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -567,8 +567,19 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
/* Initializing for Swap token stuff */
- mm->token_priority = 0;
+// mm->token_priority = 0;
mm->last_interval = 0;
+ mm->last_logical_interval = 0;
+ mm->has_swap_token = 0;
+ mm->last_logical_faultstamp = 0;
+ mm->mm_maj_flts = 0;
+ mm->page_flt_rate = 0;
+ mm->creation_jiffies64 = get_jiffies_64();
+ mm->last_faultstamp_jiffies64 = mm->creation_jiffies64;
+ mm->last_faultstamp_cycles = get_cycles();
+ mm->last_interval_ns = 0;
+
+ spin_lock_init(&(mm->swap_token_lock));
if (!mm_init(mm, tsk))
goto fail_nomem;
diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962d..00731d4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2283,8 +2283,8 @@ out_activate:
out_running:
trace_mark(kernel_sched_wakeup,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
+ "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu
page_flt_rate %llu",
+ p->pid, p->state, rq, p, rq->curr, rq->load.weight,
p->mm->page_flt_rate);
check_preempt_curr(rq, p);
p->state = TASK_RUNNING;
@@ -2418,8 +2418,8 @@ void wake_up_new_task(struct task_struct *p, unsigned
long clone_flags)
inc_nr_running(rq);
}
trace_mark(kernel_sched_wakeup_new,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
+ "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu
page_flt_rate %llu",
+ p->pid, p->state, rq, p, rq->curr, rq->load.weight,
p->mm->page_flt_rate);
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -2594,9 +2594,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
prepare_task_switch(rq, prev, next);
trace_mark(kernel_sched_schedule,
"prev_pid %d next_pid %d prev_state %ld "
- "## rq %p prev %p next %p",
+ "## rq %p prev %p next %p rq_load %lu page_flt_rate %llu",
prev->pid, next->pid, prev->state,
- rq, prev, next);
+ rq, prev, next, rq->load.weight, next->mm->page_flt_rate);
mm = next->mm;
oldmm = prev->active_mm;
/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50ec088..e187896 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -83,6 +83,9 @@ extern int compat_log;
extern int maps_protect;
extern int latencytop_enabled;
extern int sysctl_nr_open_min, sysctl_nr_open_max;
+/* Swap-token related */
+extern int vm_token_validity_period_ms;
+extern int max_swap_token_frac;
#ifdef CONFIG_RCU_TORTURE_TEST
extern int rcutorture_runnable;
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -855,6 +858,23 @@ static struct ctl_table kern_table[] = {
};
static struct ctl_table vm_table[] = {
+ /* swap token memory management controls */
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "swap_token_validity_ms",
+ .data = &vm_token_validity_period_ms,
+ .maxlen = sizeof(vm_token_validity_period_ms),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_swap_token_holder_frac",
+ .data = &max_swap_token_frac,
+ .maxlen = sizeof(max_swap_token_frac),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
{
.ctl_name = VM_OVERCOMMIT_MEMORY,
.procname = "overcommit_memory",
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f3fb3d..2311486 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -948,7 +948,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *prev,
struct task_struct *next,
- unsigned long flags)
+ unsigned long flags,
+ unsigned long rq_load,
+ u64 page_flt_rate)
{
struct trace_entry *entry;
unsigned long irq_flags;
@@ -964,6 +966,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->ctx.next_pid = next->pid;
entry->ctx.next_prio = next->prio;
entry->ctx.next_state = next->state;
+ entry->ctx.rq_load = rq_load;
+ entry->ctx.page_flt_rate = page_flt_rate;
__trace_stack(tr, data, flags, 5);
__raw_spin_unlock(&data->lock);
raw_local_irq_restore(irq_flags);
@@ -974,7 +978,9 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *wakee,
struct task_struct *curr,
- unsigned long flags)
+ unsigned long flags,
+ unsigned long rq_load,
+ u64 page_flt_rate)
{
struct trace_entry *entry;
unsigned long irq_flags;
@@ -990,6 +996,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->ctx.next_pid = wakee->pid;
entry->ctx.next_prio = wakee->prio;
entry->ctx.next_state = wakee->state;
+ entry->ctx.rq_load = rq_load;
+ entry->ctx.page_flt_rate = page_flt_rate;
__trace_stack(tr, data, flags, 6);
__raw_spin_unlock(&data->lock);
raw_local_irq_restore(irq_flags);
@@ -1524,13 +1532,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int
trace_idx, int cpu)
state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) +
1 : 0;
S = state < sizeof(state_to_char) - 1 ? state_to_char[state] :
'X';
comm = trace_find_cmdline(entry->ctx.next_pid);
- trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
+ trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %lu %llu %s\n",
entry->ctx.prev_pid,
entry->ctx.prev_prio,
S, entry->type == TRACE_CTX ? "==>" : " +",
entry->ctx.next_pid,
entry->ctx.next_prio,
- T, comm);
+ T,
+ entry->ctx.rq_load,
+ entry->ctx.page_flt_rate,
+ comm);
break;
case TRACE_SPECIAL:
trace_seq_printf(s, "# %ld %ld %ld\n",
@@ -1611,14 +1622,16 @@ static int print_trace_fmt(struct trace_iterator *iter)
state_to_char[entry->ctx.prev_state] : 'X';
T = entry->ctx.next_state < sizeof(state_to_char) ?
state_to_char[entry->ctx.next_state] : 'X';
- ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
+ ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %lu
%llu\n",
entry->ctx.prev_pid,
entry->ctx.prev_prio,
S,
entry->type == TRACE_CTX ? "==>" : " +",
entry->ctx.next_pid,
entry->ctx.next_prio,
- T);
+ T,
+ entry->ctx.rq_load,
+ entry->ctx.page_flt_rate);
if (!ret)
return 0;
break;
@@ -1679,13 +1692,15 @@ static int print_raw_fmt(struct trace_iterator *iter)
state_to_char[entry->ctx.next_state] : 'X';
if (entry->type == TRACE_WAKE)
S = '+';
- ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+ ret = trace_seq_printf(s, "%d %d %c %d %d %c %lu %llu\n",
entry->ctx.prev_pid,
entry->ctx.prev_prio,
S,
entry->ctx.next_pid,
entry->ctx.next_prio,
- T);
+ T,
+ entry->ctx.rq_load,
+ entry->ctx.page_flt_rate);
if (!ret)
return 0;
break;
@@ -1783,6 +1798,8 @@ static int print_bin_fmt(struct trace_iterator *iter)
SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+ SEQ_PUT_FIELD_RET(s, entry->ctx.rq_load);
+ SEQ_PUT_FIELD_RET(s, entry->ctx.page_flt_rate);
break;
case TRACE_SPECIAL:
case TRACE_STACK:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f69f867..aafa40e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -39,6 +39,8 @@ struct ctx_switch_entry {
unsigned int next_pid;
unsigned char next_prio;
unsigned char next_state;
+ unsigned long rq_load;
+ u64 page_flt_rate;
};
/*
@@ -204,14 +206,18 @@ void tracing_sched_switch_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *prev,
struct task_struct *next,
- unsigned long flags);
+ unsigned long flags,
+ unsigned long rq_load,
+ u64 page_flt_rate);
void tracing_record_cmdline(struct task_struct *tsk);
void tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *wakee,
struct task_struct *cur,
- unsigned long flags);
+ unsigned long flags,
+ unsigned long rq_load,
+ u64 page_flt_rate);
void trace_special(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long arg1,
diff --git a/kernel/trace/trace_sched_switch.c
b/kernel/trace/trace_sched_switch.c
index cb817a2..93b0412 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,7 +20,7 @@ static atomic_t sched_ref;
static void
sched_switch_func(void *private, void *__rq, struct task_struct *prev,
- struct task_struct *next)
+ struct task_struct *next, unsigned long rq_load, u64
page_flt_rate)
{
struct trace_array **ptr = private;
struct trace_array *tr = *ptr;
@@ -40,8 +40,10 @@ sched_switch_func(void *private, void *__rq, struct
task_struct *prev,
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
+// rq_load = ((struct rq*)__rq)->load.weight;
+
if (likely(disabled == 1))
- tracing_sched_switch_trace(tr, data, prev, next, flags);
+ tracing_sched_switch_trace(tr, data, prev, next, flags,
rq_load, page_flt_rate);
atomic_dec(&data->disabled);
local_irq_restore(flags);
@@ -54,6 +56,8 @@ sched_switch_callback(void *probe_data, void *call_data,
struct task_struct *prev;
struct task_struct *next;
struct rq *__rq;
+ unsigned long rq_load;
+ u64 page_flt_rate;
if (!atomic_read(&sched_ref))
return;
@@ -65,17 +69,19 @@ sched_switch_callback(void *probe_data, void *call_data,
__rq = va_arg(*args, typeof(__rq));
prev = va_arg(*args, typeof(prev));
next = va_arg(*args, typeof(next));
+ rq_load = va_arg(*args, typeof(rq_load));
+ page_flt_rate = va_arg(*args, typeof(page_flt_rate));
/*
* If tracer_switch_func only points to the local
* switch func, it still needs the ptr passed to it.
*/
- sched_switch_func(probe_data, __rq, prev, next);
+ sched_switch_func(probe_data, __rq, prev, next, rq_load, page_flt_rate);
}
static void
wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
- task_struct *curr)
+ task_struct *curr, unsigned long rq_load, u64
page_flt_rate)
{
struct trace_array **ptr = private;
struct trace_array *tr = *ptr;
@@ -94,8 +100,10 @@ wakeup_func(void *private, void *__rq, struct task_struct
*wakee, struct
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
+// rq_load = ((struct rq*)__rq)->load.weight;
+
if (likely(disabled == 1))
- tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+ tracing_sched_wakeup_trace(tr, data, wakee, curr, flags,
rq_load, page_flt_rate);
atomic_dec(&data->disabled);
local_irq_restore(flags);
@@ -108,6 +116,8 @@ wake_up_callback(void *probe_data, void *call_data,
struct task_struct *curr;
struct task_struct *task;
struct rq *__rq;
+ unsigned long rq_load;
+ u64 page_flt_rate;
if (likely(!tracer_enabled))
return;
@@ -119,11 +129,13 @@ wake_up_callback(void *probe_data, void *call_data,
__rq = va_arg(*args, typeof(__rq));
task = va_arg(*args, typeof(task));
curr = va_arg(*args, typeof(curr));
+ rq_load = va_arg(*args, typeof(rq_load));
+ page_flt_rate = va_arg(*args, typeof(page_flt_rate));
tracing_record_cmdline(task);
tracing_record_cmdline(curr);
- wakeup_func(probe_data, __rq, task, curr);
+ wakeup_func(probe_data, __rq, task, curr, rq_load, page_flt_rate);
}
static void sched_switch_reset(struct trace_array *tr)
@@ -141,7 +153,7 @@ static int tracing_sched_register(void)
int ret;
ret = marker_probe_register("kernel_sched_wakeup",
- "pid %d state %ld ## rq %p task %p rq->curr %p",
+ "pid %d state %ld ## rq %p task %p rq->curr %p rq_load
%lu page_flt_rate %llu",
wake_up_callback,
&ctx_trace);
if (ret) {
@@ -151,7 +163,7 @@ static int tracing_sched_register(void)
}
ret = marker_probe_register("kernel_sched_wakeup_new",
- "pid %d state %ld ## rq %p task %p rq->curr %p",
+ "pid %d state %ld ## rq %p task %p rq->curr %p rq_load
%lu page_flt_rate %llu",
wake_up_callback,
&ctx_trace);
if (ret) {
@@ -162,7 +174,7 @@ static int tracing_sched_register(void)
ret = marker_probe_register("kernel_sched_schedule",
"prev_pid %d next_pid %d prev_state %ld "
- "## rq %p prev %p next %p",
+ "## rq %p prev %p next %p rq_load %lu page_flt_rate %llu",
sched_switch_callback,
&ctx_trace);
if (ret) {
diff --git a/mm/memory.c b/mm/memory.c
index 1002f47..ccc6eee 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2268,7 +2268,9 @@ static int do_swap_page(struct mm_struct *mm, struct
vm_area_struct *vma,
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- grab_swap_token(); /* Contend for token _before_ read-in */
+ /* Contend for token _before_ read-in */
+ if(honor_swap_token)
+ grab_swap_token(mm);
page = swapin_readahead(entry,
GFP_HIGHUSER_MOVABLE, vma, address);
if (!page) {
diff --git a/mm/rmap.c b/mm/rmap.c
index e8d639b..8fb0ed5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -317,11 +317,16 @@ static int page_referenced_one(struct page *page,
} else if (ptep_clear_flush_young_notify(vma, address, pte))
referenced++;
- /* Pretend the page is referenced if the task has the
- swap token and is in the middle of a page fault. */
- if (mm != current->mm && has_swap_token(mm) &&
- rwsem_is_locked(&mm->mmap_sem))
- referenced++;
+ /* If the task has an invalid swap token, revoke it.
+ Otherwise, pretend the page is referenced */
+ spin_lock(&(mm->swap_token_lock));
+ if ((mm != current->mm) && has_swap_token(mm)) {
+ if(time_after64(get_jiffies_64(), mm->token_expiry_jiffies64))
+ __put_swap_token(mm);
+ else
+ referenced++;
+ }
+ spin_unlock(&(mm->swap_token_lock));
(*mapcount)--;
pte_unmap_unlock(pte, ptl);
diff --git a/mm/thrash.c b/mm/thrash.c
index c4c5205..222ce54 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,59 +21,244 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
+#include <linux/rbtree.h>
+#include <linux/timex.h>
+#include <linux/cpufreq.h>
+#include <linux/clocksource.h>
static DEFINE_SPINLOCK(swap_token_lock);
struct mm_struct *swap_token_mm;
static unsigned int global_faults;
-void grab_swap_token(void)
+int honor_swap_token = 1;
+
+struct rb_root token_tree_root = RB_ROOT;
+struct rb_node *token_tree_leftmost = NULL;
+int num_swap_token_holders = 0;
+
+/* maximum allowed fraction of swap-token holders */
+int max_swap_token_frac = 50;
+/* number of msec for which the token is valid */
+int vm_token_validity_period_ms = 10;
+
+
+void token_tree_insert(struct mm_struct *mm) {
+
+ struct rb_node **new = &(token_tree_root.rb_node), *parent = NULL;
+ int leftmost = 1;
+
+ while(*new) {
+ struct mm_struct *this = container_of(*new, struct mm_struct,
token_tree_node);
+
+ parent = *new;
+ if(mm->page_flt_rate < this->page_flt_rate)
+ new = &((*new)->rb_left);
+// else if(!((mm->page_flt_rate == this->page_flt_rate) && (mm ==
this))) {
+ else if(mm != this) {
+ new = &((*new)->rb_right);
+ leftmost = 0;
+ }
+ else
+ return;
+ }
+
+ if(leftmost)
+ token_tree_leftmost = &(mm->token_tree_node);
+
+ rb_link_node(&(mm->token_tree_node), parent, new);
+ rb_insert_color(&(mm->token_tree_node), &token_tree_root);
+}
+
+void token_tree_delete(struct mm_struct* mm) {
+
+ if(&(mm->token_tree_node) == token_tree_leftmost) {
+ struct rb_node* next_node = rb_next(token_tree_leftmost);
+ token_tree_leftmost = next_node;
+ }
+ rb_erase(&(mm->token_tree_node), &token_tree_root);
+}
+/* 64-bit division [copied from x86/include/div64.h]
+ *
+ * modifies the dividend to store the quotient */
+#define my_do_div64(n, base) \
+ ({ \
+ unsigned long __upper, __low, __high, __mod, __base; \
+ __base = (base); \
+ asm("":"=a" (__low), "=d" (__high) : "A" (n)); \
+ __upper = __high; \
+ if (__high) { \
+ __upper = __high % (__base); \
+ __high = __high / (__base); \
+ } \
+ asm("divl %2":"=a" (__low), "=d" (__mod) \
+ : "rm" (__base), "0" (__low), "1"
(__upper)); \
+ asm("":"=A" (n) : "a" (__low), "d" (__high)); \
+ __mod; \
+ })
+
+#define sg_jiffies_2_ns(x) (x * 1000000LLU * (1000/HZ))
+
+/*TODO: to add clock_t get_cycles(void) for timing measurements when time
elapsed
+ * between successive page faults is < 1 jiffies
+ *
+ * use 'unsigned int cpufreq_quick_get(unsigned int cpu)' for getting the cpu
freq
+ * in KHz unit
+ *
+ * conversion from number of clocks to nanosecond:
+ *
+ * x clocks = ((x * 10^6)/cpufreq) ns
+ *
+ */
+
+void grab_swap_token(struct mm_struct *cur_mm)
{
- int current_interval;
+// struct mm_struct *cur_mm;
+ struct mm_struct *leftmost_mm = NULL;
+ int current_logical_interval;
+ u64 current_interval;
+ unsigned long long current_interval_ns = 0;
+ int cur_swap_token_frac;
+
+ int nr_total;
+// u64 duration;
+ u64 a,b,c,d,e,f;
+ u64 prev_page_flt_rate;
+
+// cur_mm = current->mm;
+
+ // global lock
+ spin_lock(&(swap_token_lock));
+
global_faults++;
+ current_logical_interval = global_faults -
cur_mm->last_logical_faultstamp;
+
+ spin_lock(&(cur_mm->swap_token_lock));
+
+ cur_mm->mm_maj_flts++;
+ /* calculate the page fault rate */
+
+ current_interval = (get_jiffies_64() -
cur_mm->last_faultstamp_jiffies64);
+ if(current_interval == 0LLU) { /* interval less than 1 jiffie */
+ current_interval_ns = cyc2ns(clock, (long)get_cycles() -
(long)cur_mm->last_faultstamp_cycles);
+ }
+
+ cur_mm->last_faultstamp_jiffies64 = get_jiffies_64();
+ cur_mm->last_faultstamp_cycles = get_cycles();
+
+ prev_page_flt_rate = cur_mm->page_flt_rate;
+
+#define MULTIPLIER 1000000LLU
+ if(cur_mm->last_interval == 0) {
+ cur_mm->page_flt_rate = MULTIPLIER;
+ if(current_interval) {
+ a = sg_jiffies_2_ns(current_interval);
+ my_do_div64(cur_mm->page_flt_rate, a);
+ }
+ else
+ my_do_div64(cur_mm->page_flt_rate, current_interval_ns);
+
+ }
+ else {
+ a = 25LLU*cur_mm->page_flt_rate;
+ b = 100LLU;
+ c = 35*MULTIPLIER;
- current_interval = global_faults - current->mm->faultstamp;
+ if(cur_mm->last_interval)
+ d = 100LLU * sg_jiffies_2_ns(cur_mm->last_interval);
+ else
+ d = 100LLU * cur_mm->last_interval_ns;
- if (!spin_trylock(&swap_token_lock))
- return;
+ e = 40*MULTIPLIER;
- /* First come first served */
- if (swap_token_mm == NULL) {
- current->mm->token_priority = current->mm->token_priority + 2;
- swap_token_mm = current->mm;
- goto out;
+ if(current_interval)
+ f = 100LLU * sg_jiffies_2_ns(current_interval);
+ else
+ f = 100LLU * current_interval_ns;
+
+ printk(KERN_NOTICE "SGDEBUG: f becomes %llu\n", f);
+ my_do_div64(a,b);
+ my_do_div64(c,d);
+ my_do_div64(e,f);
+ cur_mm->page_flt_rate = (a+c+e);
}
+#undef MULTIPLIER
- if (current->mm != swap_token_mm) {
- if (current_interval < current->mm->last_interval)
- current->mm->token_priority++;
+
+ printk(KERN_NOTICE "SGDEBUG: [pid:%u] # page-faults: %u;
current_interval %llu; pg-flt rate: %llu \n", current->pid,
cur_mm->mm_maj_flts, current_interval, cur_mm->page_flt_rate);
+
+ /* calculate the current fraction of token_holders */
+ nr_total = nr_running();
+ cur_swap_token_frac = (num_swap_token_holders *100) / nr_total;
+
+ printk(KERN_NOTICE "SGDEBUG: [pid:%u] Total proc: %d ; # Holders: %d ;
Holder-frac: %d \n", current->pid, nr_total, num_swap_token_holders,
cur_swap_token_frac);
+
+ /* the leftmost node in the rb-tree, with the least page-fault rate */
+ if( token_tree_leftmost != NULL)
+ leftmost_mm = container_of(token_tree_leftmost, struct
mm_struct, token_tree_node);
+
+ /* check if 'eligible' to get the token */
+ if( (prev_page_flt_rate < cur_mm->page_flt_rate) &&
+ (((token_tree_leftmost != NULL) && (cur_mm->page_flt_rate >
leftmost_mm->page_flt_rate)) ||
+ (cur_swap_token_frac < max_swap_token_frac)) ) {
+
+ printk(KERN_NOTICE "SGDEBUG: [pid:%u] Eligible for token\n",
current->pid);
+ if(cur_mm->has_swap_token) {
+ /* update the position in the tree */
+ token_tree_delete(cur_mm);
+ token_tree_insert(cur_mm);
+ }
else {
- if (likely(current->mm->token_priority > 0))
- current->mm->token_priority--;
+ /* give the token */
+ cur_mm->has_swap_token = 1;
+ token_tree_insert(cur_mm);
+ num_swap_token_holders++;
}
- /* Check if we deserve the token */
- if (current->mm->token_priority >
- swap_token_mm->token_priority) {
- current->mm->token_priority += 2;
- swap_token_mm = current->mm;
+ /* set the expiry time of the token */
+ cur_mm->token_expiry_jiffies64 = get_jiffies_64() +
+ (HZ * vm_token_validity_period_ms) / 1000;
+
+ /* re-calculate the fraction of token holders */
+ cur_swap_token_frac = (num_swap_token_holders *100) /
nr_running();
+
+ /* the leftmost_token_tree may get updated, so recompute
leftmost_mm */
+ leftmost_mm = container_of(token_tree_leftmost, struct
mm_struct, token_tree_node);
+
+ /* if max allowed fraction exceeded take the token from the
leftmost */
+ if(cur_swap_token_frac > max_swap_token_frac) {
+// printk(KERN_NOTICE "SGDEBUG: (grab_swap_token) mm =
%x\n", cur_mm);
+ __put_swap_token(leftmost_mm);
}
- } else {
- /* Token holder came in again! */
- current->mm->token_priority += 2;
}
+ /* if not eligible for page-faults and token has expired*/
+ else if(cur_mm->has_swap_token &&
+ time_after64(get_jiffies_64(),
cur_mm->token_expiry_jiffies64) ) {
+// printk(KERN_NOTICE "SGDEBUG: (grab_swap_token) mm = %x\n",
cur_mm);
+ __put_swap_token(cur_mm);
+ }
+ cur_mm->last_logical_faultstamp = global_faults;
+ cur_mm->last_logical_interval = current_logical_interval;
+ cur_mm->last_interval_ns = current_interval_ns;
+ cur_mm->last_interval = current_interval;
+ spin_unlock(&(cur_mm->swap_token_lock));
+ spin_unlock(&(swap_token_lock));
-out:
- current->mm->faultstamp = global_faults;
- current->mm->last_interval = current_interval;
- spin_unlock(&swap_token_lock);
-return;
+ return;
}
-/* Called on process exit. */
+/* Revokes the swap token */
+
+/* Expects that the mm->swap_token_lock is held before it is called */
void __put_swap_token(struct mm_struct *mm)
{
- spin_lock(&swap_token_lock);
- if (likely(mm == swap_token_mm))
- swap_token_mm = NULL;
- spin_unlock(&swap_token_lock);
+// printk(KERN_NOTICE "SGDEBUG: (put_swap_token) mm = %x\n", mm);
+// spin_lock(&(mm->swap_token_lock));
+ if (likely(mm->has_swap_token)) {
+ mm->has_swap_token = 0;
+ token_tree_delete(mm);
+ num_swap_token_holders--;
+ printk(KERN_NOTICE "SGDEBUG: Token revoked\n");
+ }
+// spin_unlock(&(mm->swap_token_lock));
}
+