Here is a new trick.
Let us call the previous implementation (a system with only [1/2] applied) V.1.
It does the following:
- At kernel exit (ia64_leave_kernel), calculate the cycles elapsed since the
last checkpoint using the last stamp (ac_stamp), accumulate them as
"system cycles" (ac_stime), and update the stamp.
- At kernel entry (break_fault etc.), calculate the cycles elapsed since the
last checkpoint using the last stamp, accumulate them as "user cycles"
(ac_utime), and update the stamp.
It spends time on both the kernel entry path and the kernel exit path, which
unfortunately results in a considerable increase in system call overhead.
This patch [2/2] is a trick to reduce that overhead.
Let us call the new implementation (a system with both [1/2] and [2/2] applied) V.2.
It does the following:
- At kernel exit (ia64_leave_kernel), do nothing except save the
"leave time" in ac_leave, kept separate from the usual time stamp (ac_stamp).
- At kernel entry (break_fault etc.):
1. calculate the cycles from the last checkpoint (in kernel) to the "last leave"
using ac_stamp and ac_leave, and accumulate them into ac_stime.
2. calculate the cycles from the "last leave" to now, and accumulate them
into ac_utime.
3. update the stamp (ac_stamp).
In other words, this patch combines most of the previously separate work and
moves it to the entry side. The change is simple; however:
- The exit path becomes quite simple. All it needs to do is store the ITC value
to memory. There were few registers and bundle slots available
for the extra work, but fortunately I managed it without increasing the
number of bundles ;-)
- The entry path becomes slightly more complicated. But we can load/store the data
all at once, and no longer need to do so on both the exit and entry paths.
The following benchmark results show the performance impact of my patches.
(V.1 = 2.6.24-rc5 + [1/2], V.2 = rc5 + [1/2] + [2/2], orig. = rc5)
===========================================================================
INDEX VALUES RATIO(%)
TEST (Unixbench-v4.1.0) V.1 V.2 orig. V.1 V.2 orig.
=============================== ====== ====== ====== ===== ===== =====
Dhrystone 2 using register var. 304.3 304.3 304.4 100.0 100.0 100.0
Double-Precision Whetstone 171.3 171.3 171.1 100.1 100.1 100.0
Execl Throughput 471.3 466.2 467.3 100.9 99.8 100.0
File Copy 1024 buf 2000 maxblks 496.6 511.1 507.1 97.9 100.8 100.0
File Copy 256 buf 500 maxblks 352.4 355.3 366.2 96.2 97.0 100.0
File Copy 4096 buf 8000 maxblks 765.8 768.6 778.1 98.4 98.8 100.0
Pipe Throughput 422.0 427.0 416.1 101.4 102.6 100.0
Process Creation 945.1 949.3 948.0 99.7 100.1 100.0
Shell Scripts (8 concurrent) 1646.7 1646.2 1654.5 99.5 99.5 100.0
System Call Overhead 695.4 732.1 820.0 84.8 89.3 100.0
=============================== ====== ====== ====== ===== ===== =====
FINAL SCORE 522.0 527.1 533.9 97.8 98.7 100.0
===========================================================================
(@ Madison 1.5GHz x 4)
The most affected item is undoubtedly system call overhead, a path that was
originally well optimized.
But from a macro viewpoint, unless you are a full-time system-call-aholic, I
believe it is a worthwhile concession to make.
The faster hardware gets (or the more active software becomes), the
less accurate traditional tick-sampling-based CPU time accounting becomes.
When will we reach the decision point?
Signed-off-by: Hidetoshi Seto <[EMAIL PROTECTED]>
---
arch/ia64/kernel/asm-offsets.c | 1
arch/ia64/kernel/entry.S | 87 ++++++++++++++++++++++++++++++-----------
arch/ia64/kernel/fsys.S | 20 ++++++---
arch/ia64/kernel/ivt.S | 42 ++++++++++++-------
include/asm-ia64/thread_info.h | 1
5 files changed, 107 insertions(+), 44 deletions(-)
Index: linux-2.6.24-rc5/arch/ia64/kernel/asm-offsets.c
===================================================================
--- linux-2.6.24-rc5.orig/arch/ia64/kernel/asm-offsets.c
+++ linux-2.6.24-rc5/arch/ia64/kernel/asm-offsets.c
@@ -41,6 +41,7 @@
DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp));
+ DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave));
DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime));
DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime));
#endif
Index: linux-2.6.24-rc5/arch/ia64/kernel/entry.S
===================================================================
--- linux-2.6.24-rc5.orig/arch/ia64/kernel/entry.S
+++ linux-2.6.24-rc5/arch/ia64/kernel/entry.S
@@ -710,6 +710,16 @@
(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
#endif
.work_processed_syscall:
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+ adds r2=PT(LOADRS)+16,r12
+(pUStk) mov.m r22=ar.itc // fetch time at leave
+ adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
+ ;;
+(p6) ld4 r31=[r18] // load
current_thread_info()->flags
+ ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for
"loadrs"
+ adds r3=PT(AR_BSPSTORE)+16,r12 // deferred
+ ;;
+#else
adds r2=PT(LOADRS)+16,r12
adds r3=PT(AR_BSPSTORE)+16,r12
adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
@@ -718,6 +728,7 @@
ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for
"loadrs"
nop.i 0
;;
+#endif
mov r16=ar.bsp // M2 get existing backing
store pointer
ld8 r18=[r2],PT(R9)-PT(B6) // load b6
(p6) and r15=TIF_WORK_MASK,r31 // any work other than
TIF_SYSCALL_TRACE?
@@ -737,12 +748,21 @@
ld8 r29=[r2],16 // M0|1 load cr.ipsr
ld8 r28=[r3],16 // M0|1 load cr.iip
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13
+ ;;
+ ld8 r30=[r2],16 // M0|1 load cr.ifs
+ ld8 r25=[r3],16 // M0|1 load ar.unat
+(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
+ ;;
+#else
mov r22=r0 // A clear r22
;;
ld8 r30=[r2],16 // M0|1 load cr.ifs
ld8 r25=[r3],16 // M0|1 load ar.unat
(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
;;
+#endif
ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs
(pKStk) mov r22=psr // M2 read PSR now that
interrupts are disabled
nop 0
@@ -759,7 +779,11 @@
ld8.fill r1=[r3],16 // M0|1 load r1
(pUStk) mov r17=1 // A
;;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+(pUStk) st1 [r15]=r17 // M2|3
+#else
(pUStk) st1 [r14]=r17 // M2|3
+#endif
ld8.fill r13=[r3],16 // M0|1
mov f8=f0 // F clear f8
;;
@@ -775,12 +799,22 @@
shr.u r18=r19,16 // I0|1 get byte size of existing
"dirty" partition
cover // B add current frame into dirty
partition & set cr.ifs
;;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+ mov r19=ar.bsp // M2 get new backing store pointer
+ st8 [r14]=r22 // M save time at leave
+ mov f10=f0 // F clear f10
+
+ mov r22=r0 // A clear r22
+ movl r14=__kernel_syscall_via_epc // X
+ ;;
+#else
mov r19=ar.bsp // M2 get new backing store pointer
mov f10=f0 // F clear f10
nop.m 0
movl r14=__kernel_syscall_via_epc // X
;;
+#endif
mov.m ar.csd=r0 // M2 clear ar.csd
mov.m ar.ccv=r0 // M2 clear ar.ccv
mov b7=r14 // I0 clear b7 (hint with
__kernel_syscall_via_epc)
@@ -913,10 +947,18 @@
adds r16=PT(CR_IPSR)+16,r12
adds r17=PT(CR_IIP)+16,r12
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+ .pred.rel.mutex pUStk,pKStk
+(pKStk) mov r22=psr // M2 read PSR now that interrupts are
disabled
+(pUStk) mov.m r22=ar.itc // M fetch time at leave
+ nop.i 0
+ ;;
+#else
(pKStk) mov r22=psr // M2 read PSR now that interrupts are
disabled
nop.i 0
nop.i 0
;;
+#endif
ld8 r29=[r16],16 // load cr.ipsr
ld8 r28=[r17],16 // load cr.iip
;;
@@ -938,15 +980,37 @@
;;
ld8.fill r12=[r16],16
ld8.fill r13=[r17],16
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+(pUStk) adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18
+#else
(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
+#endif
;;
ld8 r20=[r16],16 // ar.fpsr
ld8.fill r15=[r17],16
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 // deferred
+#endif
;;
ld8.fill r14=[r16],16
ld8.fill r2=[r17]
(pUStk) mov r17=1
;;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+ // mmi_ : ld8 st1 shr;; mmi_ : st8 st1 shr;;
+ // mib : mov add br -> mib : ld8 add br
+ // bbb_ : br nop cover;; mbb_ : mov br cover;;
+ //
+ // no one require bsp in r16 if (pKStk) branch is selected.
+(pUStk) st8 [r3]=r22 // save time at leave
+(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack
+ shr.u r18=r19,16 // get byte size of existing "dirty" partition
+ ;;
+ ld8.fill r3=[r16] // deferred
+ LOAD_PHYS_STACK_REG_SIZE(r17)
+(pKStk) br.cond.dpnt skip_rbs_switch
+ mov r16=ar.bsp // get existing backing store pointer
+#else
ld8.fill r3=[r16]
(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack
shr.u r18=r19,16 // get byte size of existing "dirty" partition
@@ -954,6 +1018,7 @@
mov r16=ar.bsp // get existing backing store pointer
LOAD_PHYS_STACK_REG_SIZE(r17)
(pKStk) br.cond.dpnt skip_rbs_switch
+#endif
/*
* Restore user backing store.
@@ -995,28 +1060,6 @@
shladd in0=loc1,3,r17
mov in1=0
;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-account_sys_leave:
- // The size of current frame
- // (in * 2 + loc * N (N = 8 or 12) + out * 2)
- // is enough to work, so just take care to keep in0,in1
- mov loc0=ar.itc
- mov loc1=IA64_KR(CURRENT) // M2 (12 cycle read latency)
- ;;
- add loc2=TI_AC_STAMP+IA64_TASK_SIZE,loc1
- add loc3=TI_AC_STIME+IA64_TASK_SIZE,loc1
- ;;
- ld8 loc4=[loc2] // get last stamp
- ld8 loc5=[loc3] // cumulated stime
- ;;
- sub loc4=loc0,loc4 // elapsed time
- ;;
- add loc5=loc5,loc4 // sum
- ;;
- st8 [loc2]=loc0 // update stamp
- st8 [loc3]=loc5 // update stime
- ;;
-#endif
TEXT_ALIGN(32)
rse_clear_invalid:
#ifdef CONFIG_ITANIUM
Index: linux-2.6.24-rc5/arch/ia64/kernel/fsys.S
===================================================================
--- linux-2.6.24-rc5.orig/arch/ia64/kernel/fsys.S
+++ linux-2.6.24-rc5/arch/ia64/kernel/fsys.S
@@ -689,17 +689,23 @@
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
// mov.m r30=ar.itc is called in advance
add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2
- add r17=TI_AC_UTIME+IA64_TASK_SIZE,r2
+ add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2
;;
- ld8 r18=[r16] // get last stamp
- ld8 r19=[r17] // cumulated utime
+ ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel
+ ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at leave kernel
;;
- sub r18=r30,r18 // elapsed time
+ ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime
+ ld8 r21=[r17] // cumulated utime
+ sub r22=r19,r18 // stime before leave kernel
;;
- add r19=r19,r18 // sum
+ st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // update stamp
+ sub r18=r30,r19 // elapsed time in user mode
;;
- st8 [r16]=r30 // update stamp
- st8 [r17]=r19 // update utime
+ add r20=r20,r22 // sum stime
+ add r21=r21,r18 // sum utime
+ ;;
+ st8 [r16]=r20 // update stime
+ st8 [r17]=r21 // update utime
;;
#endif
mov ar.rsc=0x3 // M2 set eager mode, pl 0,
LE, loadrs=0
Index: linux-2.6.24-rc5/arch/ia64/kernel/ivt.S
===================================================================
--- linux-2.6.24-rc5.orig/arch/ia64/kernel/ivt.S
+++ linux-2.6.24-rc5/arch/ia64/kernel/ivt.S
@@ -841,18 +841,24 @@
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
// mov.m r30=ar.itc is called in advance, and r13 is current
add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 // A
- add r17=TI_AC_UTIME+IA64_TASK_SIZE,r13 // A
+ add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 // A
(pKStk) br.cond.spnt .skip_accounting // B unlikely skip
;;
- ld8 r18=[r16] // M get last stamp
- ld8 r19=[r17] // M cumulated utime
+ ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // M get last stamp
+ ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // M time at leave
;;
- sub r18=r30,r18 // A elapsed time
+ ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // M cumulated stime
+ ld8 r21=[r17] // M cumulated utime
+ sub r22=r19,r18 // A stime before leave
;;
- add r19=r19,r18 // A sum
+ st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // M update stamp
+ sub r18=r30,r19 // A elapsed time in user
;;
- st8 [r16]=r30 // M update stamp
- st8 [r17]=r19 // M update utime
+ add r20=r20,r22 // A sum stime
+ add r21=r21,r18 // A sum utime
+ ;;
+ st8 [r16]=r20 // M update stime
+ st8 [r17]=r21 // M update utime
;;
.skip_accounting:
#endif
@@ -1131,18 +1137,24 @@
ENTRY(account_sys_enter)
// mov.m r20=ar.itc is called in advance, and r13 is current
add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13
- add r17=TI_AC_UTIME+IA64_TASK_SIZE,r13
+ add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13
+ ;;
+ ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel
+ ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at left from kernel
+ ;;
+ ld8 r23=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime
+ ld8 r21=[r17] // cumulated utime
+ sub r22=r19,r18 // stime before leave kernel
;;
- ld8 r18=[r16] // get last stamp
- ld8 r19=[r17] // cumulated utime
+ st8 [r16]=r20,TI_AC_STIME-TI_AC_STAMP // update stamp
+ sub r18=r20,r19 // elapsed time in user mode
;;
- sub r18=r20,r18 // elapsed time
+ add r23=r23,r22 // sum stime
+ add r21=r21,r18 // sum utime
;;
- add r19=r19,r18 // sum
+ st8 [r16]=r23 // update stime
+ st8 [r17]=r21 // update utime
;;
- st8 [r16]=r20 // update stamp
- st8 [r17]=r19 // update utime
- ;;
br.ret.sptk.many rp
END(account_sys_enter)
#endif
Index: linux-2.6.24-rc5/include/asm-ia64/thread_info.h
===================================================================
--- linux-2.6.24-rc5.orig/include/asm-ia64/thread_info.h
+++ linux-2.6.24-rc5/include/asm-ia64/thread_info.h
@@ -33,6 +33,7 @@
struct restart_block restart_block;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
__u64 ac_stamp;
+ __u64 ac_leave;
__u64 ac_stime;
__u64 ac_utime;
#endif
-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html