Hi all,
We're having difficulty getting a share tree scheduler policy working on one of
our systems running OGS/GE 2011.11p1. Viewed in qmon, the share tree is
populated with the auto-created users, but the "combined usage" figure remains
at 0. qstat shows that no stckts are being allocated, and the whole
thing is operating simply as a FIFO queue (apart from those jobs for which I've
tweaked the POSIX priority). I suspect I've missed a setting somewhere or done
something else daft, but can't for the life of me see what!
Usage information is being correctly recorded in the accounting file, and some
usage information does appear to be making it into the user records. For
example:
# Version: 2011.11p1
#
# DO NOT MODIFY THIS FILE MANUALLY!
#
name user_x
oticket 0
fshare 0
delete_time 0
usage NONE
usage_time_stamp 1407934640
long_term_usage NONE
project PROJECT_S
cpu=14370153.453620,mem=14878026.134226,io=48.299435,iow=0.000000,vmem=543616509738.411987,maxvmem=117843309842.954834,submission_time=85836816136.214783,priority=0.000000,exit_status=0.000000,signal=0.000000,start_time=85852164094.266663,end_time=85852732907.583344,ru_wallclock=568813.316622,ru_utime=564971.933055,ru_stime=239.647192,ru_maxrss=42305077.391669,ru_ixrss=0.000000,ru_ismrss=0.000000,ru_idrss=0.000000,ru_isrss=0.000000,ru_minflt=25676250.593291,ru_majflt=0.000000,ru_nswap=0.000000,ru_inblock=335.896186,ru_oublock=90855.213429,ru_msgsnd=0.000000,ru_msgrcv=0.000000,ru_nsignals=0.000000,ru_nvcsw=56133.645660,ru_nivcsw=1301071.501176,acct_cpu=565211.580247,acct_mem=561536.308697,acct_io=10.064424,acct_iow=0.000000,acct_maxvmem=78396867223.625519,finished_jobs=1.000000
cpu=14376021.568465,mem=14884098.951880,io=48.318488,iow=0.000000,vmem=543824986112.000000,maxvmem=117887852544.000000,submission_time=85868051231.000000,priority=0.000000,exit_status=0.000000,signal=0.000000,start_time=85883404771.000000,end_time=85883973789.000000,ru_wallclock=569018.000000,ru_utime=565175.266266,ru_stime=239.733526,ru_maxrss=42320472.000000,ru_ixrss=0.000000,ru_ismrss=0.000000,ru_idrss=0.000000,ru_isrss=0.000000,ru_minflt=25685617.000000,ru_majflt=0.000000,ru_nswap=0.000000,ru_inblock=336.000000,ru_oublock=90888.000000,ru_msgsnd=0.000000,ru_msgrcv=0.000000,ru_nsignals=0.000000,ru_nvcsw=56154.000000,ru_nivcsw=1301544.000000,acct_cpu=565414.999792,acct_mem=561735.223195,acct_io=10.068075,acct_iow=0.000000,acct_maxvmem=78425395200.000000,finished_jobs=61.000000;
default_project NONE
debited_job_usage 1696
cpu=20.730000,mem=10.670671,io=0.002276,iow=0.000000,vmem=733593600.000000,maxvmem=733593600.000000;debited_job_usage
1695
cpu=13263.580000,mem=13730.393731,io=0.173561,iow=0.000000,vmem=1199620096.000000,maxvmem=1285763072.000000;
Our qmaster and execd spool directories are local to the nodes on which they're
running. Some configuration details:
[root@headnode ~]# qconf -sconf
#global:
enforce_project true
enforce_user auto
reporting_params accounting=true reporting=false \
flush_time=00:00:15 joblog=false sharelog=00:00:00
auto_user_oticket 0
auto_user_fshare 0
auto_user_default_project none
auto_user_delete_time 0
[root@headnode ~]# qconf -ssconf
algorithm default
schedule_interval 0:0:15
maxujobs 0
queue_sort_method load
job_load_adjustments np_load_avg=0.50
load_adjustment_decay_time 0:7:30
load_formula np_load_avg
schedd_job_info true
flush_submit_sec 0
flush_finish_sec 0
params none
reprioritize_interval 0:0:0
halftime 336
usage_weight_list cpu=1.000000,mem=0.000000,io=0.000000
compensation_factor 5.000000
weight_user 0.250000
weight_project 0.250000
weight_department 0.250000
weight_job 0.250000
weight_tickets_functional 0
weight_tickets_share 100000
share_override_tickets TRUE
share_functional_shares TRUE
max_functional_jobs_to_schedule 200
report_pjob_tickets TRUE
max_pending_tasks_per_job 50
halflife_decay_list none
policy_hierarchy OSF
weight_ticket 10.000000
weight_waiting_time 0.500000
weight_deadline 0.000000
weight_urgency 0.000000
weight_priority 1.000000
max_reservation 0
default_duration INFINITY
[root@headnode ~]# qconf -sstree
id=0
name=Root
type=0
shares=1
childnodes=1
id=1
name=default
type=0
shares=100000
childnodes=NONE
I'd be grateful for any pointers or suggestions you might be able to offer.
Regards,
Gordon
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users