Hi all,
We're having difficulty getting a share tree scheduler policy working on one of
our systems running OGS/GE 2011.11p1. Viewed in qmon, the share tree is
populated with the auto-created users, but the "combined usage" figure remains
at 0. qstat shows that no stckts are being allocated, and the whole
thing is operating simply as a FIFO queue (apart from those jobs for which I've
tweaked the POSIX priority). I suspect I've missed a setting somewhere or done
something else daft, but can't for the life of me see what!
Usage information is being correctly recorded in the accounting file, and some
usage information does appear to be making it into the user records. For
example:
# Version: 2011.11p1
#
# DO NOT MODIFY THIS FILE MANUALLY!
#
name user_x
oticket 0
fshare 0
delete_time 0
usage NONE
usage_time_stamp 1407934640
long_term_usage NONE
project PROJECT_S
cpu=14370153.453620,mem=14878026.134226,io=48.299435,iow=0.000000,vmem=543616509738.411987,maxvmem=117843309842.954834,submission_time=85836816136.214783,priority=0.000000,exit_status=0.000000,signal=0.000000,start_time=85852164094.266663,end_time=85852732907.583344,ru_wallclock=568813.316622,ru_utime=564971.933055,ru_stime=239.647192,ru_maxrss=42305077.391669,ru_ixrss=0.000000,ru_ismrss=0.000000,ru_idrss=0.000000,ru_isrss=0.000000,ru_minflt=25676250.593291,ru_majflt=0.000000,ru_nswap=0.000000,ru_inblock=335.896186,ru_oublock=90855.213429,ru_msgsnd=0.000000,ru_msgrcv=0.000000,ru_nsignals=0.000000,ru_nvcsw=56133.645660,ru_nivcsw=1301071.501176,acct_cpu=565211.580247,acct_mem=561536.308697,acct_io=10.064424,acct_iow=0.000000,acct_maxvmem=78396867223.625519,finished_jobs=1.000000
cpu=14376021.568465,mem=14884098.951880,io=48.318488,iow=0.000000,vmem=543824986112.000000,maxvmem=117887852544.000000,submission_time=85868051231.000000,priority=0.000000,exit_status=0.000000,signal=0.000000,start_time=85883404771.000000,end_time=85883973789.000000,ru_wallclock=569018.000000,ru_utime=565175.266266,ru_stime=239.733526,ru_maxrss=42320472.000000,ru_ixrss=0.000000,ru_ismrss=0.000000,ru_idrss=0.000000,ru_isrss=0.000000,ru_minflt=25685617.000000,ru_majflt=0.000000,ru_nswap=0.000000,ru_inblock=336.000000,ru_oublock=90888.000000,ru_msgsnd=0.000000,ru_msgrcv=0.000000,ru_nsignals=0.000000,ru_nvcsw=56154.000000,ru_nivcsw=1301544.000000,acct_cpu=565414.999792,acct_mem=561735.223195,acct_io=10.068075,acct_iow=0.000000,acct_maxvmem=78425395200.000000,finished_jobs=61.000000;
default_project NONE
debited_job_usage 1696
cpu=20.730000,mem=10.670671,io=0.002276,iow=0.000000,vmem=733593600.000000,maxvmem=733593600.000000;debited_job_usage
1695
cpu=13263.580000,mem=13730.393731,io=0.173561,iow=0.000000,vmem=1199620096.000000,maxvmem=1285763072.000000;
Our qmaster and execd spool directories are local to the nodes on which they're
running. Some configuration details:
[root@headnode ~]# qconf -sconf
#global:
enforce_project true
enforce_user auto
reporting_params accounting=true reporting=false \
flush_time=00:00:15 joblog=false sharelog=00:00:00
auto_user_oticket 0
auto_user_fshare 0
auto_user_default_project none
auto_user_delete_time 0
[root@headnode ~]# qconf -ssconf
algorithm default
schedule_interval 0:0:15
maxujobs 0
queue_sort_method load
job_load_adjustments np_load_avg=0.50
load_adjustment_decay_time 0:7:30
load_formula np_load_avg
schedd_job_info true
flush_submit_sec 0
flush_finish_sec 0
params none
reprioritize_interval 0:0:0
halftime 336
usage_weight_list cpu=1.000000,mem=0.000000,io=0.000000
compensation_factor 5.000000
weight_user 0.250000
weight_project 0.250000
weight_department 0.250000
weight_job 0.250000
weight_tickets_functional 0
weight_tickets_share 100000
share_override_tickets TRUE
share_functional_shares TRUE
max_functional_jobs_to_schedule 200
report_pjob_tickets TRUE
max_pending_tasks_per_job 50
halflife_decay_list none
policy_hierarchy OSF
weight_ticket 10.000000
weight_waiting_time 0.500000
weight_deadline 0.000000
weight_urgency 0.000000
weight_priority 1.000000
max_reservation 0
default_duration INFINITY
[root@headnode ~]# qconf -sstree
id=0
name=Root
type=0
shares=1
childnodes=1
id=1
name=default
type=0
shares=100000
childnodes=NONE
I'd be grateful for any pointers or suggestions you might be able to offer.
Regards,
Gordon
_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users