> Begin forwarded message:
>
> From: "Gary E. Gorbet" <[email protected]>
> Subject: cluster status function in Airavata
> Date: October 7, 2015 at 5:30:22 PM CDT
> To: SciGaP Dev <[email protected]>
>
> The two text files attached illustrate a Gateway function that is currently
> performed by direct ssh commands. I would like to see that functionality
> moved to Airavata for two main reasons:
>
> (1) Any refinements for changes on clusters or for new clusters would be
> centralized; and
>
> (2) Gateway servers on multiple hosts would reference common APIs.
>
> The first attached file - cluster_status.php-local_status.txt - is the code
> portion from a local_status() function within the UltraScan gateway script
> cluster_status.php. This code portion shows the ssh commands issued for each
> of a list of clusters. The parsed command output reveals the count of running
> jobs and queued jobs for each cluster. That information is used on the
> gateway submit page to hint at likely wait-in-queue-status time. My proposal
> is that a Thrift client API would return this information, using code on the
> Thrift server similar to that in the sample PHP script.
>
> The second attached file shows a bash shell script executed remotely on the
> Jureca cluster at the PRACE Juelich center in Germany. This special script
> was found to be necessary because none of the normal queue status commands
> (sinfo, qstat, ...) were found to return reliable information. The script
> shown basically just issues squeue commands and counts the lines returned as
> a way of counting PENDING and RUNNING jobs in the “batch” queue on Jureca.
>
> The attached image file shows how the job count information is used at the
> submit stage of the UltraScan gateway.
>
> - Gary
>
>
>
>
>
>
...
// Get local cluster status
//
// Queries each known cluster over ssh with that cluster's own queue-status
// command, parses the output into a queued count, a running count and an
// "up"/"down" style status, and appends one entry per cluster to the global
// $data array:
//
//    array( 'cluster' => name, 'queued' => n, 'running' => n, 'status' => s )
//
// 'alamo' and 'jacinto' are appended a second time under "<name>-local".
// A cluster whose status is "down" always reports 0 queued / 0 running.
// Takes no parameters and returns nothing; the result is the side effect
// on $data.
function local_status()
{
   global $self;   // not referenced in this function; kept as in the original
   global $data;

   $clusters = array( "alamo", "lonestar", "stampede", "comet", "gordon",
                      "jureca", "jacinto" );

   foreach ( $clusters as $clname )
   {
      // Start every cluster from a safe default so that nothing leaks over
      // from the previous iteration if a case assigns nothing.
      $que = "0";
      $run = "0";
      $sta = "down";

      switch( $clname )
      {
         case 'alamo':
         {
            $host  = "[email protected]";
            $qstat = `ssh $host '/usr/bin/qstat -B 2>&1|tail -1'`;
            list( $que, $run, $sta ) = local_status_parse_qstat( $qstat, 10 );
            break;
         }

         case 'jacinto':
         {
            $host  = "[email protected]";
            $qstat = `ssh $host '/opt/torque/bin/qstat -B 2>&1|tail -1'`;
            // Status is field 9 here, not field 10 as on alamo/gordon
            list( $que, $run, $sta ) = local_status_parse_qstat( $qstat, 9 );
            break;
         }

         case 'stampede':
         {
            $host  = "[email protected]";
            $qstat = `ssh $host '/usr/local/bin/showq 2>&1|tail -1'`;
            list( $que, $run, $sta ) = local_status_parse_showq( $qstat );
            break;
         }

         case 'lonestar':
         {
            $host  = "[email protected]";
            $qstat = `ssh $host 'showq 2>&1|tail -1'`;
            list( $que, $run, $sta ) = local_status_parse_showq( $qstat );
            break;
         }

         case 'comet':
         {
            $host  = "[email protected]";
            // The whole remote command must stay on ONE line: a line break
            // inside the quotes makes the remote shell run "-1" as a
            // separate command instead of passing it to tail.
            $qstat = `ssh $host '/usr/bin/sinfo -s -p compute -o "%a %F" |tail -1'`;
            list( $que, $run, $sta ) = local_status_parse_sinfo( $qstat );
            break;
         }

         case 'gordon':
         {
            $host  = "[email protected]";
            $qstat = `ssh $host '/opt/torque/bin/qstat -B 2>&1|tail -1'`;
            list( $que, $run, $sta ) = local_status_parse_qstat( $qstat, 10 );
            break;
         }

         case 'jureca':
         {
            $host  = "[email protected]";
            $qstat = `ssh $host '~swus1/scripts/qstat-jureca 2>&1'`;
            list( $que, $run, $sta ) = local_status_parse_jureca( $qstat );
            break;
         }
      }

      // A cluster that is down never advertises job counts
      if ( $sta == "down" )
      {
         $que = "0";
         $run = "0";
      }

      $a              = array();
      $a[ 'cluster' ] = $clname;
      $a[ 'queued'  ] = $que;
      $a[ 'running' ] = $run;
      $a[ 'status'  ] = $sta;
      $data[]         = $a;

      // These two clusters are also listed under a "-local" alias
      if ( $clname == 'alamo' || $clname == 'jacinto' )
      {
         $a[ 'cluster' ] = $clname . "-local";
         $data[]         = $a;
      }
   }
}

// Split raw command output into whitespace-separated fields.
// Backticks yield a non-string (NULL) when the command fails or prints
// nothing; that case becomes an empty field list instead of raising
// warnings. The output is deliberately NOT trimmed, because the field
// positions used by the parsers below are the ones the original code
// applied to the untrimmed output.
function local_status_fields( $output )
{
   if ( !is_string( $output ) || $output == '' )
      return array();

   $fields = preg_split( '/\s+/', $output );

   return ( $fields === false ) ? array() : $fields;
}

// Return element $idx of $fields, or $default when it does not exist.
function local_status_field( $fields, $idx, $default = '' )
{
   return isset( $fields[ $idx ] ) ? $fields[ $idx ] : $default;
}

// Parse the last line of "qstat -B": field 3 is taken as the queued count,
// field 4 as the running count, and field $staIdx as the server state
// ("Active" means up, anything else - including no output - means down).
// Returns array( queued, running, status ).
function local_status_parse_qstat( $output, $staIdx )
{
   $fields = local_status_fields( $output );
   $que    = local_status_field( $fields, 3, '0' );
   $run    = local_status_field( $fields, 4, '0' );
   $state  = local_status_field( $fields, $staIdx );
   $sta    = ( $state == "Active" ) ? "up" : "down";

   return array( $que, $run, $sta );
}

// Parse the last line of "showq": field 2 is the total, field 5 the running
// count and field 8 the queued count. An empty or zero total is treated as
// "down". Returns array( queued, running, status ).
function local_status_parse_showq( $output )
{
   $fields = local_status_fields( $output );
   $tot    = local_status_field( $fields, 2 );

   if ( $tot == '' || $tot == '0' )
      return array( '0', '0', "down" );

   $run    = local_status_field( $fields, 5, '0' );
   $que    = local_status_field( $fields, 8, '0' );

   return array( $que, $run, "up" );
}

// Parse the last line of 'sinfo -s -o "%a %F"': field 0 is passed through
// as the status (empty means down); field 1 is split on "/" and its first
// two numbers become running and queued.
// NOTE(review): per the Slurm documentation %F is allocated/idle/other/total
// NODE counts, so these look like node counts rather than job counts -
// confirm this is what the submit page expects.
// Returns array( queued, running, status ).
function local_status_parse_sinfo( $output )
{
   $fields = local_status_fields( $output );
   $sta    = local_status_field( $fields, 0 );
   $counts = explode( '/', local_status_field( $fields, 1 ) );
   $run    = local_status_field( $counts, 0, '0' );
   $que    = local_status_field( $counts, 1, '0' );

   if ( $sta == "" )
      $sta = "down";

   return array( $que, $run, $sta );
}

// Parse the single line printed by the remote qstat-jureca script:
// "<status> <running> <queued> <total>". Anything other than an explicit
// "up" (no output, an error message, ...) is reported as "down", so an
// unreachable host no longer yields an empty status with missing counts.
// Returns array( queued, running, status ).
function local_status_parse_jureca( $output )
{
   $fields = local_status_fields( $output );
   $sta    = local_status_field( $fields, 0 );
   $run    = local_status_field( $fields, 1, '0' );
   $que    = local_status_field( $fields, 2, '0' );

   if ( $sta != "up" )
      $sta = "down";

   return array( $que, $run, $sta );
}
...
#!/bin/bash
# qstat-jureca - count queued/running jobs on Jureca
#
# Prints exactly one line:  "<status> <running> <queued> <total>"
# where <status> is "up" when at least one job is running or pending in the
# "batch" partition and "down" otherwise.
#
# squeue is run with -h (--noheader) so that wc -l counts job lines only.
# Counting the header and subtracting 1 afterwards reported -1 whenever
# squeue failed and printed nothing. squeue's own error messages are
# discarded so they cannot end up in the single output line, which the
# caller parses by field position.
JRUN=$(( $(squeue -h -t RUNNING -p batch 2>/dev/null | wc -l) ))
JQUE=$(( $(squeue -h -t PENDING -p batch 2>/dev/null | wc -l) ))
JOBS=$(( JQUE + JRUN ))
QSTA="up"
if [ "$JOBS" -lt 1 ]; then
  QSTA="down"
fi
echo "$QSTA $JRUN $JQUE $JOBS"
>
>
>