
/*
   rawread 2.4.2

   The rawread program is a microbenchmark used to measure the performance
   and scalability of :

      sequential and random for :
      read() and write(), readv() and writev(), pread() and pwrite() for :

         raw io                     version 1.0.1
         direct io on SCSI devices  version 1.0.1

   rawread reports :

      parameters used during the run
      throughput in KB/s
      CPU utilization (from /proc/stat)

   NOTE : need to pickup def for O_DIRECT so build with -D_GNU_SOURCE

   gcc -D_GNU_SOURCE -O2 -o rawread rawread.c -laio -lpthread

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   Much of the process sync and shared mem code came from dbench.
   Timestamp code came from Shailabh Nagar's test program.

   Copyright (C) IBM 2002
   Written by Bill Hartner, IBM Linux Technology Center (18 Sep 2002)

   05 Dec 2002 - added pread(), pwrite(), write(), writev() tests.
                 added random io tests.
                 fixed CPU utilization report for 2.5 kernels.
   01 April 2003  -slp added aio and batch aio  tests 12-19
   11 April 2003  -slp Sync threads to all stop at the same time 1.1.0
   14 April 2003 - slp added keep alive loop to catch dead threads, and switched
				 	to pthreads instead of fork/exec.
   22 April 2003  -mcao added aio minwait tests 20-23, reorganize the loop in
			work() function.
   24 April 2003  - 2.0.0 added shared offsets with locking.
   28 April 2003  - 2.0.1 removed extra lock on non-batch mode
					 	  fixed filehandle for aio overhead test
   29 April 2003  - 2.0.2 move locking for batch mode outside the loop to only take lock once.
   29 April 2003  - 2.0.3 fix offset cal in non-batch case.
   30 April 2003  - 2.1.0 changes pthread mutex to lock xadd for offset increment
   1 May 2003     - 2.1.1 Fix loop to front of region and offset at end of region
   1 May 2003     - 2.1.1 use lseek64 and O_LARGEFILE
   14 May 2003 	  - 2.1.2 Fix offset calculation for niovecs for readv case 
   15 May 2003 	  - 2.1.3 Fix RAN_AIOWRITE test to set aiowrite flag. fix invalid test msg. 
   15 May 2003 	  - 2.1.4 Fix AIOWRITE overhead test to access correct dev_info. 
   21 May 2003    - 2.1.5 Fix test enumeration array.
   7  July 2003   - 2.1.6 Add O_SYNC option for file based IO. "-c"
					 	  Add support for ide hard drives "-h"
	17 July 2003  - 2.1.7 add fsync at end of worker thread to handle buffered fs case.
   19 August 2003 - 2.1.8 make readv skip every other mem location
   20 August 2003 - 2.1.9 Added range parm to set range for random tests instead of using numreads
   5 Sept 2003 	  - 2.1.10 Fix read error checking to detect read past end of file.
   25 Sept 2003   - 2.1.11 Fix read error checking in AIO case. Was causing AIO to always fail in 2.1.10
   9 Jan 2004     - 2.2 Use cross platform atomics
   23 Jan 2004    - 2.2.1 Fix initial range calculation for 4GB overflow.
   July 27 2004   - 2.2.2 Fix _offset for random overflow, add hack for O_DIRECT to make sure it compiles 
   Aug 6 2004     - 2.3.0 fixed race on device name to open.
   Nov 1 2004     - 2.4.0 fixed offset use in sequential read
   Nov 9 2004     - 2.4.1 remove useless microsecs, replace with IOs/sec
   Aug 30 2005    - 2.4.2 Fix ppc64 compile link issues.
*/

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <getopt.h>
#include <signal.h>
#include <sys/shm.h>
#include <sys/uio.h>
#include <libaio.h>
#include <inttypes.h>
#include <pthread.h>
#include <unistd.h>
#ifdef __i386__
#define __KERNEL__	
#endif
#include <asm/atomic.h>
#ifdef __i386__
#undef __KERNEL__
#endif
//hack for some ppc boxes where this just does not work right.
#ifndef O_DIRECT
#define O_DIRECT     040000 /* direct disk access hint */
#endif
/* need to pickup def for O_DIRECT so build with -D_GNU_SOURCE */

/* Test identifiers accepted by the -t option; TESTID_MIN..TESTID_MAX is the valid range. */
#define TESTID_MIN	(0)
#define SEQ_READ	(0)
#define SEQ_WRITE	(1)
#define SEQ_READV	(2)
#define SEQ_WRITEV	(3)
#define SEQ_PREAD	(4)
#define SEQ_PWRITE	(5)
#define RAN_READ	(6)
#define RAN_WRITE	(7)
#define RAN_READV	(8)
#define RAN_WRITEV	(9)
#define RAN_PREAD	(10)
#define RAN_PWRITE	(11)
#define SEQ_AIOREAD	(12)
#define SEQ_AIOWRITE	(13)
#define RAN_AIOREAD	(14)
#define RAN_AIOWRITE	(15)
#define SEQ_BAIOREAD	(16)
#define SEQ_BAIOWRITE	(17)
#define RAN_BAIOREAD	(18)
#define RAN_BAIOWRITE	(19)
#define SEQ_MWBAIOREAD	(20)
#define SEQ_MWBAIOWRITE	(21)
#define RAN_MWBAIOREAD	(22)
#define RAN_MWBAIOWRITE	(23)
#define TESTID_MAX	(23)

/* Human-readable test names for the report, indexed by test id. */
char *testname[TESTID_MAX+1] = {
	[SEQ_READ]        = "sequential read()",
	[SEQ_WRITE]       = "sequential write()",
	[SEQ_READV]       = "sequential readv()",
	[SEQ_WRITEV]      = "sequential writev()",
	[SEQ_PREAD]       = "sequential pread()",
	[SEQ_PWRITE]      = "sequential pwrite()",
	[RAN_READ]        = "random read()",
	[RAN_WRITE]       = "random write()",
	[RAN_READV]       = "random readv()",
	[RAN_WRITEV]      = "random writev()",
	[RAN_PREAD]       = "random pread()",
	[RAN_PWRITE]      = "random pwrite()",
	[SEQ_AIOREAD]     = "sequential aioread()",
	[SEQ_AIOWRITE]    = "sequential aiowrite()",
	[RAN_AIOREAD]     = "random aioread()",
	[RAN_AIOWRITE]    = "random aiowrite()",
	[SEQ_BAIOREAD]    = "batch sequential aioread()",
	[SEQ_BAIOWRITE]   = "batch sequential aiowrite()",
	[RAN_BAIOREAD]    = "batch random aioread()",
	[RAN_BAIOWRITE]   = "batch random aiowrite()",
	[SEQ_MWBAIOREAD]  = "minimum batch sequential aioread()",
	[SEQ_MWBAIOWRITE] = "minimum batch sequential aiowrite()",
	[RAN_MWBAIOREAD]  = "minimum batch random aioread()",
	[RAN_MWBAIOWRITE] = "minimum batch random aiowrite()"
};

/* Compile-time limits and defaults. */
#define MAXNREQ    512			// maximum number of concurrent requests (sizes the per-worker iocb/event pools and offsets[])
#define MAXPROC    512			// maximum number of concurrent processes
#define MAXVECTOR  512			// maximum number of blocks in a vector
#define MAXBLOCKS  819200000                // maximum number of blocks in a file
#define BLOCKSIZE  4096			// default blocksize
#define ROOT_FNAME "t"			// default root file name
#define MINFNUM    0			// default minimum file id

struct _shared {
	int status;
	unsigned long numreads[256];
	unsigned long long micro;
	char busy;
	char pad[14];
};
typedef volatile struct _shared shared_t;

/*
 * State shared by main() and every worker: main() holds the mutex while the
 * workers start up so that they all begin I/O together, and workers bump
 * count (under the mutex) when they have completed their quota of I/Os.
 */
struct _shared2 {
	int count;			/* number of workers that have finished their quota */
    pthread_mutex_t lock;	/* start gate and protection for count */
};
typedef volatile struct _shared2 shared2_t;

/*
 * Per-request bookkeeping for the AIO tests. The iocb's data pointer is set
 * to the owning myinfo so a completion event can be mapped back to its slot.
 * Note that disk_info is a typedef for an array of MAXNREQ of these.
 */
typedef struct myinfo{
	long long offset;	/* byte offset used for the most recently prepared request */
	int fd;			/* file descriptor opened for this slot */
	int disk_index;		/* index of this slot; also indexes numreads[] and offsets[] */
/* might add numreq here */
	struct iocb *iocb;	/* control block used to submit this slot's requests */
}disk_info[MAXNREQ];

/* prototypes */

/* Parse and validate the command line; returns non-zero on error or after printing usage. */
int parseparm (int argc, char *argv[]);
/* Thread start routine that runs the selected I/O test; the argument carries the worker index. */
void * worker(void *);
/* Allocate the shared result area; callers treat a NULL return as failure. */
void *shm_setup(int size);
//static void sigcont(int sig);
/* Allocate the I/O buffer; callers treat a NULL return as failure. */
char *alloc_read_buffer (int size);
/* Fill device_name with the path to open for index i (definition not shown here — see implementation). */
void get_device_name (int i, char * device_name);

#ifdef fubar
/*
 * Atomically add val to *mem with a locked x86 "xaddl" and return whatever
 * the instruction leaves in the register operand (presumably the previous
 * contents of *mem, as the name suggests — confirm against the ISA manual).
 * Only compiled when `fubar` is defined, i.e. effectively disabled.
 */
static inline uint32_t __attribute__ ((unused))
exchange_and_add (volatile uint32_t *mem, uint32_t val)
{
  register uint32_t result;
  __asm__ __volatile__ ("lock; xaddl %0,%1"
			: "=r" (result), "=m" (*mem)
			: "0" (val), "1" (*mem));
  return result;
}

#endif

#ifdef __i386__
/*
 * i386-only replacement for atomic_add_return(): adds val to foo->counter
 * with a locked "xaddl" and returns the register result of that instruction.
 *
 * NOTE(review): xaddl is an exchange-and-add, so the value returned here
 * looks like the counter's value *before* the add, whereas a function named
 * atomic_add_return conventionally returns the value *after* it. Confirm
 * whether callers (the sequential offset calculation) rely on either
 * behaviour, and whether other architectures' <asm/atomic.h> agree.
 */
static inline uint32_t __attribute__ ((unused))
atomic_add_return(int val, atomic_t  *foo)
{
  volatile int *mem=&(foo->counter);
  register uint32_t result;
  __asm__ __volatile__ ("lock; xaddl %0,%1"
			: "=r" (result), "=m" (*mem)
			: "0" (val), "1" (*mem));
  return result;
}

#endif

/* global variables */

char *rawread_version = "2.4.2";
int testid	= -1;		/* test id - see usage(); -1 = not given on the command line */
int frandom	= 0;		/* 1 = random io */
int ffread	= 0;		/* 1 = read() test */
int ffwrite	= 0;		/* 1 = write() test */
int freadv	= 0;		/* 1 = readv() test */
int fwritev	= 0;		/* 1 = writev() test */
int fpread	= 0;		/* 1 = pread() test */
int aiobatch= 1;		/* number of aio requests submitted at once */
int aioread	= 0;		/* 1 = aioread() test */
int aiowrite= 0;		/* 1 = aiowrite() test */
int fpwrite	= 0;		/* 1 = pwrite() test */
int rsize	= 65536*4;	/* read buffer size or iovec len */
int range = 1024;		/* range for random operations in MB (-r); main() converts it to a block count */
long long noffset	= 0;		/* starting byte offset for I/O (-o is given in MB) */
int nreads	= 1024;		/* inner loop cntr */
int nrepeat	= 1;		/* outer loop cntr (seeks to offset 0 1st) */
int nprocs	= 1;		/* number of workers (-p); started as threads */
int numdevices     = -1;		/* device modulo (-m): worker id % numdevices selects the device */
int ndevice	= 1;		/* starting /dev/raw/raw# or /dev/sd# */
int fExtra	= 0;		/* print for spreadsheet */
int fraw	= 1;		/* flag to use /dev/raw/raw# or /dev/sd# */
int fdirect	= 0;		/* flag to use O_DIRECT */
int fscsi	= 0;		/* flag to use /dev/sd.. instead of raw */
int fide	= 0;		/* flag to use /dev/hd.. instead of raw */
int ffile	= 0;		/* flag to turn on/off filesystem I/O test */
int f_sync	= 0;		/* flag to turn on/off Sync flag in filesystem I/O test */
int niovec	= 0;		/* number of iovecs for readv */
int flags	= O_RDWR | O_LARGEFILE;	/* open flags */
int minwait	= 0;		/* flag to turn on/off min getevent for aio */
void * (*fn)(void *)	= worker;	/* worker function - read, readv, direct */
shared_t *pshared;		/* shared memory for reporting results (one slot per worker) */
shared2_t *shared_lock;		/* start-gate mutex plus count of finished workers */
char rawname[]  = "/dev/raw/raw1\0\0\0\0";
char devname[]  = "/dev/sda\0\0\0\0";
char fsname[]	= "/mnt/mnt1\0\0\0\0\0\0\0\0";
char *foo_fname = "foo";
//static volatile int offsets[MAXNREQ];
atomic_t offsets[MAXNREQ];	/* per-device sequential block counter shared by all workers on that device */
#define DEBUG 0			/* non-zero enables the debug printf()s in worker() */

#ifdef fubar
/*
 * Return a 64-bit value produced by the raw opcode bytes 0x0f 0x31
 * (presumably the x86 RDTSC time-stamp counter, per the function name).
 * Only compiled when `fubar` is defined, i.e. effectively disabled.
 */
__inline__ unsigned long long int rdtsc()
{
  unsigned long long int x;
  __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
  return x;
}
#endif

/*
 * Print the command-line help: a banner carrying the program version,
 * followed by one line per option and one line per test id.
 */
void usage(void)
{
	/* Fixed help text, emitted verbatim after the version banner. */
	static const char *const help_lines[] = {
		"-p N, N = number of processes (def=1).\n",
		"-d N, N = starting device e.g. 2 = raw2 or sdb (def=/dev/raw/raw1 or /dev/sda).\n",
		"-i N, N = if > 0, readv used and specifies number of iovecs (def=0).\n",
		"-s N, N = size of raw read buffer or each readv iovec length (def=256KB).\n",
		"-n N, N = num times to read or readv ..inner loop.. (def=1024).\n",
		"-l N, N = num times lseek to -o parm and repeat -n parm ..outer loop.. (def=1).\n",
		"-o N, N = lseek to offset N prior to starting inner loop (def=0).\n",
		"-r N, N = size in MB of range to use (def=1024).\n",
		"-x  ,     open /dev/sd.. instead of /dev/raw/raw.. (def=/dev/raw/raw..).\n",
		"-h  ,     open /dev/hd.. instead of /dev/raw/raw.. (def=/dev/raw/raw..).\n",
		"-c  ,     Use O_SYNC flag on open\n",
		"-f  ,     open /mnt/mnt../foo instead of /dev/raw/raw.. \n",
		"-z  ,     add O_DIRECT flag to open, requires -x (def=O_RDONLY).\n",
		"-m N, N = device modulo, allow processes to use same device. (def= -p parm).\n",
		"-t N, N = test id (def = 0, sequential read() test)\n",
		"      0  = sequential read()\n",
		"      1  = sequential write()\n",
		"      2  = sequential readv()\n",
		"      3  = sequential writev()\n",
		"      4  = sequential pread()\n",
		"      5  = sequential pwrite()\n",
		"      6  = random read()\n",
		"      7  = random write()\n",
		"      8  = random readv()\n",
		"      9  = random writev()\n",
		"      10 = random pread()\n",
		"      11 = random pwrite()\n",
		"      12 = sequential aioread()\n",
		"      13 = sequential aiowrite()\n",
		"      14 = random aioread()\n",
		"      15 = random aiowrite()\n",
		"      16 = batch sequential aioread()\n",
		"      17 = batch sequential aiowrite()\n",
		"      18 = batch random aioread()\n",
		"      19 = batch random aiowrite()\n",
		"      20 = minwait batch sequential aioread()\n",
		"      21 = minwait batch sequential aiowrite()\n",
		"      22 = minwait batch random aioread()\n",
		"      23 = minwait batch random aiowrite()\n"
	};
	size_t line;

	printf ("Usage: rawread %s [options].\n", rawread_version);
	for (line = 0; line < sizeof help_lines / sizeof help_lines[0]; line++)
		fputs(help_lines[line], stdout);
}

/*
 * rawread entry point.
 *
 * Sequence of events:
 *   1. parse the command line and convert the -r range from MB to blocks;
 *   2. allocate the shared result slots and the shared start gate;
 *   3. take the gate mutex, start nprocs worker threads and wait (up to
 *      30 seconds) for each of them to publish a non-zero status;
 *   4. sample /proc/stat, release the gate so all workers start together;
 *   5. poll once a second until every worker has reported completion,
 *      checking periodically that every worker is still making progress;
 *   6. sample /proc/stat again and print the parameter, CPU and
 *      per-device throughput report.
 *
 * Returns 0 on success and 1 on any setup, /proc/stat or worker failure.
 */
int main (int argc, char *argv[])
{
	int rc, i, j, synccount;
	FILE *bstat;	/* /proc/stat */
	char str[8];	/* first token of /proc/stat; reads are width-limited to fit */
	unsigned long buser, bnice, bsys, bidle, bidleio;	/* counters before the run */
	unsigned long euser, enice, esys, eidle, eidleio;	/* counters after the run */
	int num;
	float ftotal;
	float puser, psys, pidle;
	unsigned long long total_micro = 0;
	unsigned long total_count = 0;
	unsigned long total_numreads = 0;
	unsigned long long total_bytes_read = 0;
	unsigned long long total_io_rate = 0;
	char done=0;
	unsigned long check = 0;

	if (parseparm (argc, argv))	/* get the parms and validate */
		return(1);

	range = (long long)range * (long long)(1024*1024) / (long long)(rsize * ((niovec==0) ? 1 : niovec)); // set the range once in blocks
	range -= 1; // 0 based index required.

	pshared = (shared_t *)shm_setup(sizeof(shared_t)*nprocs);
	if (!pshared) {
		perror("failed to setup shared memory\n");
		return (1);
	}
	shared_lock = (shared2_t *)shm_setup(sizeof(shared2_t));
	if (!shared_lock) {	/* check the pointer that was just allocated, not pshared again */
		perror("failed to setup shared memory\n");
		return (1);
	}
	pthread_mutex_init((pthread_mutex_t*)&shared_lock->lock, NULL);
	shared_lock->count=0;

	memset((void *)pshared, 0, sizeof(struct _shared)*nprocs);

	bstat = fopen("/proc/stat","r");
	if (!bstat) {
		perror("fopen failed on /proc/stat");
		return(1);
	}

	/* Hold the gate so that workers block until the first CPU sample is taken. */
	pthread_mutex_lock((pthread_mutex_t*)&shared_lock->lock);

	for (i=0 ; i < nprocs ; i++) {
		pthread_t thread;
		/* The worker index travels in the pointer argument itself. */
		rc = pthread_create(&thread, NULL, fn, (void *)(intptr_t)i);
		if (rc) {
			printf("Error creating thread rc = %d\n",rc);
			_exit(1);	/* failure must not be reported as success */
		}
	}

	/* wait 30 seconds for all of the procs to start */
	for (i = 0 ; i < 30 ; i ++) {
		synccount = 0;
		for (j = 0 ; j < nprocs ; j++) {
			if (pshared[j].status) synccount++;
		}
		if (synccount == nprocs) break;
		sleep(1);
	};

	if (synccount != nprocs) {
		printf("failed to start all procs %d %d\n", nprocs, synccount);
		return (1);
	}

	/* %7s keeps the token inside str[8]; older kernels report only 4 counters. */
	num = fscanf(bstat,"%7s %lu %lu %lu %lu %lu",&str[0],&buser,&bnice,&bsys,&bidle,&bidleio);
	if ((num != 5 && num != 6) || str[0] != 'c' || str[1] != 'p' || str[2] != 'u' ) {
		printf ("format error on /proc/stat\n");
		return(1);
	} else if (num == 5) {
		bidleio = 0;
	}

	/* Open the gate: every worker is released at the same time. */
	pthread_mutex_unlock((pthread_mutex_t*)&shared_lock->lock);
	rc = fseek(bstat, 0, SEEK_SET);
	if (rc == -1) {
		perror("fseek on bstat failed");
		return(1);
	}

	while (!done) {
		sleep(1);
		pthread_mutex_lock((pthread_mutex_t*)&shared_lock->lock);
		if (shared_lock->count >= nprocs) {	  // are we done?
			done=1;
			pthread_mutex_unlock((pthread_mutex_t*)&shared_lock->lock);
			break;
		}
		pthread_mutex_unlock((pthread_mutex_t*)&shared_lock->lock);
		check++;
		if (!(check % 300)) { // keepalive check every 300 one-second polls
			for (i = 0 ; i < nprocs; i++) {
				if (pshared[i].busy != 1) {	  // someone didn't make progress
					printf("Doh! someone made no progress (thread %d) Time to quit\n",i);
					return 1;
				} else {
					pshared[i].busy = 0;
				}
			}
		}
	}
	num = fscanf(bstat,"%7s %lu %lu %lu %lu %lu",&str[0],&euser,&enice,&esys,&eidle,&eidleio);
	if ((num != 5 && num != 6) || str[0] != 'c' || str[1] != 'p' || str[2] != 'u' ) {
		printf ("format error on /proc/stat\n");
		return(1);
	} else if (num == 5) {
		eidleio = 0;
	}

	for (i = 0 ; i < nprocs ; i++) {	/* make sure all threads have fully updated stats */
		if (pshared[i].micro == 1) {
			printf("A thread set erro code in micro, Aborting\n");
			return 1;
		}
		if (pshared[i].micro == 0) {
			/* not published yet: wait and look at the same slot again */
			sleep(1);
			i--;
		}
	}

	ftotal = euser+enice+esys+eidle+eidleio-buser-bnice-bsys-bidle-bidleio;
	if (ftotal) {
		puser = (float)(euser+enice-buser-bnice);
		puser *= 100.0;
		puser /= ftotal;

		psys = (float)(esys-bsys);
		psys *= 100.0;
		psys /= ftotal;

		pidle = (float)(eidle+eidleio-bidle-bidleio);
		pidle *= 100.0;
		pidle /= ftotal;
	} else {
		puser = 0.0;
		psys  = 0.0;
		pidle = 0.0;
	}
	printf ("              rawread %s\n", rawread_version);
	printf ("-t %-10u %s test\n", testid, testname[testid]);
	printf ("-p %-10u number of processes\n", nprocs);
	printf ("-m %-10u device modulo\n", numdevices);
	printf ("-d %-10u starting device number\n", ndevice);
	printf ("-s %-10u read buffer or readv size in bytes\n", rsize);
	printf ("-n %-10u inner loop count\n", nreads);
	printf ("-l %-10u outer loop count\n", nrepeat);
	printf ("-r %-10lld range in MB\n", (long long)((long long)range * (long long)rsize)/(long long)(1024*1024));
	printf ("-o %-10lld is seek offset\n", noffset/(1024*1024));
	if (niovec)
		printf ("-i %-10u iovec count\n", niovec);
	if (flags & O_DIRECT)
		printf ("-z            O_DIRECT flag used\n");
	if (fscsi)
		printf ("-x            using /dev/sd...\n");
	if (fide)
		printf ("-h            using /dev/hd...\n");
	if (ffile)
		printf ("-f            using /mnt/mnt...\n");
	if (f_sync)
		printf ("-c            O_SYNC flag used \n");
	printf ("\n   %-10u length of DASD read (kbytes)\n", (rsize*nreads*((niovec == 0) ? 1 : niovec))/1024);
	printf ("   %-10u minimum kbytes read by each process\n\n", (rsize*nreads*((niovec == 0) ? 1 : niovec)*nrepeat)/1024);
	printf ("   %-10.2f %%user\n", puser);
	printf ("   %-10.2f %%sys\n", psys);
	printf ("   %-10.2f %%idle\n", pidle);

	printf ("\n");
	printf ("procs           device  %10s  %7s  %14s\n", (niovec ? "num readv" : "num read"), "KB/sec", "I/O Ops/sec");
	printf ("-----  ---------------  ----------  -------  --------------\n");

	/* One report line per device that at least one worker touched. */
	for (i = 0 ; i < numdevices ; i++) {
		unsigned long numreads;
		unsigned long long micro;
		unsigned long long bytes_read;
		unsigned long long io_rate;
		unsigned long count;

		numreads = 0;
		count = 0;
		micro = 0;
		bytes_read = 0;
		for (j = 0 ; j < nprocs ; j++ ) {	/* get the results */
			if (pshared[j].numreads[i]) {
				count++;
				numreads += pshared[j].numreads[i];
				micro += pshared[j].micro;
			}
		}
		if (count) {
			char device_name[30];
			total_count += count;
			micro /= count;		/* average elapsed time of the workers on this device */
			total_micro += micro;
			total_numreads += numreads;
			io_rate = ((long long)numreads * 1000000) / micro;
			total_io_rate += io_rate;
			/* KB/s = ops * bytes-per-op * 1e6 / (elapsed usec * 1024) */
			bytes_read = numreads;
			bytes_read *= rsize;
			bytes_read *= ((niovec == 0) ? 1 : niovec);
			bytes_read *= 1000000;
			bytes_read /= (micro*1024);
			total_bytes_read += bytes_read;
			get_device_name(i, device_name);
			printf ("%5lu  %15s  %10lu  %7llu  %14llu \n",
				count,
				device_name,
				numreads,
				bytes_read,
				io_rate);
		}
	}
	printf ("-----  ---------------  ----------  -------  --------------\n");
	printf ("%5lu  %15s  %10lu  %7llu  %14llu \n",
		total_count,
		"",
		total_numreads,
		total_bytes_read,
		total_io_rate);
	printf("\nTotal run time %f seconds\n",(float)((float)total_micro/(float)1000000)/total_count);

	if (fExtra) {
		/* Single-line summary for spreadsheets; each value follows its option letter. */
		printf ("Rawread_Version_%s %llu %0.2f %llu t %u p %d s %u o %llu i %u n %u l %u x %u z %u d %u m %u\n",
		rawread_version,	// rawread version
		total_bytes_read,	// throughput in KB/s
		puser+psys,		// cpu utilization
		total_micro/numdevices,	// average time per device
		testid,			// test id
		nprocs,			// -p N
		rsize,			// -s N
		noffset,		// -o N
		niovec,			// -i N
		nreads,			// -n N
		nrepeat,		// -l N
		fscsi,			// -x (/dev/sd...)
		fdirect,		// -z (O_DIRECT)
		ndevice,		// -d N
		numdevices);		// -m N
	}
	return(0);
}

/*
 * Worker thread: runs the selected I/O test and publishes its results in
 * pshared[id].
 *
 * idp carries the worker index (0 .. nprocs-1) encoded in the pointer value.
 *
 * The worker opens its device(s), announces itself through
 * pshared[id].status, waits on the shared mutex until main() releases all
 * workers, and then issues I/O in a loop. It keeps issuing I/O until *every*
 * worker has completed its quota (shared_lock->count reaches nprocs), so
 * that all workers stop at the same time.
 *
 * On success the elapsed time in microseconds is stored in
 * pshared[id].micro. On any failure pshared[id].micro is set to 1, which
 * main() treats as an error code. The return value is always NULL.
 */
void * worker(void * idp)
{
	int id=(int)(intptr_t)idp;
	int rc, i, bytes_read=0;
	char *p;
	char device_name[40];
	int _nreads;
	int total_reads;
	long long _offset;
	int this_aiobatch = aiobatch;
	int x;
	struct timeval  tv1;
	struct timezone tz1;
	struct timeval  tv2;
	struct timeval  tvr;
	io_context_t    ctx = 0;		// io complete context; stays 0 unless an AIO test sets it up
	struct timespec    ts;
	struct iocb     iocb[MAXNREQ];		// iocb pool
	struct iocb*    iocbp[MAXNREQ];		// pointer pool for iocbs
	struct io_event events[MAXNREQ];	// event pool
	struct myinfo dev_info[MAXNREQ];
	int fd=0;
	int async_reads = 0;
	int pall_done = 0;			// I/O finish flag
	int minevent;

	pshared[id].status = getpid();
	/*
	 * if niovec was specified then rsize is the size of each iovec.
	 * alloc 2x required size for readv so we can skip every other chunk
	 */
	p = alloc_read_buffer(rsize * ((niovec==0) ? 1 : (niovec * 2)) );
	if (!p) {
		pshared[id].micro = 1;
		return NULL;
	}

	if (aioread || aiowrite) {
		/* One descriptor per outstanding request. */
		for (i = 0; i < aiobatch; i++) {
			dev_info[i].offset = 0;
			if (aiobatch <= 1) {
				get_device_name(id,device_name);
			} else {
				get_device_name(i,device_name);
			}
			fd = open(device_name,flags);
			if (fd == -1) {
				printf("%s\n", device_name);
				perror("open failed on device");
				pshared[id].micro = 1;
				return NULL;
			}
			dev_info[i].fd = fd;
			dev_info[i].disk_index = i;
			dev_info[i].iocb = &iocb[i];
		}

		rc = io_queue_init(aiobatch, &ctx);
		if (rc != 0) {
			printf("io_queue_init: aiobatch = %d, ctx %p, res=%d [%s] on id=%d\n", aiobatch, &ctx, rc, strerror(-rc),id);
			pshared[id].micro = 1;
			return NULL;
		}
		// zero out memory
		memset(events, 0, sizeof(events));
		for (i = 0; i < MAXNREQ; ++i)
			iocbp[i] = iocb+i;

		ts.tv_sec = 30;
		ts.tv_nsec = 0;
	} else {
		get_device_name(id, device_name);
		fd = open(device_name,flags);
		if (fd == -1) {
			printf("%s\n", device_name);
			perror("open failed on device");
			pshared[id].micro = 1;
			return NULL;
		}
	}

	total_reads = nreads * nrepeat * aiobatch;
	_nreads = 0;

	/* minwait tests return as soon as one event completes; others wait for the whole batch. */
	if (minwait)
		minevent = 1;
	else
		minevent = aiobatch;

	// used to wait for the master to unlock and let us go.
	pthread_mutex_lock((pthread_mutex_t*)&shared_lock->lock);
	pthread_mutex_unlock((pthread_mutex_t*)&shared_lock->lock);

	rc = gettimeofday (&tv1, &tz1);
	if (rc) {
		perror ("gettimeofday failed on tv1 ");
		pshared[id].micro = 1;
		return NULL;
	}

	if (frandom){
		srand(time(NULL));
		x = rand();
		_offset = (long long)((long long)(x % range) * (long long)rsize * ((niovec==0) ? 1 : niovec)) + noffset;
	} else {
		_offset = noffset;
	}

	/* read()/write()/readv()/writev() use the file position, so seek first. */
	if (!fpread && !fpwrite && !aioread && !aiowrite) {
		rc = lseek64(fd, _offset, SEEK_SET);
		if (rc == -1) {
			printf("lseek() failed rc=%d offset=%lld\n",rc,_offset);
			pshared[id].micro = 1;
			return NULL;
		}
	}

	/* Prepare the first batch of AIO requests. */
	if (aioread || aiowrite ){
		for (i = 0; i <aiobatch ;i++) {
			if (aiowrite)
				io_prep_pwrite(dev_info[i].iocb,
					dev_info[i].fd, p,rsize, _offset);
			else {
				io_prep_pread(dev_info[i].iocb, dev_info[i].fd,
					p,(size_t)rsize, _offset);
			}
			/* lets a completion event be mapped back to its slot */
			dev_info[i].iocb->data = &dev_info[i];
			iocbp[i] = dev_info[i].iocb;
		}
	}

	/* Keep doing I/O until every worker has finished its quota. */
	while (shared_lock->count < nprocs) {
		struct iovec iovec[UIO_MAXIOV];
		rc = 0;
		if (freadv || fwritev) {
			char *pp = p;
			for (i = 0 ; i < niovec ; i++) {
				iovec[i].iov_len  = rsize;
				iovec[i].iov_base = pp;
				pp = pp + (rsize * 2); //*2 is to make buffer locations non-contiguous
									   // buffer must have been allocated 2x required size
			}
			if (freadv) {
				bytes_read = readv(fd, iovec, niovec);
			} else {
				bytes_read = writev(fd, iovec, niovec);
			}
		} else if (ffread) {
			bytes_read = read(fd, p, rsize);
		} else if (ffwrite) {
			bytes_read = write(fd, p, rsize);
		} else if (fpread) {
			bytes_read = pread(fd, p, rsize, _offset);
		} else if (fpwrite) {
			bytes_read = pwrite(fd,p,rsize,_offset);
		} else if (aioread || aiowrite) {
			/* submit i/o */
			rc =io_submit(ctx, this_aiobatch, iocbp);
			if (rc != this_aiobatch){
				printf("AIO submit failed rc= %d id=%d\n",rc,id);
			}

			/* async wait for I/O complete */
			if (DEBUG)
				printf("minevent = %d, maxevent = %d, for %d\n",
						minevent,this_aiobatch,id);
			rc = io_getevents(ctx, minevent, aiobatch, events, &ts);

			if ((rc <= 0) || ((!minwait) && (rc != this_aiobatch))) {
				printf("AIO getevents failed rc= %d id=%d\n",rc,id);
				pshared[id].micro = 1;
				return NULL;
			}

			if (minwait)
				this_aiobatch = rc;
			rc = 0;

			/*
			 * Count only the bytes of this batch. Accumulating across
			 * iterations would overflow the int on long runs and would
			 * hide zero-length completions after the first pass.
			 */
			bytes_read = 0;

			/* validate */
			for (i = 0; i < this_aiobatch; i++) {
				struct iocb *ev_iocb;
				struct myinfo *info;
				int idx;
				/* get the fd who just finished an aio */
				ev_iocb = (struct iocb *)events[i].obj;
				info = (struct myinfo *)ev_iocb->data;
				idx = info->disk_index;

				if ((long)events[i].res < 0) {
					printf("ERROR rc=%ld on AIO operation,aborting. id=%d\n",
							events[i].res,id);
					pshared[id].micro = 1;
					return NULL;
				}
				bytes_read += (long)events[i].res;
				if (aiobatch > 1) {
					/* update the number of reads */
					pshared[id].numreads[idx]++;
					if (pshared[id].numreads[idx] == nreads * nrepeat)
						async_reads++;
					if (async_reads == aiobatch) {
						/* every slot of the batch has reached its quota */
						pall_done = 1;
						async_reads = 0; //reset
						if (DEBUG)
							printf("async_reads=%d, aiobatch=%d,pshared[%d].numreads[0] = %ld, numreads[1] = %ld\n", async_reads, aiobatch,id,pshared[id].numreads[0],pshared[id].numreads[1]);
					}
					_nreads++;
				}
			}

		} else {
			printf("Invalid test flag\n");
			pshared[id].micro = 1;	/* report the failure so main() does not wait for us */
			return NULL;
		}

		if (bytes_read <= 0) {
			printf("read/write failed return=%d errno=%d offset %lld \n",bytes_read,errno,_offset);
			pshared[id].micro = 1;
			return NULL;
		}

		pshared[id].busy=1;	/* keepalive for main() */
		/*update the number of reads */

		if (aiobatch <= 1) {
			_nreads = ++pshared[id].numreads[id % numdevices];
			if (DEBUG)
				printf("pshared[%d].numreads[%d] =%ld\n",
					id,id%numdevices, pshared[id].numreads[id% numdevices]);
			if (_nreads == total_reads)
				pall_done = 1;
		}

		if (pall_done) {
			if (DEBUG)
				printf("total number of reads %d have been done, for %d\n", _nreads, id);
			/* Let the master know we are done! */
			pthread_mutex_lock((pthread_mutex_t*)&shared_lock->lock);
			shared_lock->count++;
			pthread_mutex_unlock((pthread_mutex_t*)&shared_lock->lock);
			/* clear this flag to avoid this again next time */
			pall_done = 0;
		}

		/* Pick the offset of the next non-batch operation. */
		if (aiobatch <= 1 ) {
			if (frandom) {
				x = rand();
				_offset = (long long)((long long)(x % range) * (long long)rsize * ((niovec==0) ? 1 : niovec)) + noffset;
			} else if ((_nreads % range) == 0){
				/* wrapped around the end of the region: back to the front */
				_offset = noffset;
			}
			else {
				int tmpvar;
				/* block counter shared by all workers on the same device */
				tmpvar = atomic_add_return(1,&offsets[id % numdevices]);
				_offset = (long long)(tmpvar % range) * (long long)((niovec==0) ? 1 : niovec) * (long long)rsize + (long long)noffset;
			}
		}

		if (fpread || fpwrite)
			continue;	/* positional calls take _offset directly */
		if(!aioread && !aiowrite) {
			rc = lseek64(fd, _offset, SEEK_SET);
			if (rc == -1) {
				printf("lseek() failed rc=%d offset=%lld\n",rc,_offset);
				pshared[id].micro = 1;
				return NULL;
			}
			continue;
		}

		/* Single-request AIO: re-arm slot 0 with the new offset. */
		if ((aiobatch <= 1) && (aioread || aiowrite) ) {
			if (aiowrite){
				io_prep_pwrite(dev_info[0].iocb,
						dev_info[0].fd,
						p,(size_t)rsize, _offset);
			} else {
				io_prep_pread(dev_info[0].iocb,
						dev_info[0].fd,
						p,(size_t)rsize,
						_offset);
			}
			dev_info[0].iocb->data = &dev_info[0];
			iocbp[0] = dev_info[0].iocb;
			continue;
		}

		/* Batch AIO: re-arm exactly the slots whose requests just completed. */
		for (i = 0; i < this_aiobatch; i++) {
			struct iocb *ev_iocb;
			int nr;
			struct myinfo *info;
			int idx;
			/* get the fd who just finished an aio */
			ev_iocb = (struct iocb *)events[i].obj;
			info = (struct myinfo *)ev_iocb->data;
			idx = info->disk_index;
			nr = pshared[id].numreads[idx];
			if (DEBUG)
				printf("Update offset for aio, this_aiobatch = %d, fdes = %d, dev_info[%d].fd = %d,for %d\n",
					this_aiobatch, idx, idx, dev_info[idx].fd,id);
			/* Update the offset */
			if (frandom) {
				x = rand();
				_offset = (long long)((long long)(x % range) * (long long)rsize * ((niovec==0) ? 1 : niovec)) + noffset;
			} else if ((nr % range) == 0) {
				_offset = noffset;
			} else {
				int tmpvar;
				tmpvar = atomic_add_return(1,&offsets[idx]);
				_offset = (long long)(tmpvar % range) * (long long)rsize + (long long)noffset;
			}
			dev_info[idx].offset = _offset;

			if (DEBUG)
				printf(" preapre for aio, iocb = %p, "
					"fd =%d, for %d\n",
					dev_info[idx].iocb,
					dev_info[idx].fd,
					id);

			/*prepare for next aio submit*/
			if (aiowrite)
				io_prep_pwrite(dev_info[idx].iocb,
						dev_info[idx].fd, p,
						rsize, _offset);
			else {
				io_prep_pread(dev_info[idx].iocb,
						dev_info[idx].fd, p,
						rsize, _offset);
			}
			iocbp[i] = dev_info[idx].iocb;
			dev_info[idx].iocb->data = &dev_info[idx];

			/* clean up events */
			events[i].obj = 0;
		}
	}
	/* ctx is non-zero only when an AIO queue was actually created. */
	if (ctx) {
		io_queue_release(ctx);
	}
	/* Flush so buffered-filesystem runs include the write-back time. */
	if (aioread || aiowrite) {
		for (i = 0; i < aiobatch; i++) {
			fsync(dev_info[i].fd);
		}
	} else {
		fsync(fd);
	}
	rc = gettimeofday (&tv2, &tz1);
	if (rc) {
		perror ("gettimeofday failed on tv2 ");
		pshared[id].micro = 1;
		return NULL;
	}
	timersub(&tv2, &tv1, &tvr); /* tvr now contains result of tv2-tv1 */
	pshared[id].micro = ((unsigned long long)tvr.tv_sec * 1000000) + ((unsigned long long)tvr.tv_usec);
	return NULL;
}

/*
 * parseparm - parse the command line into the file-level test parameters.
 *
 * Options handled (values are converted with atoi()):
 *   -t n  test id                      -o n  starting offset, in MB
 *   -r n  range                        -s n  record size (multiple of 512)
 *   -n n  number of reads              -l n  repeat count
 *   -p n  number of procs              -m n  number of devices
 *   -d n  starting device number       -i n  number of iovecs
 *   -z    add O_DIRECT to open flags   -c    add O_SYNC to open flags
 *   -x    select the scsi device path  -f    select the filesystem path
 *   -h    select the ide device path (patches devname[5] to 'h')
 *   -e    set fExtra
 *
 * After validation, the test id is translated into the per-operation flags
 * (ffread, fpwrite, aioread, frandom, minwait, ...).  For the batch aio
 * tests nprocs is divided by numdevices and aiobatch is set to numdevices.
 *
 * Returns 0 on success.  Returns 1 after printing usage or a diagnostic
 * when an option is unknown or a value is out of range.
 */
int parseparm (int argc, char *argv[])
{
	int c;

	/* 'h' has to be listed here, otherwise the case 'h' below is dead code */
	while ((c = getopt(argc, argv, "t:o:m:ezxfchi:l:p:d:s:n:r:?")) != -1) {
		switch (c) {
			case 't': testid = atoi(optarg); break;
			case 'o': noffset = (long long)(1024 * 1024) * ((long long)(atoi(optarg))); break;
			case 'r': range = atoi(optarg); break;
			case 'z': flags |= O_DIRECT; fdirect = 1; fide=0; break;
			case 'x': fraw = 0; ffile = 0; fscsi = 1; fide=0; break;
			case 'f': ffile = 1; fraw = 0; fscsi = 0; break;
			case 'c': f_sync = 1; flags |= O_SYNC; break;
			case 'i': niovec = atoi(optarg); break;
			case 'm': numdevices = atoi(optarg); break;
			case 's': rsize = atoi(optarg); break;
			case 'n': nreads = atoi(optarg); break;
			case 'l': nrepeat = atoi(optarg); break;
			case 'd': ndevice = atoi(optarg); break;
			case 'p': nprocs = atoi(optarg); break;
			case 'e': fExtra = 1; break;
			case 'h': devname[5]='h'; fraw = 0; ffile = 0; fscsi = 0; fide=1; break;
			case '?':
			default:  usage(); return(1);
		}
	}

	/* user buffer must be 512 byte aligned */
	if (rsize & (512-1)) {
		printf("-s %d parm is not sector size aligned\n", rsize);
		return(1);
	}

	if (niovec && testid == -1)
		testid = 2;	/* default to sequential readv() test */

	if (!niovec && testid == -1)
		testid = 0;	/* default to sequential read() test */

	/* tests 2, 3, 8 and 9 are the only ones that accept iovecs */
	if ((testid == 2 || testid == 3 || testid == 8 || testid == 9) && !niovec)
		niovec = 8;	/* default to 8 iovecs */

	if ((testid != 2 && testid != 3 && testid != 8 && testid != 9) && niovec) {
		printf("-i (number of iovecs) can not be used with pread() or rwrite() test\n");
		return(1);
	}

	if (testid < TESTID_MIN || testid > TESTID_MAX) {
		printf("Invalid test id -t %d\n", testid);
		return(1);
	}

	/* noffset cannot be negative */
	if (noffset < 0) {
		printf("-o %lld parm cannot be negative\n", noffset);
		return(1);
	}

	/* nreads must be positive */
	if (nreads < 1) {
		printf("-n %d parm must be > 0\n", nreads);
		return(1);
	}

	/* niovec must be >= 0 <= UIO_MAXIOV */
	if (niovec < 0 || niovec > UIO_MAXIOV) {
		printf("-i %d parm must be >= 0 and <= %d\n", niovec, UIO_MAXIOV);
		return(1);
	}

	/* nrepeat must be positive */
	if (nrepeat < 1) {
		printf("-l %d parm must be > 0\n", nrepeat);
		return(1);
	}

	/* nprocs must be positive */
	if (nprocs < 1) {
		printf("-p %d parm must be > 0\n", nprocs);
		return(1);
	}

	/*
	 * numdevices defaults to nprocs when -m was not given (-1), must be
	 * positive otherwise, and is clamped so it never exceeds nprocs.
	 */
	if (numdevices == -1) {
		numdevices = nprocs;
	} else {
		if (numdevices < 1) {
			printf("-m %d parm must be > 0\n", numdevices);
			return(1);
		} else if (numdevices > nprocs) {
			numdevices = nprocs;
		}
	}

	/* check if default starting raw device overridden */
	if (ndevice != 1) {
		if (ndevice < 1 || ndevice > 256) {
			printf("-d %d parm must be > 0 and < 257\n", ndevice);
			return(1);
		}
	}

	/* translate the test id into the individual operation flags */
	switch (testid) {
		case SEQ_READ:
			ffread = 1;
			break;
		case SEQ_WRITE:
			ffwrite = 1;
			break;
		case SEQ_READV:
			freadv = 1;
			break;
		case SEQ_WRITEV:
			fwritev = 1;
			break;
		case SEQ_PREAD:
			fpread = 1;
			break;
		case SEQ_PWRITE:
			fpwrite = 1;
			break;
		case SEQ_AIOREAD:
			aioread = 1;
			break;
		case SEQ_AIOWRITE:
			aiowrite = 1;
			break;
		case RAN_AIOREAD:
			aioread = 1;
			frandom = 1;
			break;
		case RAN_AIOWRITE:
			aiowrite = 1;
			frandom = 1;
			break;
		/* batch aio: nprocs is divided by numdevices, batch size is numdevices */
		case SEQ_BAIOREAD:
			nprocs = nprocs/numdevices;
			aioread = 1;
			aiobatch=numdevices;
			break;
		case SEQ_BAIOWRITE:
			nprocs = nprocs/numdevices;
			aiowrite = 1;
			aiobatch=numdevices;
			break;
		case RAN_BAIOREAD:
			nprocs = nprocs/numdevices;
			aioread = 1;
			frandom = 1;
			aiobatch=numdevices;
			break;
		case RAN_BAIOWRITE:
			nprocs = nprocs/numdevices;
			aiowrite = 1;
			frandom = 1;
			aiobatch=numdevices;
			break;
		/* batch aio with the minwait flag set */
		case SEQ_MWBAIOREAD:
			nprocs = nprocs/numdevices;
			aioread = 1;
			aiobatch=numdevices;
			minwait = 1;
			break;
		case SEQ_MWBAIOWRITE:
			nprocs = nprocs/numdevices;
			aiowrite = 1;
			aiobatch=numdevices;
			minwait = 1;
			break;
		case RAN_MWBAIOREAD:
			nprocs = nprocs/numdevices;
			aioread = 1;
			frandom = 1;
			aiobatch=numdevices;
			minwait = 1;
			break;
		case RAN_MWBAIOWRITE:
			nprocs = nprocs/numdevices;
			aiowrite =1;
			frandom = 1;
			aiobatch=numdevices;
			minwait = 1;
			break;
		case RAN_READ:
			ffread = 1;
			frandom = 1;
			break;
		case RAN_WRITE:
			ffwrite = 1;
			frandom = 1;
			break;
		case RAN_READV:
			freadv = 1;
			frandom = 1;
			break;
		case RAN_WRITEV:
			fwritev = 1;
			frandom = 1;
			break;
		case RAN_PREAD:
			fpread = 1;
			frandom = 1;
			break;
		case RAN_PWRITE:
			fpwrite = 1;
			frandom = 1;
			break;
		default:
			printf("Invalid testid\n");
			return(1);
	}
	return(0);
}

//static void sigcont(int sig)
//{
//}

/*
 * shm_setup - create and attach a private System V shared memory segment.
 *
 * size: number of bytes requested from shmget().
 *
 * Returns the attached address, or NULL (after reporting the error with
 * perror()) when the segment cannot be created or attached.
 */
void *shm_setup(int size)
{
	int shmid;
	void *ret;

	shmid = shmget(IPC_PRIVATE, size, SHM_R | SHM_W);
	if (shmid == -1) {
		/* perror() appends ": <error text>\n" itself, so no newline here */
		perror("shmget failed");
		return(NULL);
	}
	ret = (void *)shmat(shmid, 0, 0);
	if (!ret || ret == (void *)-1) {
		/* report first so shmctl() cannot disturb errno */
		perror("shmat failed");
		/* nothing is attached, so remove the segment instead of leaking it */
		shmctl(shmid, IPC_RMID, 0);
		return(NULL);
	}
	/* the following releases the ipc, but note that this process
	   and all its children will still have access to the memory, its
	   just that the shmid is no longer valid for other shm calls. This
	   means we don't leave behind lots of shm segments after we exit

	   See Stevens "advanced programming in unix env" for details
	   */
	shmctl(shmid, IPC_RMID, 0);

	return ret;
}

/*
 * alloc_read_buffer - allocate a 4096-byte aligned I/O buffer of `size` bytes.
 *
 * The block is over-allocated by 4096 bytes and the returned pointer is
 * rounded up to the next 4096-byte boundary, so the result is NOT the
 * pointer malloc() returned and must not be passed to free().
 *
 * One byte in every 512-byte sector of the buffer is written before
 * returning so the pages are touched ahead of the timed test.
 *
 * Returns the aligned buffer, or NULL (after perror()) if malloc() fails.
 */
char *alloc_read_buffer (int size)
{
	char *p;
	char *pp;
	int i;

	/* page align the record buffer */
	if ((p = (char*)malloc(size+4096)) == NULL) {
		perror("malloc of read buffer");
		return(NULL);
	}

	/* user read buffer is page aligned */
	p += (4096-1);
	p = (char*) ((unsigned long)p & ~(4096-1));

	/* touch each page, do not want page faults for test */
	/* actually touching each "sector" */
	/* the cursor must live outside the loop, otherwise only p[0] is written */
	pp = p;
	for (i = 0; i < (size >> 9) ; i++) {
		*pp = 1;
		pp += 512;
	}
	return(p);
}

/*
 * get_device_name - write the path of the i-th test target into device_name.
 *
 * The unit number is ndevice + (i % numdevices).  Which template is used
 * depends on the global mode flags:
 *   fraw   - copy rawname, then print the unit number at offset 12
 *            (raw io on /dev/raw/raw...)
 *   ffile  - copy fsname, then print "<unit>/<foo_fname>" at offset 8
 *            (filesystem io on /mnt/mntn/foo...)
 *   else   - copy devname, then store a one- or two-letter suffix at
 *            offset 7 (O_DIRECT on /dev/sd...)
 *
 * NOTE(review): the fixed offsets 12, 8 and 7 presumably equal the lengths
 * of the rawname, fsname and devname prefixes - confirm against their
 * definitions.  device_name must be large enough for the result; no bound
 * is checked here.
 */
void get_device_name (int i, char * device_name)
{
	int unit = ndevice + (i % numdevices);
	int letter_idx;

	/* raw io target */
	if (fraw) {
		strcpy(device_name, rawname);
		sprintf(device_name + 12, "%d", unit);
		return;
	}

	/* filesystem io target */
	if (ffile) {
		strcpy(device_name, fsname);
		sprintf(device_name + 8, "%d/%s", unit, foo_fname);
		return;
	}

	/* block device target: zero-based unit becomes a letter suffix */
	strcpy(device_name, devname);
	letter_idx = unit - 1;

	if (letter_idx < 26) {
		/* single letter: a .. z */
		device_name[7] = 'a' + letter_idx;
		device_name[8] = 0;
		return;
	}

	/* two letters: aa, ab, ... */
	device_name[7] = 'a' + (letter_idx / 26) - 1;
	device_name[8] = 'a' + (letter_idx % 26);
	device_name[9] = 0;
}


