Hi All,
I am using the gem5 simulator to collect statistics for a
micro-benchmark program. I am encountering a "functional read access
failed for address 0xXXXX" error.
I have attached the source file of the micro-benchmark program. The
simulation runs fine for cases "1" and "4" in the switch
construct. The error is encountered for cases "2" and "3", in which I
am using pthread locks.
I am using the MESI_Two_Level protocol with a 4-core configuration (private
L1I and L1D caches for each core and a shared L2 cache) in SE mode.
I have read the discussion on google group "
https://groups.google.com/u/1/g/gem5-gpu-dev/c/Wt43jSYYXag"
and can infer that the issue arises when there are multiple copies of a
block in the system (possibly in a transient state) and a load or store
is attempted on it.
Can you point out the way to fix this?
Are pthread locks not supported in gem5? Does the current release of gem5
provide a workaround for this?
Any help would be highly appreciated. Thanks in advance.
Regards,
Vipin
Research Scholar
IIT Kanpur
/**
* A micro benchmark program to trigger different FS behavior
* 1. Only FS
* 2. FS with TS (both occur in different cache blocks)
* 3. FS with TS (in same cache line)
* 4. NO FS
* 5. FS (on multiple cache line by different thread pair)
*
*/
#include <iostream>
#include <sys/time.h>
#include <cstdint>
#include <pthread.h>
#include <cstring>
#define NUM_THREADS 4
#define LOOP_COUNT 2 << 17
#define BLOCK_SIZE 64
using namespace std;
// Thread entry points; each implements one sharing pattern (see file header).
void *without_fs(void *);   // case 4: padded per-thread blocks, no sharing
void *only_fs(void *);      // case 1: false sharing on array_with_fs only
void *ts_fs_naive(void *);  // case 2: true + false sharing on different lines
void *ts_fs(void *);        // case 3: true + false sharing on the same line
double rtclock();           // wall-clock seconds via gettimeofday
// Remnants of a disabled sequential/verification path:
//void sequential_version();
//void check_result(uint16_t *, uint16_t*);
//uint16_t sequential_array[NUM_THREADS] = { 0 };
//const double threshold = 0.0000001;
// Shared state for the benchmark variants.  The padding arrays are meant to
// keep neighbouring globals on distinct 64-byte cache lines.
// NOTE(review): 8 x 4 = 32 bytes of padding is only half a 64-byte line; if
// the linker places these objects contiguously, adjacent arrays may still
// share a line.  Verify against the addresses printed at startup.
uint32_t arr_seq_acc[NUM_THREADS] = {0};      // sequential-mode accumulator
uint32_t arr_padding_1[8] = {0};              // spacer between arrays
uint32_t array_with_fs[NUM_THREADS] = {0};    // one word per thread -> false sharing
uint32_t arr_padding_2[8] = {0};              // spacer between arrays
uint32_t array_without_fs[NUM_THREADS * 16] = {0}; // 16 x 4-byte entries (one 64B block) per thread
uint32_t ts_var = 0;                          // true-sharing counter, touched by every thread
uint8_t padd_arr[60] = {0};                   // pads ts_var out to a full cache line
// True sharing (shared_var) and false sharing (shared_arr) on the same line.
struct FSTS
{
    uint32_t shared_var = 0;
    uint32_t shared_arr[NUM_THREADS] = {0};
};
FSTS fs_ts_struct;
// Fix: a statically-allocated pthread mutex must be initialised with
// PTHREAD_MUTEX_INITIALIZER (or pthread_mutex_init); relying on implicit
// zero-initialisation is not portable and is undefined per POSIX.
pthread_mutex_t lock_var = PTHREAD_MUTEX_INITIALIZER;
/**
 * Select and run one benchmark variant.
 *
 * argv[1] chooses the thread routine (1 = only_fs, 2 = ts_fs_naive,
 * 3 = ts_fs, 4 = without_fs); with no argument a single-threaded
 * sequential reference loop runs instead.
 *
 * Fixes over the original:
 *  - An unrecognised option used to fall out of the switch and then join
 *    four uninitialised pthread_t handles with an uninitialised clkbegin
 *    (undefined behaviour); it now reports the error and exits with 1.
 *  - clkbegin/clkend are zero-initialised.
 *
 * @return 0 on success, 1 on an invalid option.
 */
int main(int argc, char **argv)
{
    pthread_t threads[NUM_THREADS];
    // Print object addresses so cache-line placement can be cross-checked
    // against the simulator's reported faulting address.
    printf("Starting address of seq array : %p \n", (void *)&arr_seq_acc);
    printf("Starting address of array with FS : %p \n", (void *)array_with_fs);
    printf("Starting address of array without FS : %p \n", (void *)&array_without_fs);
    printf("Starting address of FS_TS Struct :%p \n", (void *)&fs_ts_struct);
    printf("Starting address of TS_SharedVar :%p \n", (void *)&ts_var);
    double clkbegin = 0.0, clkend = 0.0;
    if (argc == 2)
    {
        int option = atoi(argv[1]);
        void *(*worker)(void *) = NULL;  // thread routine picked below
        switch (option)
        {
        case 1:  // only_fs
            cout << "Only FS variable" << "\n";
            worker = &only_fs;
            break;
        case 2:  // ts_fs_naive: TS and FS on different cache lines
            cout << "FS TS for var located on diff cache line.\n";
            worker = &ts_fs_naive;
            break;
        case 3:  // ts_fs: TS and FS on the same cache line
            cout << "FS TS on same cache line. \n";
            worker = &ts_fs;
            break;
        case 4:  // without_fs: fully padded, no sharing
            cout << "Parallel version of code. \n";
            worker = &without_fs;
            break;
        default:
            // Fix: previously fell through and joined uninitialised handles.
            cout << "Invalid option (expected 1-4): " << option << "\n";
            return 1;
        }
        clkbegin = rtclock();
        for (int i = 0; i < NUM_THREADS; i++)
            pthread_create(&threads[i], NULL, worker, (void *)(intptr_t)i);
        // Wait for all workers before reading the clock.
        for (int i = 0; i < NUM_THREADS; i++)
            pthread_join(threads[i], NULL);
        clkend = rtclock();
        std::cout << "# of threads :" << NUM_THREADS << "\t Exec time:" <<
            clkend - clkbegin << " sec" << std::endl;
    }
    else
    {
        // Sequential reference: same per-element work, single thread.
        cout << "Sequential version";
        clkbegin = rtclock();
        for (int i = 0; i < NUM_THREADS; i++)
        {
            for (uint32_t j = 0; j < LOOP_COUNT; j++)
                arr_seq_acc[i] *= 1;
        }
        clkend = rtclock();
        std::cout << "SEQ VERSION: \t Exec time:" <<
            clkend - clkbegin << " sec" << std::endl;
    }
    return 0;
}
/**
* Program with padded array
* BEHAVIOR: NO FS NO TS (padded array)
*/
/**
 * Case 4 worker: no sharing at all.
 * Each thread repeatedly updates one word at the start of its own 64-byte
 * region of array_without_fs (16 uint32 entries per thread), so no two
 * threads ever touch the same cache line.
 *
 * Fixes over the original:
 *  - The index was currID * BLOCK_SIZE (= currID * 64 entries), which runs
 *    past the 64-entry array for every thread but 0; the intended stride is
 *    one cache BLOCK per thread, i.e. BLOCK_SIZE / sizeof(uint32_t) entries.
 *  - The function fell off the end without returning a value, which is
 *    undefined behaviour for a non-void function; it now returns nullptr.
 *
 * @param threadId thread index 0..NUM_THREADS-1 packed into the pointer.
 * @return nullptr
 */
void *without_fs(void *threadId)
{
    const int currID = (intptr_t)threadId;
    const size_t stride = BLOCK_SIZE / sizeof(uint32_t);  // 16 entries = 64 bytes
    for (uint32_t i = 0; i < LOOP_COUNT; i++)
    {
        array_without_fs[currID * stride] *= 1;
    }
    return nullptr;
}
/**
* The BM suffer fs where all thread accesses are non-overlapping access
* to same cache block
* FS: array accesses
* */
/**
 * Case 1 worker: pure false sharing.
 * All threads write non-overlapping adjacent words of array_with_fs, which
 * sit on the same cache block, so the line ping-pongs between cores.
 *
 * Fix: the function fell off the end without returning a value (undefined
 * behaviour for a non-void function); it now returns nullptr.
 *
 * @param threadId thread index 0..NUM_THREADS-1 packed into the pointer.
 * @return nullptr
 */
void *only_fs(void *threadId)
{
    const int currID = (intptr_t)threadId;
    for (uint32_t i = 0; i < LOOP_COUNT; i++)
    {
        array_with_fs[currID] *= 1;  // adjacent 4-byte slots, same cache line
    }
    return nullptr;
}
/**
* program experience fs and ts on different cache line.
* TS: shared counter update accesses
* FS: array accesses
* Similar to locked micro benchmark of HURON
*
*/
/**
 * Case 2 worker: true sharing and false sharing on different cache lines.
 * TS: every thread updates the shared counter ts_var (deliberately
 *     unsynchronised here — the lock calls are commented out, so this is a
 *     data race by design for the benchmark).
 * FS: each thread updates its own slot of array_with_fs (same line).
 * Similar to the locked micro-benchmark of HURON.
 *
 * Fix: the function fell off the end without returning a value (undefined
 * behaviour for a non-void function); it now returns nullptr.
 *
 * @param threadId thread index 0..NUM_THREADS-1 packed into the pointer.
 * @return nullptr
 */
void *ts_fs_naive(void *threadId)
{
    const int currID = (intptr_t)threadId;
    for (uint32_t i = 0; i < LOOP_COUNT; i++)
    {
        //pthread_mutex_lock(&lock_var);
        ts_var *= 1;
        //pthread_mutex_unlock(&lock_var);
        array_with_fs[currID] *= 1;
    }
    return nullptr;
}
/**
* TS and FS for var belonging to same line
* FS: Array accesses
* TS:
*/
/**
 * Case 3 worker: true sharing and false sharing on the SAME cache line.
 * TS: shared_var is updated under lock_var by every thread.
 * FS: shared_arr[currID] shares the line with shared_var (both live in
 *     fs_ts_struct), so the unlocked per-thread writes contend with it.
 *
 * Fix: the function fell off the end without returning a value (undefined
 * behaviour for a non-void function); it now returns nullptr.
 *
 * @param threadId thread index 0..NUM_THREADS-1 packed into the pointer.
 * @return nullptr
 */
void *ts_fs(void *threadId)
{
    const int currID = (intptr_t)threadId;
    for (uint32_t i = 0; i < LOOP_COUNT; i++)
    {
        pthread_mutex_lock(&lock_var);
        fs_ts_struct.shared_var *= 1;
        pthread_mutex_unlock(&lock_var);
        fs_ts_struct.shared_arr[currID] *= 1;
    }
    return nullptr;
}
/**
void sequential_version()
{
for (int i = 0; i < NUM_THREADS; i++)
{
for (uint16_t j = 0; j < LOOP_COUNT; j++)
{
arr_seq_acc[i] += 1;//sequential_array[i];
}
}
}
**/
/**
 * Wall-clock time in seconds, with microsecond resolution.
 *
 * Fix: pass nullptr as the timezone argument — POSIX marks the tzp
 * parameter of gettimeofday() as obsolete and its behaviour when non-null
 * is unspecified.  On failure the error status is reported and whatever is
 * in Tp is converted anyway (matches the original behaviour).
 *
 * @return seconds since the Epoch as a double.
 */
double rtclock()
{
    struct timeval Tp;
    const int stat = gettimeofday(&Tp, nullptr);
    if (stat != 0)
    {
        std::cout << "Error return from gettimeofday: " << stat << std::endl;
    }
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
/**
void check_result(uint16_t *shared_arr, uint16_t *seq_arr)
{
double maxdiff, this_diff;
int numdiffs;
int i, j;
numdiffs = 0;
maxdiff = 0;
for (i = 0; i < NUM_THREADS; i++)
{
this_diff = shared_arr[i] - seq_arr[i];
if (this_diff < 0)
this_diff = -1.0 * this_diff;
if (this_diff > threshold)
{
numdiffs++;
if (this_diff > maxdiff)
maxdiff = this_diff;
}
}
if (numdiffs > 0)
{
cout << numdiffs << " Diffs found over threshold " << threshold
<< "; Max Diff = " << maxdiff << endl;
}
else
{
cout << "No differences found between base and test versions\n";
}
}
**/
_______________________________________________
gem5-users mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s