Hi All,

I am using the gem5 simulator to collect statistics for a
micro-benchmark program, and I am encountering a "functional read access
failed for address 0xXXXX" error.

I have attached the source file of the micro-benchmark program. The
simulation runs fine for cases "1" and "4" of the switch construct; the
error occurs for cases "2" and "3", which use pthread locks.

I am using the MESI_Two_Level protocol with a 4-core configuration
(private L1I and L1D caches per core and a shared L2 cache) in SE mode.

I have read the discussion on the Google group
https://groups.google.com/u/1/g/gem5-gpu-dev/c/Wt43jSYYXag
and can infer that the issue arises when multiple copies of a block
exist in the system (possibly in a transient state) and a load or store
is attempted on it.
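
To narrow this down, here is a minimal sketch of the failing pattern,
stripped to just the lock/unlock pair around a shared counter (my
assumption being that the mutex accesses are the trigger, since cases
"1" and "4", which take no lock, run fine):

#include <pthread.h>
#include <cstdint>

pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
uint32_t counter = 0;

void *worker(void *)
{
    for (int i = 0; i < (1 << 18); i++)
    {
        pthread_mutex_lock(&m);   // atomic RMW / LL-SC on the mutex word
        counter *= 1;
        pthread_mutex_unlock(&m);
    }
    return NULL;
}

int main()
{
    pthread_t t[4];
    for (int i = 0; i < 4; i++)
        pthread_create(&t[i], NULL, worker, NULL);
    for (int i = 0; i < 4; i++)
        pthread_join(t[i], NULL);
    return 0;
}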

Can you point out a way to fix this? Are pthread locks not supported in
gem5? Does the current release of gem5 provide a workaround for this?

Any help would be highly appreciated. Thanks in advance.

Regards,
Vipin
Research Scholar
IIT Kanpur

--- attached micro-benchmark source ---
/**
 * A micro-benchmark program to trigger different FS (false sharing) and
 * TS (true sharing) behaviors:
 * 1. Only FS
 * 2. FS with TS (FS and TS variables on different cache blocks)
 * 3. FS with TS (FS and TS variables on the same cache line)
 * 4. No FS
 * 5. FS (on multiple cache lines by different thread pairs)
 */
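// NOTE: only modes 1-4 are implemented in the switch in main();
// mode 5 above has no corresponding case.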

#include <iostream>
#include <sys/time.h>
#include <cstdint>
#include <pthread.h>
#include <cstring>
#include <cstdio>   // printf
#include <cstdlib>  // atoi

#define NUM_THREADS 4
#define LOOP_COUNT (2 << 17) // 2^18 iterations; parenthesized so the macro expands safely
#define BLOCK_SIZE 64        // cache-block size in bytes

using namespace std;

void *without_fs(void *);
void *only_fs(void *);
void *ts_fs_naive(void *);
void *ts_fs(void *);
double rtclock();
//void sequential_version();
//void check_result(uint16_t *, uint16_t*);
//uint16_t sequential_array[NUM_THREADS] = { 0 };
//const double threshold = 0.0000001;


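// Global data layout (FS = false sharing, TS = true sharing):
// arr_padding_1/arr_padding_2 (32 bytes each) separate array_with_fs
// from its neighbours; padd_arr (60 bytes) pads the 4-byte ts_var out
// to a full 64-byte block so the TS counter gets a line of its own.
// Note: 32 bytes of padding is half a 64-byte line, so full isolation
// of array_with_fs depends on where the linker places these objects.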
uint32_t arr_seq_acc[NUM_THREADS]  = {0};
uint32_t arr_padding_1[8]={0};
uint32_t array_with_fs[NUM_THREADS] = {0};
uint32_t arr_padding_2[8]= {0};
uint32_t array_without_fs[NUM_THREADS * 16] = {0}; // 16 x 4-byte entries = one 64-byte block per thread
uint32_t ts_var = 0;
uint8_t padd_arr[60]={0};

struct FSTS
{
    uint32_t shared_var = 0;
    uint32_t shared_arr[NUM_THREADS] = {0};
};
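// shared_var (the TS counter) and shared_arr (the FS array) total
// 20 bytes, so they normally fall within the same 64-byte line
// (assuming the instance does not straddle a line boundary).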
FSTS fs_ts_struct;
pthread_mutex_t lock_var = PTHREAD_MUTEX_INITIALIZER;


int main(int argc, char **argv)
{
    pthread_t threads[NUM_THREADS];

    printf("Starting address of seq array : %p \n",(void *)&arr_seq_acc);
    printf("Starting address of array with FS : %p \n",(void *)array_with_fs);
    printf("Starting address of array without FS : %p \n",(void *)&array_without_fs);
    printf("Starting address of FS_TS Struct :%p \n", (void *)&fs_ts_struct);
    printf("Starting address of TS_SharedVar :%p \n", (void *)&ts_var);

    double clkbegin, clkend;
    
    if(argc == 2)
    {
        int option = atoi(argv[1]);
        switch(option)
        {
        // only_fs
        case 1:
        cout<< "Only FS variable"<<"\n";
        clkbegin= rtclock();
        for (int i = 0; i < NUM_THREADS; i++) 
            pthread_create(&threads[i], NULL, &only_fs, (void*)(intptr_t)i);
        break;

        //ts_fs_naive: at different cache line
        case 2:
        cout << "FS TS for var located on diff cache line.\n" ;
        clkbegin= rtclock();
        for (int i = 0; i < NUM_THREADS; i++) 
            pthread_create(&threads[i], NULL, &ts_fs_naive, (void*)(intptr_t)i);
        break;
        
        //ts_fs same line
        case 3:
        cout << "FS TS on same cache line. \n";
        clkbegin= rtclock();
        for (int i = 0; i < NUM_THREADS; i++) 
            pthread_create(&threads[i], NULL, &ts_fs, (void*)(intptr_t)i);
        break;
        
        //no_fs
        case 4:
        cout << "Parallel version of code. \n";
        clkbegin= rtclock();
        for (int i = 0; i < NUM_THREADS; i++) 
            pthread_create(&threads[i], NULL, &without_fs, (void*)(intptr_t)i);
        break;

        // guard against joining uninitialized threads on a bad option
        default:
        cout << "Invalid option; expected 1-4.\n";
        return 1;
        }
        // joining 4 threads i.e. waiting for all 4 threads to complete 
        for (int i = 0; i < NUM_THREADS; i++) 
            pthread_join(threads[i], NULL);
        
        clkend = rtclock();
        std::cout << "# of threads :" << NUM_THREADS << "\t Exec time:" <<
        clkend - clkbegin << " sec" << std::endl;
    }
    else
    {
        cout << "Sequential version";
        clkbegin= rtclock();
        for (int i = 0; i< NUM_THREADS; i++)
        {
            for (uint32_t j = 0; j < LOOP_COUNT; j++)
                arr_seq_acc[i] *= 1;
        }
        clkend = rtclock();
        std::cout << "SEQ VERSION: \t Exec time:" <<
        clkend - clkbegin << " sec" << std::endl;
    }

    /**
    double clkbegin, clkend;
    clkbegin= rtclock();
    clkend = rtclock();
    std::cout << "# of threads :" << NUM_THREADS << "\t Exec time:" <<
    clkend - clkbegin << " sec" << std::endl;
    clkbegin = rtclock();
    sequential_version();
    clkend= rtclock();
    std::cout << "Sequential Version :" << "\t Exec time:" <<
    clkend - clkbegin << " sec" << std::endl;
    check_result(shared_array,sequential_array);
    */
    return 0;

}

/**
 * Program with padded array
 * BEHAVIOR: NO FS NO TS (padded array)
 */ 
void *without_fs(void* threadId)
{
    int currID = (intptr_t) threadId;
    for (uint32_t i = 0; i < LOOP_COUNT; i++)
    {
        array_without_fs[currID * (BLOCK_SIZE / sizeof(uint32_t))] *= 1; // one 64-byte block per thread
    }
    return NULL;
}

/**
 * The BM suffer fs where all thread accesses are non-overlapping access
 * to same cache block
 * FS: array accesses
 * */
void *only_fs(void *threadId)
{
    int currID = (intptr_t) threadId;
    for (uint32_t i = 0; i < LOOP_COUNT; i++)
    {
        array_with_fs[currID] *= 1;//shared_array[currID]; 
    }  
    return NULL;
}

/**
 * program experience fs and ts on different cache line.
 * TS: shared counter update accesses
 * FS: array accesses
 * Similar to locked micro benchmark of HURON 
 * 
 */
void *ts_fs_naive(void * threadId)
{
    int currID = (intptr_t) threadId;
    for (uint32_t i = 0; i < LOOP_COUNT; i++)
    {
        //pthread_mutex_lock(&lock_var);
        ts_var *= 1;
        //pthread_mutex_unlock(&lock_var);
        array_with_fs[currID] *= 1;
    }
    return NULL;
}
/**
 * TS and FS for var belonging to same line 
 * FS: Array accesses
 * TS: 
 */
void *ts_fs(void *threadId)
{
    int currID = (intptr_t) threadId;
    for (uint32_t i = 0; i < LOOP_COUNT; i++)
    {
        pthread_mutex_lock(&lock_var);
        fs_ts_struct.shared_var *= 1;
        pthread_mutex_unlock(&lock_var);
        fs_ts_struct.shared_arr[currID] *= 1;
    }
    return NULL;
}

/**

void sequential_version()
{
    for (int i = 0; i < NUM_THREADS; i++)
    {
        for (uint16_t j = 0; j < LOOP_COUNT; j++)
        {
            arr_seq_acc[i] += 1;//sequential_array[i];
        }
    }
}
**/
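// Wall-clock time in seconds (microsecond resolution via gettimeofday).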
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    int stat;
    stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
    {
        std::cout << "Error return from gettimeofday: " << stat << std::endl;
    }
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
/**
void check_result(uint16_t *shared_arr, uint16_t *seq_arr)
{
  double maxdiff, this_diff;
  int numdiffs;
  int i, j;
  numdiffs = 0;
  maxdiff = 0;

  for (i = 0; i < NUM_THREADS; i++)
  {
    this_diff = shared_arr[i] - seq_arr[i];
    if (this_diff < 0)
    this_diff = -1.0 * this_diff;
    if (this_diff > threshold)
    {
    numdiffs++;
    if (this_diff > maxdiff)
        maxdiff = this_diff;
    }
  }

  if (numdiffs > 0)
  {
    cout << numdiffs << " Diffs found over threshold " << threshold
         << "; Max Diff = " << maxdiff << endl;
  }
  else
  {
    cout << "No differences found between base and test versions\n";
  }
}

**/