Tran Dinh Hung, In your test case you create a checkpoint and then pass that checkpoint handle to a created thread.
This is done in a loop. What is happening is that the thread references the checkpoint handle by address, so it is possible for a thread to start a checkpoint write operation on a checkpoint handle while the checkpoint create operation for that handle is still in progress, i.e. before the created checkpoint handle is fully initialized. Let me give an example: 1. create checkpoint returns handle 0 2. pthread create &handle (=0) 3. create checkpoint returns handle 1 4. the pthread created in step 2 reads the value stored at &handle, which is now 1, and starts its checkpoint write operations on handle #1. Step #4 happens while the checkpoint create in step #3 is still setting up. Now essentially what I believe should happen in this case is that the checkpoint write should return an invalid handle error until the checkpoint create has completed its full initialization. This is something that should be addressed so that the proper error is returned to the user instead of an error being raised within the executive. But for now, I'd suggest not passing the same checkpoint address to every pthread creation operation, but instead using an array and passing the address of a separate array position to each thread. I tried this with your test program (using an array for the checkpoint handles) and it generates another fault which I have been working to resolve. Regards -steve On Tue, 2007-10-16 at 14:50 +0700, Tran Dinh Hung wrote: > Hi all, > > To: Steven > > The test with 2 processes is within my program so I can not send it to > you now. But I think it has the same problem with the simpler test of > mine. The simple test used only one thread to create checkpoints and > create a writing thread on each of that. I re-attach the simple test again. > > The detail of that test I described in the previous mail as follow: > " > Each WHILE loop, I create a check point and start a writing thread > on it (only write to that checkpoint in about 60s). > Max check point = 500. > When runing test, the system sometimes crashes. (I use Redhat on > VMWare server, openais-0.80.3). 
> Sometimes, when the program is hangup, I only use "service network > restart" and then system is OK again. > " > > To Russell Bryant: I dont have much experience with Linux so I do not > know how to get the backtrace. > > Please help me check it, if the checkpoint performance is error, I can > not built my program!!! > > Tran Dinh Hung. > > > > Steven Dake wrote: <cid:[email protected]> > > If you could attach the reproducer I'll take a look. Also which version > > are you using? > > > > Regards > > -steve > > On Fri, 2007-10-12 at 17:29 +0700, Tran Dinh Hung wrote: > > > >> Hi all, > >> > >> I did a perfomance test with checkpoint service. > >> In that test, I created two processes. > >> Each process did the following works: > >> > >> - open a new checkpoint in each second, the number of sections is random > >> (max = 16 sections), > >> - re-write data for each checkpoint section 4 times/1 second (maxlength > >> of section data is 256 bytes). > >> > >> When running test, OpenAis showed the following error (when the number > >> of checkpoints was about 40): > >> > >> ------------------------------------------------------------- > >> ... > >> Oct 12 6:35:58.240797 [IPC ] Invalid header id is 1598832896 min 0 max 17 > >> Oct 12 6:35:58.241080 [IPC ] Invalid header id is 1598832896 min 0 max 17 > >> Oct 12 6:35:58.294680 [IPC ] Invalid header id is 1598832896 min 0 max 17 > >> Oct 12 6:35:58.294681 [IPC ] Invalid header id is 1598832896 min 0 max 17 > >> ... > >> ------------------------------------------------------------- > >> > >> Could you help me solve this problem? > >> > >> HungTD. 
> >> > >> > >> _______________________________________________ > >> Openais mailing list > >> [email protected] > >> https://lists.linux-foundation.org/mailman/listinfo/openais > >> > > > > > > > > plain text document attachment (ckptTest.c) > #include <stdio.h> > #include <stdlib.h> > #include <string.h> > #include <errno.h> > #include <unistd.h> > #include <sys/types.h> > #include <sys/socket.h> > #include <sys/select.h> > #include <sys/un.h> > #include <sys/time.h> > #include <pthread.h> > > #include <openais/saAis.h> > #include <openais/saCkpt.h> > > #define SM_MAX_ATTRIBUTE 128 > #define SM_MAX_ATTRIBUTE_SIZE 1024 > #define SM_RETENTION_POLICY 0 > > #define DEFAULT_RELAX_TIME 1000 > #define MAX_CHECKPOINT 500 > #define BUFFER_SIZE 32 > > #define LIVE_DURATION 60 > > const char *sa_error_list[] = { > "OUT_OF_RANGE", > "SA_AIS_OK", > "SA_AIS_ERR_LIBRARY", > "SA_AIS_ERR_VERSION", > "SA_AIS_ERR_INIT", > "SA_AIS_ERR_TIMEOUT", > "SA_AIS_ERR_TRY_AGAIN", > "SA_AIS_ERR_INVALID_PARAM", > "SA_AIS_ERR_NO_MEMORY", > "SA_AIS_ERR_BAD_HANDLE", > "SA_AIS_ERR_BUSY", > "SA_AIS_ERR_ACCESS", > "SA_AIS_ERR_NOT_EXIST", > "SA_AIS_ERR_NAME_TOO_LONG", > "SA_AIS_ERR_EXIST", > "SA_AIS_ERR_NO_SPACE", > "SA_AIS_ERR_INTERRUPT", > "SA_AIS_ERR_NAME_NOT_FOUND", > "SA_AIS_ERR_NO_RESOURCES", > "SA_AIS_ERR_NOT_SUPPORTED", > "SA_AIS_ERR_BAD_OPERATION", > "SA_AIS_ERR_FAILED_OPERATION", > "SA_AIS_ERR_MESSAGE_ERROR", > "SA_AIS_ERR_QUEUE_FULL", > "SA_AIS_ERR_QUEUE_NOT_AVAILABLE", > "SA_AIS_ERR_BAD_FLAGS", > "SA_AIS_ERR_TOO_BIG", > "SA_AIS_ERR_NO_SECTIONS", > }; > > SaCkptCheckpointCreationAttributesT checkpointCreationAttributes = { > .creationFlags = SA_CKPT_WR_ALL_REPLICAS, > .checkpointSize = 16*1024, > .retentionDuration = 5000000000LL, > .maxSections = 16, > .maxSectionSize = 1024, > .maxSectionIdSize = 64 > }; > > SaCkptHandleT ckptHandle; > > int CHECKPOINT_ID = 0; > > int checkpointCount = 0; > > pthread_mutex_t checkpointLock = PTHREAD_MUTEX_INITIALIZER; > > int get_sa_error(SaAisErrorT 
error, char *str, int len) > { > if (error < SA_AIS_OK || > error > SA_AIS_ERR_NO_SECTIONS || > len < strlen(sa_error_list[error])) { > errno = EINVAL; > return -1; > } > strncpy(str, sa_error_list[error], len); > return 0; > } > > char *get_sa_error_b (SaAisErrorT error) { > return ((char *)sa_error_list[error]); > } > > char *get_test_output (SaAisErrorT result, SaAisErrorT expected) { > static char test_result[256]; > > if ((result > (sizeof(sa_error_list)/4))||(expected > > (sizeof(sa_error_list)/4))) > { > return ("LIB ERROR"); > } > else if (result == expected) { > return ("PASSED"); > } else { > sprintf (test_result, "FAILED expected %s got %s", > get_sa_error_b(expected), > get_sa_error_b(result)); > return (test_result); > } > } > > SaAisErrorT createCheckpoint(char *ckptCheckpointName, > unsigned long > maxAttribute, > unsigned long > maxAttributeSize, > int rPolicy, > > SaCkptCheckpointHandleT* ckptCheckpointHandle) > { > SaAisErrorT error; > char strCheckpointName[SA_MAX_NAME_LENGTH]; > memset(strCheckpointName, 0, SA_MAX_NAME_LENGTH); > strcpy(strCheckpointName, ckptCheckpointName); > SaNameT checkpointName; > memset(&checkpointName, 0, sizeof(SaNameT)); > checkpointName.length = strlen(strCheckpointName); > memcpy(checkpointName.value, strCheckpointName, SA_MAX_NAME_LENGTH); > > SaCkptCheckpointCreationAttributesT sessionCreationAttributes; > memcpy(&sessionCreationAttributes, &checkpointCreationAttributes, > sizeof(SaCkptCheckpointCreationAttributesT)); > > error = saCkptCheckpointOpen( > ckptHandle, > &checkpointName, > &sessionCreationAttributes, > > SA_CKPT_CHECKPOINT_CREATE|SA_CKPT_CHECKPOINT_READ|SA_CKPT_CHECKPOINT_WRITE, > SA_TIME_END, > ckptCheckpointHandle); > > printf("[createCheckpoint] %s\n", get_test_output(error, SA_AIS_OK)); > > return error; > > } > > SaAisErrorT writeToCheckpoint(SaCkptCheckpointHandleT ckptCheckpointHandle, > char* > sectionName, > void* data, > int dataSize) > { > SaAisErrorT error; > SaCkptIOVectorElementT 
writeElement; > SaUint32T erroroneousVectorIndex = 0; > char initData[] = ""; > > SaCkptSectionIdT sectionId; > sectionId.id = (SaUint8T*)sectionName; > sectionId.idLen = strlen(sectionName); > > > SaCkptSectionCreationAttributesT sectionCreationAttributes; > sectionCreationAttributes.sectionId = &sectionId; > sectionCreationAttributes.expirationTime = SA_TIME_END; > > > SaCkptIOVectorElementT writeVectorElements[] = { > { > {strlen(sectionName), sectionName}, > data, > dataSize, > 0, > 0 > } > }; > > error = saCkptSectionCreate(ckptCheckpointHandle, > > &sectionCreationAttributes, > initData, > > strlen(initData)); > > if ((error != SA_AIS_OK)&&(error != SA_AIS_ERR_EXIST)) > return error; > > error = saCkptCheckpointWrite (ckptCheckpointHandle, > > writeVectorElements, > 1, > > &erroroneousVectorIndex); > printf("[writeToCheckpoint] %s\n", get_test_output(error, SA_AIS_OK)); > return error; > > } > > void *doTest(void* param) > { > SaCkptCheckpointHandleT ckptCheckpointHandle; > memcpy(&ckptCheckpointHandle, param, sizeof(SaCkptCheckpointHandleT)); > time_t start, now; > int relax = 0; > char* writeBuffer = (char*)malloc(BUFFER_SIZE); > memset(writeBuffer, 0, BUFFER_SIZE); > > start=time(NULL); > strcpy(writeBuffer,(char*)asctime(localtime(&start))); > > while (1) > { > writeToCheckpoint(ckptCheckpointHandle, "pass", writeBuffer, > strlen(writeBuffer)); > now=time(NULL); > strcpy(writeBuffer,(char*)asctime(localtime(&now))); > writeToCheckpoint(ckptCheckpointHandle, "now", writeBuffer, > strlen(writeBuffer)); > if (difftime(now, start) >= LIVE_DURATION) > { > break; > } > else { > srand(rand()); > relax = rand()%DEFAULT_RELAX_TIME; > usleep(relax*1000); > } > } > > saCkptCheckpointClose(ckptCheckpointHandle); > pthread_mutex_lock(&checkpointLock); > checkpointCount--; > pthread_mutex_unlock(&checkpointLock); > } > > char* nameCheckpoint() > { > char* name = malloc(strlen("checkpoint")+6); > sprintf(name, "checkpoint_%4d", CHECKPOINT_ID++); > return name; > } > > int 
main() > { > pthread_t thread; > SaAisErrorT error; > SaVersionT ckptVersion = {'B', 1, 0}; > SaCkptCheckpointHandleT ckptCheckpointHandle; > char *checkpointName; > > error = saCkptInitialize(&ckptHandle, NULL, &ckptVersion); > printf("[main] %s\n", get_test_output(error, SA_AIS_OK)); > > while (1) > { > pthread_mutex_lock(&checkpointLock); > checkpointName = nameCheckpoint(); > createCheckpoint(checkpointName, > SM_MAX_ATTRIBUTE, > SM_MAX_ATTRIBUTE_SIZE, > SM_RETENTION_POLICY, > &ckptCheckpointHandle); > pthread_create(&thread, NULL, doTest, &ckptCheckpointHandle); > checkpointCount++; > pthread_mutex_unlock(&checkpointLock); > if (checkpointCount == MAX_CHECKPOINT) > break; > else > sleep(1); > } > > pthread_join(thread, NULL); > > error = saCkptFinalize(ckptHandle); > printf("[main] %s\n", get_test_output(error, SA_AIS_OK)); > > return 0; > } > _______________________________________________ > Openais mailing list > [email protected] > https://lists.linux-foundation.org/mailman/listinfo/openais _______________________________________________ Openais mailing list [email protected] https://lists.linux-foundation.org/mailman/listinfo/openais
