The attached patch causes a transient failure to create a volume transaction to be retried, brutally three times in 1 sec intervals.

The problem usually only affects servers with ten-thousands of volumes, where a simple "vos listvol" could easily disturb a simultaneous "vos backupsys", or one out of two simultaneous "vos listvols" could print thousands of error messages depending on how they race.

Note: The patch "undoes" another patch (and its correction) in that area - that's not elegant but ok as it predates that patch and fixes both problems in one go, for the first one in a slightly different manner.

bcc'ed to openafs-bugs

--
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Rainer Toebbicke
European Laboratory for Particle Physics(CERN) - Geneva, Switzerland
Phone: +41 22 767 8985       Fax: +41 22 767 7155
Index: openafs/src/volser/voltrans.c
===================================================================
RCS file: /cvs/openafs/src/volser/voltrans.c,v
retrieving revision 1.10.2.5
diff -u -r1.10.2.5 voltrans.c
--- openafs/src/volser/voltrans.c       18 Oct 2008 19:26:17 -0000      1.10.2.5
+++ openafs/src/volser/voltrans.c       16 Apr 2009 14:21:37 -0000
@@ -75,21 +75,31 @@
 NewTrans(afs_int32 avol, afs_int32 apart)
 {
     /* set volid, next, partition */
-    struct volser_trans *tt, *newtt;
+    struct volser_trans *tt;
     struct timeval tp;
     struct timezone tzp;
+    int retries = 3;
 
-    newtt = (struct volser_trans *)malloc(sizeof(struct volser_trans));
     VTRANS_LOCK;
     /* don't allow the same volume to be attached twice */
-    for (tt = allTrans; tt; tt = tt->next) {
-       if ((tt->volid == avol) && (tt->partition == apart)) {
-           VTRANS_UNLOCK;
-           free(newtt);
-           return (struct volser_trans *)0;    /* volume busy */
+    while (1) {
+       for (tt = allTrans; tt; tt = tt->next) {
+           if ((tt->volid == avol) && (tt->partition == apart)) break;
        }
+       if (tt == NULL) break;
+
+       VTRANS_UNLOCK;
+       if (retries-- > 0)
+#ifdef AFS_PTHREAD_ENV
+           sleep(1);           /* Allow for short lock-ups */
+#else
+           IOMGR_Sleep(1);     /* Allow for short lock-ups */
+#endif /*AFS_PTHREAD_ENV*/
+       else
+           return (struct volser_trans *)0;    /* volume busy */
+       VTRANS_LOCK;
     }
-    tt = newtt;
+    tt = (struct volser_trans *)malloc(sizeof(struct volser_trans));
     memset(tt, 0, sizeof(struct volser_trans));
     tt->volid = avol;
     tt->partition = apart;

Reply via email to