The branch, master has been updated
       via  4f42d17b74ce891691eee1cead498959cc8e4837 (commit)
       via  6860c79aea416f56cfd7a6af790bbdf495dbc54e (commit)
       via  909269a4a3690e1245117ca1af935401455785e6 (commit)
       via  bab744e3c49efef2e05dc09e8ea9bd3e3fa58716 (commit)
      from  d8f010355b715e49709836e057a5d0f110919897 (commit)

http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 4f42d17b74ce891691eee1cead498959cc8e4837
Author: Michael Adam <[email protected]>
Date:   Tue Nov 6 01:26:05 2012 +0100

    utils:ping_pong: add a -c switch to check the lock before reading/writing
    
    This is to verify that the fcntl F_GETLK call reports F_UNLCK if called
    from a process already holding a lock. This is for example used by samba's
    strict locking code in combination with "posix locking = true".
    
    Signed-off-by: Michael Adam <[email protected]>

commit 6860c79aea416f56cfd7a6af790bbdf495dbc54e
Author: Michael Adam <[email protected]>
Date:   Mon Nov 19 17:28:03 2012 +0100

    recovery: data corruption of persistent DBs after recoveries: don't delete empty records
    
    The record-by-record mode of recovery deletes empty records.
    For persistent databases, this can lead to data corruption
    by deleting records that should be there:
    
    - Assume the cluster has been running for a while.
    
    - A record R in a persistent database D has been created and
      deleted a couple of times, the last operation being deletion,
      leaving an empty record with a high RSN, say 10.
    
    - Now a node N is turned off.
    
    - This leaves the local database copy of D on N with the empty
      copy of R and RSN 10. On all other nodes, the recovery has deleted
      the copy of record R.
    
    - Now the record is created again while node N is turned off.
      This creates R with RSN = 1 on all nodes except for N.
    
    - Now node N is turned on again. The following recovery will choose
      the older empty copy of R due to RSN 10 > RSN 1.
    
    ==> Hence the record is gone after the recovery.
    
    On databases like Samba's registry, this can damage the higher-level
    data structures built from the various tdb-level records.
    
    This patch fixes that problem by not deleting empty records in recoveries
    for persistent databases.
    
    Signed-off-by: Michael Adam <[email protected]>

commit 909269a4a3690e1245117ca1af935401455785e6
Author: Michael Adam <[email protected]>
Date:   Mon Nov 19 17:20:11 2012 +0100

    recoverd: fix a comment typo
    
    Signed-off-by: Michael Adam <[email protected]>

commit bab744e3c49efef2e05dc09e8ea9bd3e3fa58716
Author: Michael Adam <[email protected]>
Date:   Fri Nov 16 14:33:41 2012 +0100

    vacuum: fix a comment typo
    
    Pair-Programmed-With: Volker Lendecke <[email protected]>
    Signed-off-by: Michael Adam <[email protected]>

-----------------------------------------------------------------------

Summary of changes:
 server/ctdb_recoverd.c      |   35 ++++++++++++++++++++++++++++++++---
 server/ctdb_vacuum.c        |    2 +-
 utils/ping_pong/ping_pong.c |   42 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 73 insertions(+), 6 deletions(-)


Changeset truncated at 500 lines:

diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 6d0dbc4..d50e84e 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -1185,7 +1185,7 @@ static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_
 
 
 /* 
-   a traverse function for pulling all relevent records from recdb
+   a traverse function for pulling all relevant records from recdb
  */
 struct recdb_data {
        struct ctdb_context *ctdb;
@@ -1202,8 +1202,37 @@ static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
        struct ctdb_rec_data *rec;
        struct ctdb_ltdb_header *hdr;
 
-       /* skip empty records */
-       if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
+       /*
+        * skip empty records - but NOT for persistent databases:
+        *
+        * The record-by-record mode of recovery deletes empty records.
+        * For persistent databases, this can lead to data corruption
+        * by deleting records that should be there:
+        *
+        * - Assume the cluster has been running for a while.
+        *
+        * - A record R in a persistent database has been created and
+        *   deleted a couple of times, the last operation being deletion,
+        *   leaving an empty record with a high RSN, say 10.
+        *
+        * - Now a node N is turned off.
+        *
+        * - This leaves the local database copy of D on N with the empty
+        *   copy of R and RSN 10. On all other nodes, the recovery has deleted
+        *   the copy of record R.
+        *
+        * - Now the record is created again while node N is turned off.
+        *   This creates R with RSN = 1 on all nodes except for N.
+        *
+        * - Now node N is turned on again. The following recovery will chose
+        *   the older empty copy of R due to RSN 10 > RSN 1.
+        *
+        * ==> Hence the record is gone after the recovery.
+        *
+        * On databases like Samba's registry, this can damage the higher-level
+        * data structures built from the various tdb-level records.
+        */
+       if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
                return 0;
        }
 
diff --git a/server/ctdb_vacuum.c b/server/ctdb_vacuum.c
index 0ca485d..7f6a8f5 100644
--- a/server/ctdb_vacuum.c
+++ b/server/ctdb_vacuum.c
@@ -679,7 +679,7 @@ static int ctdb_process_vacuum_fetch_lists(struct ctdb_db_context *ctdb_db,
 }
 
 /**
- * Proces the delete list:
+ * Process the delete list:
  * Send the records to delete to all other nodes with the
  * try_delete_records control.
  */
diff --git a/utils/ping_pong/ping_pong.c b/utils/ping_pong/ping_pong.c
index 098dacd..0a49d66 100644
--- a/utils/ping_pong/ping_pong.c
+++ b/utils/ping_pong/ping_pong.c
@@ -2,6 +2,7 @@
    A ping-pong fcntl byte range lock test
 
    Copyright (C) Andrew Tridgell 2002
+   Copyright (C) Michael Adam 2012
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -41,7 +42,7 @@
 
 static struct timeval tp1,tp2;
 
-static int do_reads, do_writes, use_mmap;
+static int do_reads, do_writes, use_mmap, do_check;
 
 static void start_timer(void)
 {
@@ -69,6 +70,36 @@ static int lock_range(int fd, int offset, int len)
        return fcntl(fd,F_SETLKW,&lock);
 }
 
+/* check whether we could place a lock */
+int check_lock(int fd, int offset, int len)
+{
+       struct flock lock;
+       int ret;
+
+       lock.l_type = F_WRLCK;
+       lock.l_whence = SEEK_SET;
+       lock.l_start = offset;
+       lock.l_len = len;
+       lock.l_pid = 0;
+
+       ret = fcntl(fd, F_GETLK, &lock);
+       if (ret != 0) {
+               printf("error calling fcntl F_GETLCK: %s\n", strerror(errno));
+               return -1;
+       }
+
+       if (lock.l_type == F_UNLCK) {
+               /* we would be able to place the lock */
+               return 0;
+       }
+
+       /* we would not be able to place lock */
+       printf("check_lock failed: lock held: "
+              "pid='%d', type='%d', start='%d', len='%d'\n",
+              (int)lock.l_pid, (int)lock.l_type, (int)lock.l_start, (int)lock.l_len);
+       return 1;
+}
+
 /* unlock a byte range in a open file */
 static int unlock_range(int fd, int offset, int len)
 {
@@ -123,6 +154,9 @@ static void ping_pong(int fd, int num_locks)
                        printf("lock at %d failed! - %s\n",
                               (i+1) % num_locks, strerror(errno));
                }
+               if (do_check) {
+                       ret = check_lock(fd, i, 1);
+               }
                if (do_reads) {
                        unsigned char c;
                        if (use_mmap) {
@@ -169,7 +203,7 @@ int main(int argc, char *argv[])
        int fd, num_locks;
        int c;
 
-       while ((c = getopt(argc, argv, "rwm")) != -1) {
+       while ((c = getopt(argc, argv, "rwmc")) != -1) {
                switch (c){
                case 'w':
                        do_writes = 1;
@@ -180,6 +214,9 @@ int main(int argc, char *argv[])
                case 'm':
                        use_mmap = 1;
                        break;
+               case 'c':
+                       do_check = 1;
+                       break;
                default:
                        fprintf(stderr, "Unknown option '%c'\n", c);
                        exit(1);
@@ -194,6 +231,7 @@ int main(int argc, char *argv[])
                printf("           -r    do reads\n");
                printf("           -w    do writes\n");
                printf("           -m    use mmap\n");
+               printf("           -c    check locks\n");
                exit(1);
        }
 


-- 
CTDB repository

Reply via email to