I hate to beat on a tired horse (threads and db locking issues), but I am running into what I believe is a bug as we scale up the number of threads in our application. A little background: there is a main scheduler thread that does most of the processing, and client threads to handle remote connections. These clients don't touch the database very often, but on occasion (in particular when they exit) they request an exclusive lock to remove themselves from the database.

The problem that I see is that with multiple threads all attempting to "BEGIN EXCLUSIVE", they will occasionally *all* fail, calling the busy handler repeatedly until it finally returns SQL_BUSY. Let me re-state for clarity's sake: 10 threads all try "BEGIN EXCLUSIVE" at the same time. One succeeds, processes, and COMMITs. The other 9 will sometimes repeatedly call the busy handler over and over until they fail with SQLITE_BUSY, even though the database *should be* available to start a new exclusive transaction.

I have tried to eliminate as many variables as I could in my testing and in the sample program:
* No SQL queries other than BEGIN and COMMIT are being used on an empty database.
* No threads are accessing the database in any place other than the pthread_cleanup handler.
* I have tested on Fedora Core 1,2,3, SUSE and Gentoo, and with 32 and 64 bit systems.


The remaining variables I am working with are system load and CPU count:
* On a hyperthreaded (2 virtual CPUs) computer it fails at least 50% of the time when unloaded and < 5% with the load around 1, and then climbs up again as the load increases.
* On a multi-CPU computer it fails about 10% of the time when unloaded and < 1% when both CPUs are otherwise occupied.
* I have occasional access to a single-CPU Pentium 3 where I have not been able to duplicate the problem. This pc is running FC2 just like our chief offender the hyperthreaded PC who fails 50% of the time when unloaded.


In order to demonstrate this problem I have attached a program called sqlthreads.c. It builds under Linux with:
gcc -Wall -g sqlthreads.c -o sqlthreads -lpthread -lsqlite3.
sqlthreads takes two arguments: the number of threads to spawn, and the type of busy handler to use. 0 is the standard "sqlite3_busy_timeout" handler with a 20 second timeout. 1 is a custom sqlite3_busy_handler with a fixed sleep time. -1 is a similar sqlite3_busy_handler with a random sleep time. When run, the test launches N threads, waits a second, then cancels all of them. At this point, each thread will sqlite3_exec a BEGIN EXCLUSIVE. The one that succeeds will sleep for 1/2 second and then call COMMIT, at which time another thread *should* be free to begin. If at the end you see "It appears as though at least x threads failed to obtain an exclusive lock on the database!" then you have experienced what I consider to be the problem.


If I could get any independant corroboration of this problem, that would be fantastic. If you skimmed through my long email, it appears most likely to occur on a hyperthreaded PC, aka Pentium 4 2.8 ghz or higher.

-Eli

/*
 * sqlthreads.c -- sqlite3 locking / thread concurrency tester
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <sqlite3.h>

// global variables
int                             g_failed = 0;
int                             g_threadCnt = 0;

pthread_t               g_pool[256] = {0};
pthread_key_t   g_pkeyDb = 0;
pthread_mutex_t g_logMutex = PTHREAD_MUTEX_INITIALIZER;

//
// log output wrapper, adds timestamp and thread id
//
void sqlog(char * pattern, ...)
{
        va_list varArgs;
        char timeBuf[16];
        time_t t = time(0);
        struct tm tmStruct;

        // determine the current time
        localtime_r(&t, &tmStruct);
        strftime(timeBuf, sizeof(timeBuf), "%H:%M:%S", &tmStruct);

        // init the variable argument processor
        va_start(varArgs, pattern);

        // obtain a mutex lock and write out the log entry
        pthread_mutex_lock(&g_logMutex);
        fprintf(stderr, "%s [%u] ", timeBuf, (unsigned int)pthread_self());
        vfprintf(stderr, pattern, varArgs);
        pthread_mutex_unlock(&g_logMutex);

        // clean up and exit
        va_end(varArgs);
}


//
// sqlite3 alternative busy handler
//
int busyHandler(void *pArg1, int iPriorCalls)
{
        sqlog("!! dbBusyHandler %d\n", iPriorCalls);

        // sleep if handler has been called less than threshold value
        if (iPriorCalls < 20)
        {
                // adding a random value here greatly reduces locking
                if (pArg1 < 0)
                        usleep((rand() % 500000) + 400000);
                else usleep(500000);
                return 1;
        }

        // have sqlite3_exec immediately return SQLITE_BUSY
        return 0;
}


//
// each thread calls this when it is canceled
//
void destroySingleThread(void *ptr)
{
        int rc;
    sqlite3 * pDb;

    // get our db handle from thread local storage
        pDb = (sqlite3 *) pthread_getspecific(g_pkeyDb);

        // we want to get EXCLUSIVE access to the DB
    rc = sqlite3_exec(pDb, "BEGIN EXCLUSIVE", NULL, NULL, NULL);
        if (rc != SQLITE_OK)
        {
                sqlog("destroySingleThread: Exclusive Transaction failed! %s\n",
                        sqlite3_errmsg(pDb));
                g_failed++;
        }
        else
        {
                // sleep for a bit to simulate some SQL queries
                usleep(500000);
        sqlog("destroySingleThread: thread %u terminated!!\n", pthread_self());

                // commit the transaction, which should free it up for other 
threads
                rc = sqlite3_exec(pDb, "COMMIT", NULL, NULL, NULL);
                if (rc != SQLITE_OK) 
                {
                        sqlog("destroySingleThread: Commit failed! %s\n",
                                sqlite3_errmsg(pDb));
                }
        }

        sqlite3_close(pDb);

    return;
}


//
// each thread calls this to open the database and then spin
//
void * startSingleThread(void * sleepMode) 
{          
        sqlite3 * pDb = NULL;
        char * szDb = "./sqlthreadsDb";
        
        sqlog("startSingleThread: new thread %u launched\n", pthread_self());
        
        // open handle to the database and save it in thread local storage
        if (SQLITE_OK != sqlite3_open(szDb, &pDb))
        {
                sqlog("startSingleThread: Failed to open database %s: %s\n",
                        szDb, sqlite3_errmsg(pDb));
                return NULL;
        }

        // install a timeout or busy handler to be called if database is locked
        if (* (int *)sleepMode == 0)
                sqlite3_busy_timeout(pDb, 20000);
        else sqlite3_busy_handler(pDb, busyHandler, sleepMode);

        // put db handle in thread local storage so thread destructor can close 
it
        if (0 != pthread_setspecific(g_pkeyDb, pDb))
        {
                sqlog("startSingleThread: pthread_setspecific failed to save 
pDb\n");
                return NULL;
        }

        // push the node destructor onto the thread's cleanup stack
        pthread_cleanup_push(destroySingleThread, 0);

        // sit in a loop, doing nothing much
        while (1)
        {
                pthread_testcancel();
                usleep(100000);
        }

        // (never gets here, but...) remove the cleanup handler from the stack
        pthread_cleanup_pop(1);

        return NULL;
}


//
// launch all threads
//
int startThreads(int threadCnt, void * sleepMode)
{
        int rc, idx;
        pthread_attr_t attr;

        // create a pthread_attr object for our pthreads
        pthread_attr_init(&attr);
        pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);
        pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

        // spawn the threads
        for (idx = 0; idx < threadCnt; idx++)
        {
                rc = pthread_create(&g_pool[idx], &attr, startSingleThread, 
sleepMode);
                if (rc)
                {
                        sqlog("startThreads: Failed to start thread %d: %d\n", 
idx, rc);
                        break;
                }
                g_threadCnt = idx+1;
        }

        // destroy the pthread_attr object we used to create our pthreads
        pthread_attr_destroy(&attr);

        return 0;
}


//
// start the main server process
//
int main(int argc, char **argv) 
{
        int idx, threadCnt, sleepMode;

        // parse cmd line args
        if (argc != 3) 
        {
                printf("\nUsage: %s [thread count] [sleep mode]\n", argv[0]);
                printf( "\tSleep Mode is one of:\n"
                                "\t\t 0 - Standard sqlite3_busy_timeout 
handler\n"
                                "\t\t-1 - sqlite3_busy_handler with random 
usleep time\n"
                                "\t\t 1 - sqlite3_busy_handler with fixed 
usleep time\n");
                return 0;
        }

        // get the thread count and sleep mode
        threadCnt = atoi(argv[1]);
        sleepMode = atoi(argv[2]);
        if (threadCnt < 1 || threadCnt > 50)
        {
                printf("\nBe reasonable! Thread count should be between 1 and 
50\n\n");
                return 0;
        }

        // print out a banner
        sqlog("sqlite3 thread test starting...\n");
        if (sleepMode == 0)
                sqlog("Using Sqlite3 standard sqlite3_busy_timeout handler\n");
        else if (sleepMode == 1)
                sqlog("Using custom sqlite3_busy_handler /w fixed usleep\n");
        else if (sleepMode == -1)
                sqlog("Using custom sqlite3_busy_handler /w random usleep\n");
        else 
        {
                sqlog("\nInvalid sleep mode '%s' specified!\n\n", argv[2]);
                exit(0);
        }

        // create a thread local storage key and start up our threads
        pthread_key_create(&g_pkeyDb, NULL);
        startThreads(threadCnt, &sleepMode);

        // loop through the thread array and wait on each
        sqlog("main: now sleep for 1 sec and then cancel all threads\n");
        sleep(1);
        sqlog("\nNote that if you are testing sqlite3_timeout_handler (0)"
                  "\nit may not produce output for up to 20 seconds\n");

        // loop and cancel threads
        for (idx = 0; idx < g_threadCnt; idx++)
        {
                sqlog("main: Sending cancel to thread %u\n", g_pool[idx]);
                pthread_cancel(g_pool[idx]);
        }
        sqlog("main: pthread_cancel sent to all threads!\n");

        // loop and wait for all threads to exit
        for (idx = 0; idx < g_threadCnt; idx++)
        {
                if (g_pool[idx])
                        pthread_join(g_pool[idx], NULL);
        }

        sqlog("Concluding the sqlite3 thread test...\n");
        if (g_failed)
                sqlog("It appears as though at least %d threads failed to 
obtain an "
                        "exclusive lock on the database!\n", g_failed);
        else sqlog("Success! Now try again! :-)\n");

        // delete thread local storage and exit
        pthread_key_delete(g_pkeyDb);
        pthread_exit(0);
        return 0;
}


Reply via email to