The attached patch makes O_DIRECT work on Linux in BerkeleyDB 4.5.20. (You will need to manually define LINUX_NEEDS_PAGE_ALIGNMENT if you're using a kernel older than 2.6.)

The main reason to use this patch is to conserve memory. Ordinarily, all the I/O that BDB does to its files gets cached in the Linux filesystem buffer cache. This caching is redundant since BDB always does its own caching, and it effectively makes the BDB environment consume twice as much memory as it needs. Using O_DIRECT bypasses the filesystem buffer cache for those I/Os, thus freeing up a sizable chunk of memory.

The caching problem is particularly aggravated on Linux because the memory manager doesn't give program pages higher priority than cache pages. So when your system is tight on memory, the kernel will start swapping program data pages before it starts reclaiming buffer cache pages, and application performance plummets. (Possibly that indicates a kernel bug, or at least a misfeature.)

Note that you must configure BerkeleyDB with --enable-o_direct to enable the support, and you must add "set_flags DB_DIRECT_DB" to your DB_CONFIG to enable it in a particular environment.

With this patch, a slapd that occupies 6.8GB on a system with 8GB of RAM can run continuously without swapping, delivering a sustained 11,500 authentications per second. Without the patch, swapping starts when the process hits the 4.5GB mark (because over 3GB of buffer cache is in use), and performance drops to only *hundreds* of authentications per second.

--
  -- Howard Chu
  Chief Architect, Symas Corp.  http://www.symas.com
  Director, Highland Sun        http://highlandsun.com/hyc
  OpenLDAP Core Team            http://www.openldap.org/project/
--- dbinc/mp.h.orig     2006-09-07 14:31:58.000000000 -0700
+++ dbinc/mp.h  2007-01-06 19:14:56.000000000 -0800
@@ -378,6 +378,23 @@
 #define        BH_FREE_REUSE           0x02
 #define        BH_FREE_UNLOCKED        0x04
 
+#ifdef DIAG_MVCC
+#define        BH_ALIGNED
+#define VM_PAGESIZE 4096
+#endif
+
+/* Linux O_DIRECT needs aligned buffers. 2.6 kernel allows 512 byte
+ * alignment, otherwise need page sized (4096).
+ */
+#if defined(linux) && !defined(BH_ALIGNED)
+#define        BH_ALIGNED
+#ifdef LINUX_NEEDS_PAGE_ALIGNMENT
+#define VM_PAGESIZE 4096
+#else  /* Linux 2.6+ */
+#define VM_PAGESIZE 512
+#endif
+#endif
+
 /*
  * BH --
  *     Buffer header.
@@ -404,7 +421,7 @@
 
        roff_t          td_off;         /* MVCC: creating TXN_DETAIL offset. */
        SH_CHAIN_ENTRY  vc;             /* MVCC: version chain. */
-#ifdef DIAG_MVCC
+#ifdef BH_ALIGNED
        u_int16_t       align_off;      /* Alignment offset for diagnostics.*/
 #endif
 
@@ -465,15 +482,14 @@
     (dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT) &&            \
     dbc->txn->td != NULL && __memp_skip_curadj(dbc, pgno))
 
-#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
-#define        VM_PAGESIZE 4096
-#define        MVCC_BHSIZE(mfp, sz) do {                                       \
+#ifdef BH_ALIGNED
+#define        BHSIZE(mfp, sz) do {                                    \
        sz += VM_PAGESIZE + sizeof(BH);                                 \
        if (mfp->stat.st_pagesize < VM_PAGESIZE)                        \
                sz += VM_PAGESIZE - mfp->stat.st_pagesize;              \
 } while (0)
 
-#define        MVCC_BHALIGN(mfp, p) do {                                       \
+#define        BHALIGN(mfp, p) do {                                    \
        if (mfp != NULL) {                                              \
                BH *__bhp;                                              \
                void *__orig = (p);                                     \
@@ -493,13 +509,19 @@
        }                                                               \
 } while (0)
 
-#define        MVCC_BHUNALIGN(mfp, p) do {                                     \
+#define        BHUNALIGN(mfp, p) do {                                  \
        if ((mfp) != NULL) {                                            \
                BH *bhp = (BH *)(p);                                    \
                (p) = ((u_int8_t *)bhp - bhp->align_off);               \
        }                                                               \
 } while (0)
+#else
+#define        BHSIZE(mfp, sz) do {} while (0)
+#define        BHALIGN(mfp, p) do {} while (0)
+#define        BHUNALIGN(mfp, p) do {} while (0)
+#endif
 
+#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
 #ifdef linux
 #define        MVCC_MPROTECT(buf, sz, mode) do {                               \
        int __ret = mprotect((buf), (sz), (mode));                      \
@@ -513,11 +535,7 @@
        }                                                               \
 } while (0)
 #endif /* linux */
-
-#else /* defined(DIAG_MVCC) && defined(HAVE_MPROTECT) */
-#define        MVCC_BHSIZE(mfp, sz) do {} while (0)
-#define        MVCC_BHALIGN(mfp, p) do {} while (0)
-#define        MVCC_BHUNALIGN(mfp, p) do {} while (0)
+#else
 #define        MVCC_MPROTECT(buf, size, mode) do {} while (0)
 #endif
 
--- mp/mp_alloc.c.orig  2006-09-07 14:32:03.000000000 -0700
+++ mp/mp_alloc.c       2007-01-06 19:14:56.000000000 -0800
@@ -66,7 +66,7 @@
        if (mfp != NULL) {
                len = SSZA(BH, buf) + mfp->stat.st_pagesize;
                /* Add space for alignment padding for MVCC diagnostics. */
-               MVCC_BHSIZE(mfp, len);
+               BHSIZE(mfp, len);
        }
 
        MPOOL_REGION_LOCK(dbenv, infop);
@@ -91,10 +91,10 @@
                        c_mp->stat.st_pages++;
                MPOOL_REGION_UNLOCK(dbenv, infop);
                /*
-                * For MVCC diagnostics, align the pointer so that the buffer
+                * If necessary, align the pointer so that the buffer
                 * starts on a page boundary.
                 */
-               MVCC_BHALIGN(mfp, p);
+               BHALIGN(mfp, p);
 
 found:         if (offsetp != NULL)
                        *offsetp = R_OFFSET(infop, p);
@@ -447,7 +447,7 @@
        MPOOLFILE *mfp;
        void *buf;
 {
-       MVCC_BHUNALIGN(mfp, buf);
+       BHUNALIGN(mfp, buf);
        COMPQUIET(mfp, NULL);
        __db_shalloc_free(infop, buf);
 }
--- mp/mp_fget.c.orig   2006-09-13 09:22:42.000000000 -0700
+++ mp/mp_fget.c        2007-01-06 19:14:56.000000000 -0800
@@ -708,7 +708,7 @@
                 * the hash bucket's priority.
                 */
                /*lint --e{668} (flexelint: bhp cannot be NULL). */
-#ifdef DIAG_MVCC
+#ifdef BH_ALIGNED
                memset(bhp, 0, SSZ(BH, align_off));
 #else
                memset(bhp, 0, sizeof(BH));

Reply via email to