Author: mm
Date: Sun Mar 31 19:03:25 2013
New Revision: 248959
URL: http://svnweb.freebsd.org/changeset/base/248959

Log:
  MFC r242845 (delphij):
    Illumos r13840:97fd5cdf328a:
      3145 single-copy arc
      3212 ztest: race condition between vdev_online() and spa_vdev_remove()
  
    Illumos r13849:3468a95b27cd:
      3258 ztest's use of file descriptors is unstable

Modified:
  stable/8/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
Directory Properties:
  stable/8/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/   (props changed)
  stable/8/sys/cddl/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)

Modified: stable/8/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/8/cddl/contrib/opensolaris/cmd/ztest/ztest.c Sun Mar 31 18:56:00 
2013        (r248958)
+++ stable/8/cddl/contrib/opensolaris/cmd/ztest/ztest.c Sun Mar 31 19:03:25 
2013        (r248959)
@@ -121,8 +121,8 @@
 #include <sys/fs/zfs.h>
 #include <libnvpair.h>
 
-#define        ZTEST_FD_DATA 3
-#define        ZTEST_FD_RAND 4
+static int ztest_fd_data = -1;
+static int ztest_fd_rand = -1;
 
 typedef struct ztest_shared_hdr {
        uint64_t        zh_hdr_size;
@@ -713,14 +713,17 @@ process_options(int argc, char **argv)
            UINT64_MAX >> 2);
 
        if (strlen(altdir) > 0) {
-               char cmd[MAXNAMELEN];
-               char realaltdir[MAXNAMELEN];
+               char *cmd;
+               char *realaltdir;
                char *bin;
                char *ztest;
                char *isa;
                int isalen;
 
-               (void) realpath(getexecname(), cmd);
+               cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+               realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+               VERIFY(NULL != realpath(getexecname(), cmd));
                if (0 != access(altdir, F_OK)) {
                        ztest_dump_core = B_FALSE;
                        fatal(B_TRUE, "invalid alternate ztest path: %s",
@@ -751,6 +754,9 @@ process_options(int argc, char **argv)
                        fatal(B_TRUE, "invalid alternate lib directory %s",
                            zo->zo_alt_libpath);
                }
+
+               umem_free(cmd, MAXPATHLEN);
+               umem_free(realaltdir, MAXPATHLEN);
        }
 }
 
@@ -767,10 +773,12 @@ ztest_random(uint64_t range)
 {
        uint64_t r;
 
+       ASSERT3S(ztest_fd_rand, >=, 0);
+
        if (range == 0)
                return (0);
 
-       if (read(ZTEST_FD_RAND, &r, sizeof (r)) != sizeof (r))
+       if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
                fatal(1, "short read from /dev/urandom");
 
        return (r % range);
@@ -4836,7 +4844,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
                        if (islog)
                                (void) rw_unlock(&ztest_name_lock);
                } else {
+                       /*
+                        * Ideally we would like to be able to randomly
+                        * call vdev_[on|off]line without holding locks
+                        * to force unpredictable failures but the side
+                        * effects of vdev_[on|off]line prevent us from
+                        * doing so. We grab the ztest_vdev_lock here to
+                        * prevent a race between injection testing and
+                        * aux_vdev removal.
+                        */
+                       VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
                        (void) vdev_online(spa, guid0, 0, NULL);
+                       VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
                }
        }
 
@@ -5795,29 +5814,16 @@ ztest_init(ztest_shared_t *zs)
 }
 
 static void
-setup_fds(void)
+setup_data_fd(void)
 {
-       int fd;
-#ifdef illumos
-
-       char *tmp = tempnam(NULL, NULL);
-       fd = open(tmp, O_RDWR | O_CREAT, 0700);
-       ASSERT3U(fd, ==, ZTEST_FD_DATA);
-       (void) unlink(tmp);
-       free(tmp);
-#else
-       char tmp[MAXPATHLEN];
-
-       strlcpy(tmp, ztest_opts.zo_dir, MAXPATHLEN);
-       strlcat(tmp, "/ztest.XXXXXX", MAXPATHLEN);
-       fd = mkstemp(tmp);
-       ASSERT3U(fd, ==, ZTEST_FD_DATA);
-#endif
+       static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
 
-       fd = open("/dev/urandom", O_RDONLY);
-       ASSERT3U(fd, ==, ZTEST_FD_RAND);
+       ztest_fd_data = mkstemp(ztest_name_data);
+       ASSERT3S(ztest_fd_data, >=, 0);
+       (void) unlink(ztest_name_data);
 }
 
+
 static int
 shared_data_size(ztest_shared_hdr_t *hdr)
 {
@@ -5838,15 +5844,11 @@ setup_hdr(void)
        int size;
        ztest_shared_hdr_t *hdr;
 
-#ifndef illumos
-       pwrite(ZTEST_FD_DATA, "", 1, 0);
-#endif
-
        hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
-           PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0);
+           PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
        ASSERT(hdr != MAP_FAILED);
 
-       VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, sizeof (ztest_shared_hdr_t)));
+       VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
 
        hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
        hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
@@ -5857,7 +5859,7 @@ setup_hdr(void)
        hdr->zh_ds_count = ztest_opts.zo_datasets;
 
        size = shared_data_size(hdr);
-       VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, size));
+       VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
 
        (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
 }
@@ -5870,14 +5872,14 @@ setup_data(void)
        uint8_t *buf;
 
        hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
-           PROT_READ, MAP_SHARED, ZTEST_FD_DATA, 0);
+           PROT_READ, MAP_SHARED, ztest_fd_data, 0);
        ASSERT(hdr != MAP_FAILED);
 
        size = shared_data_size(hdr);
 
        (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
        hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
-           PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0);
+           PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
        ASSERT(hdr != MAP_FAILED);
        buf = (uint8_t *)hdr;
 
@@ -5896,12 +5898,13 @@ exec_child(char *cmd, char *libpath, boo
 {
        pid_t pid;
        int status;
-       char cmdbuf[MAXPATHLEN];
+       char *cmdbuf = NULL;
 
        pid = fork();
 
        if (cmd == NULL) {
-               (void) strlcpy(cmdbuf, getexecname(), sizeof (cmdbuf));
+               cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+               (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
                cmd = cmdbuf;
        }
 
@@ -5910,9 +5913,16 @@ exec_child(char *cmd, char *libpath, boo
 
        if (pid == 0) { /* child */
                char *emptyargv[2] = { cmd, NULL };
+               char fd_data_str[12];
 
                struct rlimit rl = { 1024, 1024 };
                (void) setrlimit(RLIMIT_NOFILE, &rl);
+
+               (void) close(ztest_fd_rand);
+               VERIFY3U(11, >=,
+                   snprintf(fd_data_str, 12, "%d", ztest_fd_data));
+               VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));
+
                (void) enable_extended_FILE_stdio(-1, -1);
                if (libpath != NULL)
                        VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
@@ -5925,6 +5935,11 @@ exec_child(char *cmd, char *libpath, boo
                fatal(B_TRUE, "exec failed: %s", cmd);
        }
 
+       if (cmdbuf != NULL) {
+               umem_free(cmdbuf, MAXPATHLEN);
+               cmd = NULL;
+       }
+
        while (waitpid(pid, &status, 0) != pid)
                continue;
        if (statusp != NULL)
@@ -5989,39 +6004,41 @@ main(int argc, char **argv)
        char timebuf[100];
        char numbuf[6];
        spa_t *spa;
-       char cmd[MAXNAMELEN];
+       char *cmd;
        boolean_t hasalt;
-
-       boolean_t ischild = (0 == lseek(ZTEST_FD_DATA, 0, SEEK_CUR));
-       ASSERT(ischild || errno == EBADF);
+       char *fd_data_str = getenv("ZTEST_FD_DATA");
 
        (void) setvbuf(stdout, NULL, _IOLBF, 0);
 
        dprintf_setup(&argc, argv);
 
-       if (!ischild) {
+       ztest_fd_rand = open("/dev/urandom", O_RDONLY);
+       ASSERT3S(ztest_fd_rand, >=, 0);
+
+       if (!fd_data_str) {
                process_options(argc, argv);
 
-               setup_fds();
+               setup_data_fd();
                setup_hdr();
                setup_data();
                bcopy(&ztest_opts, ztest_shared_opts,
                    sizeof (*ztest_shared_opts));
        } else {
+               ztest_fd_data = atoi(fd_data_str);
                setup_data();
                bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
        }
        ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
 
        /* Override location of zpool.cache */
-       (void) asprintf((char **)&spa_config_path, "%s/zpool.cache",
-           ztest_opts.zo_dir);
+       VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
+           ztest_opts.zo_dir), !=, -1);
 
        ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
            UMEM_NOFAIL);
        zs = ztest_shared;
 
-       if (ischild) {
+       if (fd_data_str) {
                metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
                metaslab_df_alloc_threshold =
                    zs->zs_metaslab_df_alloc_threshold;
@@ -6044,7 +6061,8 @@ main(int argc, char **argv)
                    (u_longlong_t)ztest_opts.zo_time);
        }
 
-       (void) strlcpy(cmd, getexecname(), sizeof (cmd));
+       cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+       (void) strlcpy(cmd, getexecname(), MAXNAMELEN);
 
        zs->zs_do_init = B_TRUE;
        if (strlen(ztest_opts.zo_alt_ztest) != 0) {
@@ -6185,5 +6203,7 @@ main(int argc, char **argv)
                    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
        }
 
+       umem_free(cmd, MAXNAMELEN);
+
        return (0);
 }

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c       Sun Mar 
31 18:56:00 2013        (r248958)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c       Sun Mar 
31 19:03:25 2013        (r248959)
@@ -191,6 +191,7 @@ uint64_t zfs_arc_meta_limit = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
+int zfs_disable_dup_eviction = 0;
 
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
@@ -321,7 +322,6 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_io_error;
        kstat_named_t arcstat_l2_size;
        kstat_named_t arcstat_l2_hdr_size;
-       kstat_named_t arcstat_memory_throttle_count;
        kstat_named_t arcstat_l2_write_trylock_fail;
        kstat_named_t arcstat_l2_write_passed_headroom;
        kstat_named_t arcstat_l2_write_spa_mismatch;
@@ -334,6 +334,10 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
        kstat_named_t arcstat_l2_write_buffer_list_iter;
        kstat_named_t arcstat_l2_write_buffer_list_null_iter;
+       kstat_named_t arcstat_memory_throttle_count;
+       kstat_named_t arcstat_duplicate_buffers;
+       kstat_named_t arcstat_duplicate_buffers_size;
+       kstat_named_t arcstat_duplicate_reads;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -391,7 +395,6 @@ static arc_stats_t arc_stats = {
        { "l2_io_error",                KSTAT_DATA_UINT64 },
        { "l2_size",                    KSTAT_DATA_UINT64 },
        { "l2_hdr_size",                KSTAT_DATA_UINT64 },
-       { "memory_throttle_count",      KSTAT_DATA_UINT64 },
        { "l2_write_trylock_fail",      KSTAT_DATA_UINT64 },
        { "l2_write_passed_headroom",   KSTAT_DATA_UINT64 },
        { "l2_write_spa_mismatch",      KSTAT_DATA_UINT64 },
@@ -403,7 +406,11 @@ static arc_stats_t arc_stats = {
        { "l2_write_pios",              KSTAT_DATA_UINT64 },
        { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
        { "l2_write_buffer_list_iter",  KSTAT_DATA_UINT64 },
-       { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }
+       { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
+       { "memory_throttle_count",      KSTAT_DATA_UINT64 },
+       { "duplicate_buffers",          KSTAT_DATA_UINT64 },
+       { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
+       { "duplicate_reads",            KSTAT_DATA_UINT64 }
 };
 
 #define        ARCSTAT(stat)   (arc_stats.stat.value.ui64)
@@ -1516,6 +1523,17 @@ arc_buf_clone(arc_buf_t *from)
        hdr->b_buf = buf;
        arc_get_data_buf(buf);
        bcopy(from->b_data, buf->b_data, size);
+
+       /*
+        * This buffer already exists in the arc so create a duplicate
+        * copy for the caller.  If the buffer is associated with user data
+        * then track the size and number of duplicates.  These stats will be
+        * updated as duplicate buffers are created and destroyed.
+        */
+       if (hdr->b_type == ARC_BUFC_DATA) {
+               ARCSTAT_BUMP(arcstat_duplicate_buffers);
+               ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+       }
        hdr->b_datacnt += 1;
        return (buf);
 }
@@ -1616,6 +1634,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_
                ASSERT3U(state->arcs_size, >=, size);
                atomic_add_64(&state->arcs_size, -size);
                buf->b_data = NULL;
+
+               /*
+                * If we're destroying a duplicate buffer make sure
+                * that the appropriate statistics are updated.
+                */
+               if (buf->b_hdr->b_datacnt > 1 &&
+                   buf->b_hdr->b_type == ARC_BUFC_DATA) {
+                       ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+                       ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
+               }
                ASSERT(buf->b_hdr->b_datacnt > 0);
                buf->b_hdr->b_datacnt -= 1;
        }
@@ -1800,6 +1828,48 @@ arc_buf_size(arc_buf_t *buf)
 }
 
 /*
+ * Called from the DMU to determine if the current buffer should be
+ * evicted. In order to ensure proper locking, the eviction must be initiated
+ * from the DMU. Return true if the buffer is associated with user data and
+ * duplicate buffers still exist.
+ */
+boolean_t
+arc_buf_eviction_needed(arc_buf_t *buf)
+{
+       arc_buf_hdr_t *hdr;
+       boolean_t evict_needed = B_FALSE;
+
+       if (zfs_disable_dup_eviction)
+               return (B_FALSE);
+
+       mutex_enter(&buf->b_evict_lock);
+       hdr = buf->b_hdr;
+       if (hdr == NULL) {
+               /*
+                * We are in arc_do_user_evicts(); let that function
+                * perform the eviction.
+                */
+               ASSERT(buf->b_data == NULL);
+               mutex_exit(&buf->b_evict_lock);
+               return (B_FALSE);
+       } else if (buf->b_data == NULL) {
+               /*
+                * We have already been added to the arc eviction list;
+                * recommend eviction.
+                */
+               ASSERT3P(hdr, ==, &arc_eviction_hdr);
+               mutex_exit(&buf->b_evict_lock);
+               return (B_TRUE);
+       }
+
+       if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+               evict_needed = B_TRUE;
+
+       mutex_exit(&buf->b_evict_lock);
+       return (evict_needed);
+}
+
+/*
  * Evict buffers from list until we've removed the specified number of
  * bytes.  Move the removed buffers to the appropriate evict state.
  * If the recycle flag is set, then attempt to "recycle" a buffer:
@@ -2885,8 +2955,10 @@ arc_read_done(zio_t *zio)
        abuf = buf;
        for (acb = callback_list; acb; acb = acb->acb_next) {
                if (acb->acb_done) {
-                       if (abuf == NULL)
+                       if (abuf == NULL) {
+                               ARCSTAT_BUMP(arcstat_duplicate_reads);
                                abuf = arc_buf_clone(buf);
+                       }
                        acb->acb_buf = abuf;
                        abuf = NULL;
                }
@@ -3401,6 +3473,16 @@ arc_release(arc_buf_t *buf, void *tag)
                        ASSERT3U(*size, >=, hdr->b_size);
                        atomic_add_64(size, -hdr->b_size);
                }
+
+               /*
+                * We're releasing a duplicate user data buffer, update
+                * our statistics accordingly.
+                */
+               if (hdr->b_type == ARC_BUFC_DATA) {
+                       ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
+                       ARCSTAT_INCR(arcstat_duplicate_buffers_size,
+                           -hdr->b_size);
+               }
                hdr->b_datacnt -= 1;
                arc_cksum_verify(buf);
 #ifdef illumos

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c      Sun Mar 
31 18:56:00 2013        (r248958)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c      Sun Mar 
31 19:03:25 2013        (r248959)
@@ -2071,7 +2071,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db,
                        dbuf_evict(db);
                } else {
                        VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
-                       if (!DBUF_IS_CACHEABLE(db))
+
+                       /*
+                        * A dbuf will be eligible for eviction if either the
+                        * 'primarycache' property is set or a duplicate
+                        * copy of this buffer is already cached in the arc.
+                        *
+                        * In the case of the 'primarycache' a buffer
+                        * is considered for eviction if it matches the
+                        * criteria set in the property.
+                        *
+                        * To decide if our buffer is considered a
+                        * duplicate, we must call into the arc to determine
+                        * if multiple buffers are referencing the same
+                        * block on-disk. If so, then we simply evict
+                        * ourselves.
+                        */
+                       if (!DBUF_IS_CACHEABLE(db) ||
+                           arc_buf_eviction_needed(db->db_buf))
                                dbuf_clear(db);
                        else
                                mutex_exit(&db->db_mtx);

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h   Sun Mar 
31 18:56:00 2013        (r248958)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h   Sun Mar 
31 19:03:25 2013        (r248959)
@@ -96,6 +96,7 @@ int arc_released(arc_buf_t *buf);
 int arc_has_callback(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
 void arc_buf_thaw(arc_buf_t *buf);
+boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
 #ifdef ZFS_DEBUG
 int arc_referenced(arc_buf_t *buf);
 #endif
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to