commit 38e47f046d4d3f9657e9644417ad8b154b46a6e7
Author: Jeff Garzik <[email protected]>
Date:   Fri Mar 5 08:03:48 2010 -0500

    Self-check cleanups.
    
    - remove self-check period.  self-check period is externally controlled.
    
    - self-check initiation limited to specified list of administrative users
    
    - clear up confusion about check "start" versus "poke." "start" is
      preferred.
    
    - thread is not spawned until first check
    
    - add some whitespace for readability
    
    - Unrelated: test/server-test.cfg: s/spaces/tab/ in <CLD> clause
    
    Signed-off-by: Jeff Garzik <[email protected]>

diff --git a/doc/setup.txt b/doc/setup.txt
index f8d9f00..a5e7c66 100644
--- a/doc/setup.txt
+++ b/doc/setup.txt
@@ -104,15 +104,12 @@ _cld._udp.phx2.ex.com has SRV record 10 50 8081 
maika.phx2.ex.com.
                <Rack>F3R18</Rack>
        </Geo>
 
-*) configure the self-check period, if desired. Note that the actual amount
-   of time that the daemon sleeps between self-check sweeps is randomized
-   in order to prevent the convoy effect, so the value below is approximate.
+*) configure the list of users authorized to initiate background self-check
 
-       <!-- 30 * 60 * 60 == 108000 seconds -->
-       <SelfCheckPeriod>108000</SelfCheckPeriod>
-
-   The default is zero, which disables the periodic self-check. It may be
-   appropriate if the application performs its own end-to-end checking.
+       <Check>
+               <User>admin_user</User>
+               <User>another_user</User>
+       </Check>
 
 *) start daemon (it will put itself into the background) with the
    configuration file just created:
diff --git a/include/chunkc.h b/include/chunkc.h
index 5ebc936..683992e 100644
--- a/include/chunkc.h
+++ b/include/chunkc.h
@@ -89,7 +89,7 @@ extern bool stc_put_inline(struct st_client *stc, const void 
*key,
 extern bool stc_del(struct st_client *stc, const void *key, size_t key_len);
 extern bool stc_ping(struct st_client *stc);
 
-extern bool stc_check_poke(struct st_client *stc);
+extern bool stc_check_start(struct st_client *stc);
 extern bool stc_check_status(struct st_client *stc,
                             struct chunk_check_status *out);
 
diff --git a/lib/chunkdc.c b/lib/chunkdc.c
index 93783bf..e9e0e0f 100644
--- a/lib/chunkdc.c
+++ b/lib/chunkdc.c
@@ -978,13 +978,13 @@ bool stc_ping(struct st_client *stc)
        return true;
 }
 
-bool stc_check_poke(struct st_client *stc)
+bool stc_check_start(struct st_client *stc)
 {
        struct chunksrv_resp resp;
        struct chunksrv_req *req = (struct chunksrv_req *) stc->req_buf;
 
        if (stc->verbose)
-               fprintf(stderr, "libstc: CHECK_POKE\n");
+               fprintf(stderr, "libstc: CHECK_START\n");
 
        /* initialize request */
        req_init(stc, req);
@@ -1004,7 +1004,7 @@ bool stc_check_poke(struct st_client *stc)
        /* check response code */
        if (resp.resp_code != che_Success) {
                if (stc->verbose)
-                       fprintf(stderr, "CHECK_POKE resp code: %d\n",
+                       fprintf(stderr, "CHECK_START resp code: %d\n",
                                resp.resp_code);
                return false;
        }
diff --git a/server/chunkd.h b/server/chunkd.h
index 41e0a4b..42bcb21 100644
--- a/server/chunkd.h
+++ b/server/chunkd.h
@@ -223,8 +223,9 @@ struct server {
        char                    *group;
        uint32_t                nid;
        struct geo              loc;
-       time_t                  chk_period;
+
        int                     chk_pipe[2];
+       GList                   *chk_users;
 
        TCHDB                   *tbl_master;
        struct objcache         actives;
@@ -342,8 +343,7 @@ extern void resp_init_req(struct chunksrv_resp *resp,
 extern void read_config(void);
 
 /* selfcheck.c */
-extern void chk_init(void);
-extern int chk_spawn(time_t period, TCHDB *hdb);
+extern int chk_spawn(TCHDB *hdb);
 
 static inline bool use_sendfile(struct client *cli)
 {
diff --git a/server/config.c b/server/config.c
index fee2c0c..69b5e9b 100644
--- a/server/config.c
+++ b/server/config.c
@@ -41,6 +41,7 @@ struct config_context {
        bool            badnid;
        bool            in_ssl;
        bool            in_listen;
+       bool            in_chk;
        bool            have_ssl;
        char            *vol_path;
 
@@ -133,6 +134,13 @@ static void cfg_elm_start (GMarkupParseContext *context,
                        applog(LOG_ERR, "Nested CLD in configuration");
                }
        }
+       else if (!strcmp(element_name, "Check")) {
+               if (!cc->in_chk) {
+                       cc->in_chk = true;
+               } else {
+                       applog(LOG_ERR, "Nested Check in configuration");
+               }
+       }
 }
 
 static void cfg_elm_end_listen(struct config_context *cc)
@@ -293,7 +301,16 @@ static void cfg_elm_end (GMarkupParseContext *context,
        else if (!strcmp(element_name, "SSL"))
                cc->in_ssl = false;
 
-       else if (cc->in_ssl && cc->text && !strcmp(element_name, "PrivateKey")) 
{
+       else if (cc->in_chk && cc->text && !strcmp(element_name, "User")) {
+               chunkd_srv.chk_users =
+                       g_list_append(chunkd_srv.chk_users, cc->text);
+               cc->text = NULL;
+       }
+
+       else if (!strcmp(element_name, "Check"))
+               cc->in_chk = false;
+
+       else if (cc->in_ssl && cc->text && !strcmp(element_name, "PrivateKey")){
                if (SSL_CTX_use_PrivateKey_file(ssl_ctx, cc->text,
                                                SSL_FILETYPE_PEM) <= 0)
                        applog(LOG_ERR, "Failed to read SSL private key '%s'",
@@ -442,24 +459,6 @@ static void cfg_elm_end (GMarkupParseContext *context,
                cc->text = NULL;
        }
 
-       else if (!strcmp(element_name, "SelfCheckPeriod")) {
-               if (!cc->text) {
-                       applog(LOG_WARNING, "SelfCheckPeriod element empty");
-                       return;
-               }
-               n = strtol(cc->text, NULL, 10);
-               if (n < 0 || n >= LONG_MAX) {
-                       applog(LOG_ERR, "SelfCheckPeriod '%s' is invalid",
-                              cc->text);
-                       free(cc->text);
-                       cc->text = NULL;
-                       return;
-               }
-               chunkd_srv.chk_period = n;
-               free(cc->text);
-               cc->text = NULL;
-       }
-
        else {
                applog(LOG_WARNING, "Unknown element \"%s\"", element_name);
        }
diff --git a/server/selfcheck.c b/server/selfcheck.c
index f8cbaef..5207e7f 100644
--- a/server/selfcheck.c
+++ b/server/selfcheck.c
@@ -13,7 +13,6 @@
 #include "chunkd.h"
 
 struct chk_arg {
-       time_t period;
        TCHDB *hdb;
        // GThread *gthread;
 };
@@ -183,62 +182,46 @@ static void chk_thread_command(struct chk_tls *tls)
 
 static gpointer chk_thread_func(gpointer data)
 {
-       struct chk_tls _tls;
-       struct chk_tls *tls;
+       struct chk_tls _tls = { .arg = data };
+       struct chk_tls *tls = &_tls;
        struct pollfd pfd[1];
-       int tmo;
-       time_t dt;
        int i;
        int rc;
 
-       tls = &_tls;
-       memset(tls, 0, sizeof(struct chk_tls));
-       tls->arg = data;
-
        for (;;) {
                g_mutex_lock(chunkd_srv.bigmutex);
                chunkd_srv.chk_state = CHK_ST_IDLE;
                g_mutex_unlock(chunkd_srv.bigmutex);
 
-               /*
-                * Without the randomization, all systems in a low loaded
-                * datacenter are virtually guaranteed to start checks in the
-                * same time, blow fuses and/or disrupt applications.
-                */
-               dt = tls->arg->period;
-               if (dt == 0) {
-                       tmo = -1;
-               } else {
-                       if (dt >= 3)
-                               dt += rand() % (dt/3);
-                       if (debugging)
-                               applog(LOG_DEBUG, "chk: sleeping %lu s", dt);
-                       tmo = dt * 1000;
-               }
-
                memset(pfd, 0, sizeof(pfd));
                pfd[0].fd = chunkd_srv.chk_pipe[0];
                pfd[0].events = POLLIN;
 
-               rc = poll(pfd, 1, tmo);
+               rc = poll(pfd, ARRAY_SIZE(pfd), -1);
                if (rc < 0) {
                        applog(LOG_WARNING, "chk: poll error: %s",
                               strerror(errno));
                        break;  /* don't flood, just die */
                }
 
-               if (rc) {
-                       for (i = 0; i < ARRAY_SIZE(pfd); i++) {
-                               if (pfd[i].revents) {
-                                       if (i == 0) {
-                                               chk_thread_command(tls);
-                                       }
-                               }
+               if (rc == 0)
+                       continue;       /* should never happen */
+
+               for (i = 0; i < ARRAY_SIZE(pfd); i++) {
+                       if (!pfd[i].revents)
+                               continue;
+
+                       switch (i) {
+                       case 0:
+                               chk_thread_command(tls);
+                               break;
+                       default:
+                               /* do nothing */
+                               break;
                        }
-               } else {
-                       chk_thread_scan(tls);
                }
        }
+
        return NULL;
 }
 
@@ -249,7 +232,7 @@ static gpointer chk_thread_func(gpointer data)
  */
 static struct chk_arg *thread;
 
-int chk_spawn(time_t period, TCHDB *hdb)
+int chk_spawn(TCHDB *hdb)
 {
        GThread *gthread;
        struct chk_arg *arg;
@@ -260,7 +243,6 @@ int chk_spawn(time_t period, TCHDB *hdb)
                applog(LOG_ERR, "No core");
                return -1;
        }
-       arg->period = period;
        arg->hdb = hdb;
 
        gthread = g_thread_create(chk_thread_func, arg, FALSE, &error);
@@ -276,18 +258,3 @@ int chk_spawn(time_t period, TCHDB *hdb)
        return 0;
 }
 
-void chk_init()
-{
-       int rc;
-
-       if (!chunkd_srv.chk_period) {
-               if (debugging)
-                       applog(LOG_DEBUG, "chk: disabled by configuration");
-               return;
-       }
-
-       chunkd_srv.chk_state = CHK_ST_INIT;
-       rc = chk_spawn(chunkd_srv.chk_period, chunkd_srv.tbl_master);
-       if (rc)
-               exit(1);                /* message already logged */
-}
diff --git a/server/server.c b/server/server.c
index 1dd9d8f..0d83311 100644
--- a/server/server.c
+++ b/server/server.c
@@ -933,24 +933,47 @@ err_out:
        return cli_err(cli, err, false);
 }
 
-static bool chk_poke(struct client *cli)
+static bool chk_user_authorized(struct client *cli)
+{
+       GList *tmp = chunkd_srv.chk_users;
+
+       while (tmp) {
+               char *s;
+
+               s = tmp->data;
+               if (!strcmp(cli->user, s))
+                       return true;
+
+               tmp = tmp->next;
+       }
+
+       return false;
+}
+
+static bool chk_start(struct client *cli)
 {
        unsigned char cmd;
        int rc;
 
+       if (!chk_user_authorized(cli))
+               return cli_err(cli, che_AccessDenied, true);
+
        g_mutex_lock(chunkd_srv.bigmutex);
+
        switch (chunkd_srv.chk_state) {
        case CHK_ST_OFF:
                chunkd_srv.chk_state = CHK_ST_INIT;
                g_mutex_unlock(chunkd_srv.bigmutex);
-               rc = chk_spawn(chunkd_srv.chk_period, chunkd_srv.tbl_master);
+               rc = chk_spawn(chunkd_srv.tbl_master);
                if (rc)
                        return cli_err(cli, che_InternalError, true);
                break;
+
        case CHK_ST_INIT:
        case CHK_ST_RUNNING:
                g_mutex_unlock(chunkd_srv.bigmutex);
                return cli_err(cli, che_Busy, true);
+
        default:
                chunkd_srv.chk_state = CHK_ST_RUNNING;
                g_mutex_unlock(chunkd_srv.bigmutex);
@@ -966,8 +989,11 @@ static bool chk_status(struct client *cli)
        struct chunk_check_status outbuf;
 
        memset(&outbuf, 0, sizeof(struct chunk_check_status));
+
        g_mutex_lock(chunkd_srv.bigmutex);
+
        outbuf.lastdone = cpu_to_le64(chunkd_srv.chk_done);
+
        switch (chunkd_srv.chk_state) {
        case CHK_ST_IDLE:
                outbuf.state = chk_Idle;
@@ -979,6 +1005,7 @@ static bool chk_status(struct client *cli)
        default:
                outbuf.state = chk_Off;
        }
+
        g_mutex_unlock(chunkd_srv.bigmutex);
 
        return cli_resp_bin(cli, &outbuf, sizeof(struct chunk_check_status));
@@ -1110,7 +1137,7 @@ static bool cli_evt_exec_req(struct client *cli, unsigned 
int events)
                rcb = volume_open(cli);
                break;
        case CHO_CHECK_START:
-               rcb = chk_poke(cli);
+               rcb = chk_start(cli);
                break;
        case CHO_CHECK_STATUS:
                rcb = chk_status(cli);
@@ -1804,8 +1831,6 @@ int main (int argc, char *argv[])
                goto err_out_cld;
        }
 
-       chk_init();
-
        /* set up server networking */
        for (tmpl = chunkd_srv.listeners; tmpl; tmpl = tmpl->next) {
                rc = net_open(tmpl->data);
diff --git a/test/selfcheck-unit.c b/test/selfcheck-unit.c
index fbd9a09..ead5568 100644
--- a/test/selfcheck-unit.c
+++ b/test/selfcheck-unit.c
@@ -37,7 +37,6 @@ struct config_context {
        char *text;
 
        char *path;
-       long period;
 };
 
 static void cfg_elm_start(GMarkupParseContext *context,
@@ -56,20 +55,12 @@ static void cfg_elm_end(GMarkupParseContext *context,
                        GError          **error)
 {
        struct config_context *cc = user_data;
-       long n;
 
        if (!strcmp(element_name, "Path")) {
                OK(cc->text);
                free(cc->path);
                cc->path = cc->text;
                cc->text = NULL;
-       } else if (!strcmp(element_name, "SelfCheckPeriod")) {
-               OK(cc->text);
-               n = strtol(cc->text, NULL, 10);
-               OK(n >= 0 && n < LONG_MAX);
-               cc->period = n;
-               free(cc->text);
-               cc->text = NULL;
        } else {
                free(cc->text);
                cc->text = NULL;
@@ -251,7 +242,6 @@ int main(int argc, char *argv[])
        memset(&ctx, 0, sizeof(struct config_context));
        read_config(&ctx);
        OK(ctx.path);           /* must have a path */
-       OK(!ctx.period);        /* should have a SelfCheckPeriod set to off */
 
        /*
         * Step 1: create the object
@@ -275,11 +265,9 @@ int main(int argc, char *argv[])
        OK(rcb);
 
        /*
-        * Step 3: damage the back-end file and wait
-        * We sleep for longer than the self-check period because:
-        * 1) the server sleeps for up to period*1.5
-        * 2) the server may be quite busy walking numerous objects
-        * 3) the build system may be overloaded
+        * Step 3: damage the back-end file and wait, because:
+        * 1) the server may be quite busy walking numerous objects
+        * 2) the build system may be overloaded
         */
        rcb = be_file_damage(fn);
        OK(rcb);
@@ -291,7 +279,7 @@ int main(int argc, char *argv[])
        OK(stc);
        rcb = stc_check_status(stc, &status1);
        OK(rcb);
-       rcb = stc_check_poke(stc);
+       rcb = stc_check_start(stc);
        OK(rcb);
        cnt = 0;
        for (;;) {
diff --git a/test/server-test.cfg b/test/server-test.cfg
index 80833d2..d98fca4 100644
--- a/test/server-test.cfg
+++ b/test/server-test.cfg
@@ -23,7 +23,12 @@
 <NID>1</NID>
 
 <CLD>
-  <PortFile>cld.port</PortFile>
-  <Host>localhost</Host>
+       <PortFile>cld.port</PortFile>
+       <Host>localhost</Host>
 </CLD>
 
+<Check>
+       <User>testuser</User>
+       <User>testuser2</User>
+</Check>
+
diff --git a/tools/chcli.c b/tools/chcli.c
index 24b3e4b..6fde35f 100644
--- a/tools/chcli.c
+++ b/tools/chcli.c
@@ -185,7 +185,7 @@ static void show_cmds(void)
 "DEL key       Delete key\n"
 "PING          Ping server\n"
 "CHECKSTATUS   Fetch status of server self-check\n"
-"CHECKPOKE     Force server self-check\n"
+"CHECKSTART    Begin server self-check\n"
 "\n"
 "Keys provided on the command line (as opposed to via -k) are stored\n"
 "with a C-style nul terminating character appended, adding 1 byte to\n"
@@ -639,7 +639,7 @@ static int cmd_get(void)
        return 0;
 }
 
-static int cmd_check_poke(void)
+static int cmd_check_start(void)
 {
        struct st_client *stc;
 
@@ -647,8 +647,8 @@ static int cmd_check_poke(void)
        if (!stc)
                return 1;
 
-       if (!stc_check_poke(stc)) {
-               fprintf(stderr, "CHECK POKE failed\n");
+       if (!stc_check_start(stc)) {
+               fprintf(stderr, "CHECK START failed\n");
                stc_free(stc);
                return 1;
        }
@@ -758,7 +758,7 @@ int main (int argc, char *argv[])
        case CHC_CHECKSTATUS:
                return cmd_check_status();
        case CHC_CHECKSTART:
-               return cmd_check_poke();
+               return cmd_check_start();
        }
 
        return 0;
--
To unsubscribe from this list: send the line "unsubscribe hail-devel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to