Sometimes during operation tabled starts flooding a message "read obj".

I thought it was something about replication thread fetching incomplete
objects, but actually it is caused by attempts to read from a closed
object. It is purely dictated by load levels.

This fix closes the obvious avenue, but leaves any "potential" ones.
I want to see if they ever hit.

Also, fix up messages a bit. In a redundant environment it is necessary
to know NIDs so that a broken Chunk can be isolated, so add that.

Signed-off-by: Pete Zaitcev <zait...@redhat.com>

---
 server/object.c |   32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

commit 146b9efd8628343b7ecbfe879ce2c1b09e797bba
Author: Pete Zaitcev <zait...@yahoo.com>
Date:   Thu Aug 5 20:40:29 2010 -0600

    Fix floods of "read obj(4FF1)".

diff --git a/server/object.c b/server/object.c
index d495d6e..f8e7b12 100644
--- a/server/object.c
+++ b/server/object.c
@@ -356,10 +356,14 @@ static bool object_put_end(struct client *cli)
                        if (debugging) {
                                /* FIXME how do we test for inline objects 
here? */
                                if (!stor_obj_test(ochunk, cli->out_objid))
-                                       applog(LOG_ERR, "Stat (%llX) failed",
+                                       applog(LOG_ERR,
+                                              "Stat failed nid %u oid %llX",
+                                              ochunk->node->id,
                                               (unsigned long long) 
cli->out_objid);
                                else
-                                       applog(LOG_DEBUG, "STORED %llX, size -",
+                                       applog(LOG_DEBUG,
+                                              "STORED nid %u oid %llX, size -",
+                                              ochunk->node->id,
                                               (unsigned long long) 
cli->out_objid);
                        }
                        obj_addr.nidv[nok] = GUINT32_TO_LE(ochunk->node->id);
@@ -705,8 +709,9 @@ static struct open_chunk *open_chunk1(struct storage_node 
*stnode,
 
        rc = stor_put_start(ochunk, object_put_event, objid, content_len);
        if (rc != 0) {
-               applog(LOG_WARNING, "Cannot start putting for %llX (%d)",
-                      (unsigned long long) objid, rc);
+               applog(LOG_WARNING,
+                      "Cannot start putting for nid %u oid %llX (%d)",
+                      ochunk->node->id, (unsigned long long) objid, rc);
                goto err_start;
        }
 
@@ -982,6 +987,7 @@ void cli_in_end(struct client *cli)
                return;
 
        stor_close(&cli->in_ce);
+       cli->in_len = 0;
 }
 
 static bool object_get_more(struct client *cli, void *cb_data, bool done);
@@ -995,6 +1001,13 @@ static bool object_get_poke(struct client *cli)
        char *buf;
        ssize_t bytes;
 
+       /* The checks for in_len in caller should protect us, but let's see. */
+       if (!cli->in_ce.stc) {
+               applog(LOG_ERR, "read on closed chunk, in_len %ld",
+                      (long) cli->in_len);
+               return false;
+       }
+
        /* XXX flow control - what treshold? */
        /*  if (cli_wqueued(cli) >= 1000000000) return false; */
 
@@ -1005,8 +1018,9 @@ static bool object_get_poke(struct client *cli)
        bytes = stor_get_buf(&cli->in_ce, buf,
                             MIN(cli->in_len, CLI_DATA_BUF_SZ));
        if (bytes < 0) {
-               applog(LOG_ERR, "read obj(%llX) failed",
-                      (unsigned long long) cli->in_objid);
+               applog(LOG_ERR, "read failed nid %u oid %llX (%d)",
+                      cli->in_ce.node->id, (unsigned long long) cli->in_objid,
+                      (int) bytes);
                goto err_out;
        }
        if (bytes == 0) {
@@ -1048,6 +1062,8 @@ static bool object_get_more(struct client *cli, void 
*cb_data, bool done)
        /* do not queue more, if !completion or fd was closed early */
        if (!done)      /* FIXME We used to test for input errors here. */
                return false;
+       if (!cli->in_len)
+               return false;
 
        return object_get_poke(cli);            /* won't hurt to try */
 }
@@ -1323,8 +1339,8 @@ static bool object_get_body(struct client *cli, const 
char *user,
 
        bytes = stor_get_buf(&cli->in_ce, buf, MIN(cli->in_len, sizeof(buf)));
        if (bytes < 0) {
-               applog(LOG_ERR, "read obj(%llX) failed",
-                      (unsigned long long) cli->in_objid);
+               applog(LOG_ERR, "read failed nid %u oid %llX",
+                      cli->in_ce.node->id, (unsigned long long) cli->in_objid);
                goto err_out_in_end;
        }
        if (bytes == 0) {
--
To unsubscribe from this list: send the line "unsubscribe hail-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to