auto-sm-tracking.patch:
-----------------------
At some point, new linked lists were added to track state machines that
are currently running within the server. When an SM completes, it is
implicitly removed from the list. However, SMs that were started
without a request (i.e., internal state machines) were not added to any
list. This caused a segfault if any of these internal state machines
stopped (because the completion code assumes that all SMs should be
removed from a list). This patch corrects the problem by just making an
extra linked list for state machine instances that are not associated
with a particular request.
mgmt-getconfig-assignment.patch:
--------------------------------
This patch adds some missing variable assignments in the mgmt-getconfig
state machine. This problem might have previously caused pvfs2-validate
to complain about not being able to retrieve configuration data from
some servers.
mgmt-remove-dirent-handlecount.patch:
-------------------------------------
This patch updates mgmt-remove-dirent to match the normal rmdirent state
machine; a flag must now be set to inform trove when to update keyval
handle counts. Otherwise the directory entry counting will get out of
whack when pvfs2-remove-object is used.
skip-retry-delay-on-cancel.patch:
---------------------------------
PVFS2 adds a "retry delay" between operation retries on the client side
to prevent busy spinning (and quick retry exhaustion) on network errors
that appear quickly. However, it also applied this retry delay to
operations that had simply timed out. This causes timeouts and retries
to take excessively long; if an operation fails because of a job
timeout, then you don't need to artificially delay before trying again.
This patch addresses the problem by having the client state machines
differentiate between job timeouts and normal error codes.
diff -Naur pvfs2/src/server/pvfs2-server.c pvfs2-new/src/server/pvfs2-server.c
--- pvfs2/src/server/pvfs2-server.c 2006-10-18 18:01:12.000000000 +0200
+++ pvfs2-new/src/server/pvfs2-server.c 2006-11-02 21:59:59.000000000 +0100
@@ -96,6 +96,8 @@
static QLIST_HEAD(posted_sop_list);
/* A list of all serv_op's posted for expected messages alone */
static QLIST_HEAD(inprogress_sop_list);
+/* A list of all serv_op's that are started automatically without requests */
+static QLIST_HEAD(noreq_sop_list);
/* this is used externally by some server state machines */
job_context_id server_job_context = -1;
@@ -1918,6 +1920,9 @@
if (new_op)
{
+ /* add to list of state machines started without a request */
+ qlist_add_tail(&new_op->next, &noreq_sop_list);
+
/* execute first state */
ret = PINT_state_machine_invoke(new_op, &tmp_status);
if (ret < 0)
Index: pvfs2_src/src/client/sysint/mgmt-get-config.c
===================================================================
--- pvfs2_src/src/client/sysint/mgmt-get-config.c (revision 2491)
+++ pvfs2_src/src/client/sysint/mgmt-get-config.c (revision 2492)
@@ -75,6 +75,9 @@
mntent.fs_id = *fsid;
+ mntent.pvfs_fs_name = cur_fs->file_system_name;
+ sm_p->u.get_config.config = config;
+
sm_p->msgpair.enc_type = cur_fs->encoding;
sm_p->u.get_config.mntent = &mntent;
Index: pvfs2_src/src/server/mgmt-remove-dirent.sm
===================================================================
--- pvfs2_src/src/server/mgmt-remove-dirent.sm (revision 2488)
+++ pvfs2_src/src/server/mgmt-remove-dirent.sm (revision 2489)
@@ -120,7 +120,7 @@
s_op->u.mgmt_remove_dirent.dirdata_handle,
&s_op->key,
NULL,
- TROVE_SYNC,
+ TROVE_SYNC | TROVE_KEYVAL_HANDLE_COUNT,
NULL,
s_op,
0,
Index: pvfs2_src/src/common/misc/msgpairarray.sm
===================================================================
--- pvfs2_src/src/common/misc/msgpairarray.sm (revision 2977)
+++ pvfs2_src/src/common/misc/msgpairarray.sm (revision 2978)
@@ -47,7 +47,8 @@
enum
{
MSGPAIRS_COMPLETE = 190,
- MSGPAIRS_RETRY = 191
+ MSGPAIRS_RETRY = 191,
+ MSGPAIRS_RETRY_NODELAY = 192
};
static int msgpairarray_init(
@@ -100,6 +101,7 @@
{
run msgpairarray_completion_fn;
MSGPAIRS_RETRY => post_retry;
+ MSGPAIRS_RETRY_NODELAY => post;
default => return;
}
}
@@ -627,7 +629,21 @@
gossip_debug(GOSSIP_MSGPAIR_DEBUG,
"*** %s: msgpair %d failed, retry %d\n",
__func__, i, msg_p->retry_count);
- js_p->error_code = MSGPAIRS_RETRY;
+ if(msg_p->op_status == -BMI_ECANCEL)
+ {
+ /* if the error code indicates cancel, then skip the
+ * delay. We have probably already been waiting a while
+ */
+ gossip_debug(GOSSIP_MSGPAIR_DEBUG,
+ "*** %s: msgpair skipping retry delay.\n", __func__);
+ js_p->error_code = MSGPAIRS_RETRY_NODELAY;
+ }
+ else
+ {
+ gossip_debug(GOSSIP_MSGPAIR_DEBUG,
+ "*** %s: msgpair retrying after delay.\n", __func__);
+ js_p->error_code = MSGPAIRS_RETRY;
+ }
} else {
char s[1024];
Index: pvfs2_src/src/client/sysint/sys-io.sm
===================================================================
--- pvfs2_src/src/client/sysint/sys-io.sm (revision 2977)
+++ pvfs2_src/src/client/sysint/sys-io.sm (revision 2978)
@@ -32,6 +32,7 @@
IO_NO_DATA = 132,
IO_DATAFILE_TRANSFERS_COMPLETE,
IO_RETRY,
+ IO_RETRY_NODELAY,
IO_GET_DATAFILE_SIZE,
IO_ANALYZE_SIZE_RESULTS,
IO_DO_SMALL_IO
@@ -390,7 +391,8 @@
PVFS_TYPE_METAFILE,
0);
- if (js_p->error_code == IO_RETRY)
+ if (js_p->error_code == IO_RETRY ||
+ (js_p->error_code == IO_RETRY_NODELAY))
{
js_p->error_code = 0;
@@ -403,6 +405,13 @@
return 1;
}
+ if(js_p->error_code == IO_RETRY_NODELAY)
+ {
+ gossip_debug(GOSSIP_IO_DEBUG, " sys-io retrying without delay.\n");
+ js_p->error_code = 0;
+ return 1;
+ }
+ gossip_debug(GOSSIP_IO_DEBUG, " sys-io retrying with delay.\n");
return job_req_sched_post_timer(
sm_p->msgarray_params.retry_delay, sm_p, 0, js_p, &tmp_id,
pint_client_sm_context);
@@ -1272,7 +1281,18 @@
gossip_debug(GOSSIP_IO_DEBUG, "Retrying I/O operation "
"(attempt number %d)\n", sm_p->u.io.retry_count);
- js_p->error_code = IO_RETRY;
+ if(ret == -BMI_ECANCEL)
+ {
+ /* if we got a BMI cancellation, then it probably indicates a
+ * that a BMI timeout has expired; we should retry without
+ * introducing another delay
+ */
+ js_p->error_code = IO_RETRY_NODELAY;
+ }
+ else
+ {
+ js_p->error_code = IO_RETRY;
+ }
goto analyze_results_exit;
}
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers