I don't know if all of this is generally useful, but some of it may be.
job-flow-err-log.patch:
-----------------------
- adds flow pointer value to several debugging messages so they are
easier to match up
- adds error messages if a flow is cancelled. It seems like it is
helpful to see in the server logs if flows are timing out. The possible
downside is that it also makes error conditions noisier for applications that
use the pvfs2 library directly.
- adds debugging messages to show how many flows and dbpf I/O operations
are in progress at certain points
trove-debug-log.patch:
-----------------------
- adds debugging messages to print out offsets/sizes/etc. of aio
operations posted within trove
-Phil
diff -Naur pvfs2-old/src/io/flow/flowproto-bmi-trove/flowproto-multiqueue.c pvfs2/src/io/flow/flowproto-bmi-trove/flowproto-multiqueue.c
--- pvfs2-old/src/io/flow/flowproto-bmi-trove/flowproto-multiqueue.c 2005-12-20 01:08:28.000000000 +0100
+++ pvfs2/src/io/flow/flowproto-bmi-trove/flowproto-multiqueue.c 2006-03-26 20:57:14.000000000 +0200
@@ -461,6 +461,7 @@
{
struct fp_private_data *flow_data = PRIVATE_FLOW(flow_d);
+ gossip_err("Flow proto cancel called on %p\n", flow_d);
gen_mutex_lock(flow_data->parent->flow_mutex);
/*
if the flow is already marked as complete, then there is nothing
@@ -682,8 +683,8 @@
void *tmp_user_ptr;
gossip_debug(GOSSIP_FLOW_PROTO_DEBUG,
- "flowproto-multiqueue bmi_recv_callback_fn, error code: %d.\n",
- error_code);
+ "flowproto-multiqueue bmi_recv_callback_fn, error code: %d, flow: %p.\n",
+ error_code, flow_data->parent);
q_item->posted_id = 0;
@@ -868,8 +869,8 @@
q_item = result_tmp->q_item;
gossip_debug(GOSSIP_FLOW_PROTO_DEBUG,
- "flowproto-multiqueue trove_read_callback_fn, error_code: %d.\n",
- error_code);
+ "flowproto-multiqueue trove_read_callback_fn, error_code: %d, flow: %p.\n",
+ error_code, flow_data->parent);
result_tmp->posted_id = 0;
@@ -980,7 +981,8 @@
gossip_debug(GOSSIP_FLOW_PROTO_DEBUG,
"flowproto-multiqueue bmi_send_callback_fn, error_code: %d, "
- "initial_call_flag: %d.\n", error_code, initial_call_flag);
+ "initial_call_flag: %d, flow: %p.\n", error_code, initial_call_flag,
+ flow_data->parent);
if(flow_data->parent->error_code != 0 && initial_call_flag)
{
@@ -1214,8 +1216,8 @@
gossip_debug(
GOSSIP_FLOW_PROTO_DEBUG,
- "flowproto-multiqueue trove_write_callback_fn, error_code: %d.\n",
- error_code);
+ "flowproto-multiqueue trove_write_callback_fn, error_code: %d, flow: %p.\n",
+ error_code, flow_data->parent);
result_tmp->posted_id = 0;
@@ -1490,8 +1492,8 @@
enum bmi_buffer_type buffer_type = BMI_EXT_ALLOC;
gossip_debug(GOSSIP_FLOW_PROTO_DEBUG,
- "flowproto-multiqueue mem_to_bmi_callback_fn, error_code: %d.\n",
- error_code);
+ "flowproto-multiqueue mem_to_bmi_callback_fn, error_code: %d, flow: %p.\n",
+ error_code, flow_data->parent);
q_item->posted_id = 0;
@@ -1673,8 +1675,8 @@
PVFS_size region_size;
gossip_debug(GOSSIP_FLOW_PROTO_DEBUG,
- "flowproto-multiqueue bmi_to_mem_callback_fn, error_code: %d.\n",
- error_code);
+ "flowproto-multiqueue bmi_to_mem_callback_fn, error_code: %d, flow: %p.\n",
+ error_code, flow_data->parent);
q_item->posted_id = 0;
@@ -1863,15 +1865,16 @@
int ret;
gossip_debug(GOSSIP_FLOW_PROTO_DEBUG,
- "flowproto-multiqueue error cleanup path.\n");
+ "flowproto-multiqueue handle_io_error() called for flow %p.\n",
+ flow_data->parent);
/* is this the first error registered for this particular flow? */
if(flow_data->parent->error_code == 0)
{
enum flow_endpoint_type src, dest;
+
+ gossip_err("Flow proto error cleanup started on %p, error_code: %d\n", flow_data->parent, error_code);
- gossip_debug(GOSSIP_FLOW_PROTO_DEBUG,
- "flowproto-multiqueue first failure.\n");
flow_data->parent->error_code = error_code;
if(q_item)
{
@@ -1924,6 +1927,8 @@
/* impossible condition */
assert(0);
}
+ gossip_err("Flow proto %p canceling a total of %d BMI or Trove operations\n",
+ flow_data->parent, flow_data->cleanup_pending_count);
}
else
{
@@ -1931,11 +1936,15 @@
flow_data->cleanup_pending_count--;
}
- gossip_debug(GOSSIP_FLOW_PROTO_DEBUG, "%s: cleanup_pending_count: %d\n",
- __func__, flow_data->cleanup_pending_count);
+ gossip_debug(GOSSIP_FLOW_PROTO_DEBUG,
+ "flowproto-multiqueue handle_io_error() pending count: %d\n",
+ flow_data->cleanup_pending_count);
if(flow_data->cleanup_pending_count == 0)
{
+ gossip_err("Flow proto error cleanup finished %p, error_code: %d\n",
+ flow_data->parent, flow_data->parent->error_code);
+
/* we are finished, make sure error is marked and state is set */
assert(flow_data->parent->error_code);
/* we are in trouble if more than one callback function thinks that
diff -Naur pvfs2-old/src/io/job/job-time-mgr.c pvfs2/src/io/job/job-time-mgr.c
--- pvfs2-old/src/io/job/job-time-mgr.c 2005-10-25 20:00:59.000000000 +0200
+++ pvfs2/src/io/job/job-time-mgr.c 2006-03-26 20:57:14.000000000 +0200
@@ -15,6 +15,7 @@
#include "gen-locks.h"
#include "gossip.h"
#include "job-time-mgr.h"
+#include "pvfs2-internal.h"
static QLIST_HEAD(bucket_queue);
static gen_mutex_t bucket_mutex = GEN_MUTEX_INITIALIZER;
@@ -254,8 +255,7 @@
switch(jd->type)
{
case JOB_BMI:
- gossip_debug(GOSSIP_CANCEL_DEBUG, "%s: cancelling bmi.\n",
- __func__);
+ gossip_err("Job time out: cancelling bmi operation, job_id: %llu.\n", llu(jd->job_id));
ret = job_bmi_cancel(jd->job_id, jd->context_id);
break;
case JOB_FLOW:
@@ -272,14 +272,12 @@
else
{
/* otherwise kill the flow */
- gossip_debug(GOSSIP_CANCEL_DEBUG,
- "%s: cancelling flow.\n", __func__);
+ gossip_err("Job time out: cancelling flow operation, job_id: %llu.\n", llu(jd->job_id));
ret = job_flow_cancel(jd->job_id, jd->context_id);
}
break;
case JOB_TROVE:
- gossip_debug(GOSSIP_CANCEL_DEBUG,
- "%s: cancelling trove.\n", __func__);
+ gossip_err("Job time out: cancelling trove operation, job_id: %llu.\n", llu(jd->job_id));
ret = job_trove_dspace_cancel(
jd->u.trove.fsid, jd->job_id, jd->context_id);
break;
diff -Naur pvfs2-old/src/io/job/job.c pvfs2/src/io/job/job.c
--- pvfs2-old/src/io/job/job.c 2006-03-09 22:12:53.000000000 +0100
+++ pvfs2/src/io/job/job.c 2006-03-26 20:57:26.000000000 +0200
@@ -1223,6 +1223,8 @@
*id = jd->job_id;
flow_pending_count++;
jd->event_type = PVFS_EVENT_FLOW;
+ gossip_debug(GOSSIP_FLOW_DEBUG, "Job flows in progress (post time): %d\n",
+ flow_pending_count);
return(job_time_mgr_add(jd, timeout_sec));
}
@@ -4547,6 +4549,8 @@
tmp_desc->completed_flag = 1;
flow_pending_count--;
+ gossip_debug(GOSSIP_FLOW_DEBUG, "Job flows in progress (callback time): %d\n",
+ flow_pending_count);
#ifdef __PVFS2_JOB_THREADED__
/* wake up anyone waiting for completion */
diff -Naur pvfs2-old/src/io/trove/trove-dbpf/dbpf-bstream.c pvfs2/src/io/trove/trove-dbpf/dbpf-bstream.c
--- pvfs2-old/src/io/trove/trove-dbpf/dbpf-bstream.c 2006-03-23 21:12:49.000000000 +0100
+++ pvfs2/src/io/trove/trove-dbpf/dbpf-bstream.c 2006-03-26 20:57:26.000000000 +0200
@@ -251,6 +251,8 @@
{
s_dbpf_ios_in_progress--;
}
+ gossip_debug(GOSSIP_TROVE_DEBUG, "DBPF I/O ops in progress: %d\n",
+ s_dbpf_ios_in_progress);
if (s_dbpf_io_ready_queue == NULL)
{
@@ -369,6 +371,10 @@
"(%d already in progress)\n",
cur_op, s_dbpf_ios_in_progress);
}
+
+ gossip_debug(GOSSIP_TROVE_DEBUG, "DBPF I/O ops in progress: %d\n",
+ s_dbpf_ios_in_progress);
+
gen_mutex_unlock(&s_dbpf_io_mutex);
if (!op_delayed)
---------------------
PatchSet 478
Date: 2006/03/08 15:52:47
Author: pcarns
Branch: HEAD
Tag: (none)
Log:
added new trove debugging messages to show arguments to lio_listio() calls
[artf32976]
Members:
src/io/trove/trove-dbpf/dbpf-bstream.c:1.6->1.7
Index: src/io/trove/trove-dbpf/dbpf-bstream.c
diff -u src/io/trove/trove-dbpf/dbpf-bstream.c:1.6 src/io/trove/trove-dbpf/dbpf-bstream.c:1.7
--- src/io/trove/trove-dbpf/dbpf-bstream.c:1.6 Tue Jan 10 08:19:46 2006
+++ src/io/trove/trove-dbpf/dbpf-bstream.c Wed Mar 8 08:52:47 2006
@@ -294,6 +294,19 @@
aiocb_ptr_array[i] = &aiocbs[i];
}
+ gossip_debug(GOSSIP_TROVE_DEBUG,
+ "lio_listio called with the following aiocbs:\n");
+ for(i=0; i<aiocb_inuse_count; i++)
+ {
+ gossip_debug(GOSSIP_TROVE_DEBUG,
+ "aiocb_ptr_array[%d]: fd: %d, off: %lld, bytes: %d, buf: %p, type: %d\n",
+ i,
+ aiocb_ptr_array[i]->aio_fildes,
+ lld(aiocb_ptr_array[i]->aio_offset),
+ (int)aiocb_ptr_array[i]->aio_nbytes,
+ aiocb_ptr_array[i]->aio_buf,
+ (int)aiocb_ptr_array[i]->aio_lio_opcode);
+ }
ret = lio_listio(LIO_NOWAIT, aiocb_ptr_array, aiocb_inuse_count,
&cur_op->op.u.b_rw_list.sigev);
@@ -328,6 +341,7 @@
int aiocb_inuse_count, struct sigevent *sig, int dec_first)
{
int ret = -TROVE_EINVAL, op_delayed = 0;
+ int i;
assert(cur_op);
gen_mutex_lock(&s_dbpf_io_mutex);
@@ -373,6 +387,18 @@
if (!op_delayed)
{
+ gossip_debug(GOSSIP_TROVE_DEBUG,
+ "lio_listio called with the following aiocbs:\n");
+ for(i=0; i<aiocb_inuse_count; i++)
+ {
+ gossip_debug(GOSSIP_TROVE_DEBUG,
+ "aiocb_ptr_array[%d]: fd: %d, off: %lld, bytes: %d, buf: %p, type: %d\n",
+ i, aiocb_ptr_array[i]->aio_fildes,
+ lld(aiocb_ptr_array[i]->aio_offset),
+ (int)aiocb_ptr_array[i]->aio_nbytes,
+ aiocb_ptr_array[i]->aio_buf,
+ (int)aiocb_ptr_array[i]->aio_lio_opcode);
+ }
ret = lio_listio(LIO_NOWAIT, aiocb_ptr_array,
aiocb_inuse_count, sig);
if (ret != 0)
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers