Hi All,
I'm investigating a difficult-to-diagnose problem, and I came across the
following code in core_output_filter():
4152 nvec = 0;
4153 nbytes = 0;
4154 temp = APR_BRIGADE_FIRST(temp_brig);
4155 APR_BUCKET_REMOVE(temp);
4156 APR_BRIGADE_INSERT_HEAD(b, temp);
4157 apr_bucket_read(temp, &str, &n,
APR_BLOCK_READ);
4158 vec[nvec].iov_base = (char*) str;
4159 vec[nvec].iov_len = n;
4160 nvec++;
4161
4162 /* Just in case the temporary brigade has
4163 * multiple buckets, recover the rest of
4164 * them and put them in the brigade that
4165 * we're sending.
4166 */
4167 for (next = APR_BRIGADE_FIRST(temp_brig);
4168 next != APR_BRIGADE_SENTINEL(temp_brig);
4169 next = APR_BRIGADE_FIRST(temp_brig)) {
4170 APR_BUCKET_REMOVE(next);
4171 APR_BUCKET_INSERT_AFTER(temp, next);
4172 temp = next;
4173 apr_bucket_read(next, &str, &n,
4174 APR_BLOCK_READ);
4175 vec[nvec].iov_base = (char*) str;
4176 vec[nvec].iov_len = n;
4177 nvec++;
4178 }
4179
4180 apr_brigade_destroy(temp_brig);
where
4034 struct iovec vec[MAX_IOVEC_TO_WRITE];
and
3995 #define MAX_IOVEC_TO_WRITE 16
but there's no explicit check here to ensure we don't exceed the array's
boundary. :/ Our problem occurs in apr_brigade_destroy(temp_brig).
Now, it may be that temp_brig is such that this can never happen, but I
don't see how based on this code:
4116 /* Create a temporary brigade as a means
4117 * of concatenating a bunch of buckets together
4118 */
4119 if (last_merged_bucket) {
4120 /* If we've concatenated together small
4121 * buckets already in a previous pass,
4122 * the initial buckets in this brigade
4123 * are heap buckets that may have extra
4124 * space left in them (because they
4125 * were created by apr_brigade_write()).
4126 * We can take advantage of this by
4127 * building the new temp brigade out of
4128 * these buckets, so that the content
4129 * in them doesn't have to be copied again.
4130 */
4131 apr_bucket_brigade *bb;
4132 bb = apr_brigade_split(b,
4133
APR_BUCKET_NEXT(last_merged_bucket));
4134 temp_brig = b;
4135 b = bb;
4136 }
4137 else {
4138 temp_brig = apr_brigade_create(f->c->pool,
4139
f->c->bucket_alloc);
4140 }
4141
4142 temp = APR_BRIGADE_FIRST(b);
4143 while (temp != e) {
4144 apr_bucket *d;
4145 rv = apr_bucket_read(temp, &str, &n,
APR_BLOCK_READ);
4146 apr_brigade_write(temp_brig, NULL, NULL,
str, n);
4147 d = temp;
4148 temp = APR_BUCKET_NEXT(temp);
4149 apr_bucket_delete(d);
4150 }
Now, all this code is inside a big if:
4101 if (n) {
4102 if (!fd) {
4103 if (nvec == MAX_IOVEC_TO_WRITE) {
4104 /* woah! too many. buffer them up, for use
later. */
4105 apr_bucket *temp, *next;
4106 apr_bucket_brigade *temp_brig;
which implies nvec could be carrying information over from previous
iterations through the code... but all of this is inside a huge
while loop where
4026 while (b && !APR_BRIGADE_EMPTY(b)) {
4027 apr_size_t nbytes = 0;
4028 apr_bucket *last_e = NULL; /* initialized for debugging */
4029 apr_bucket *e;
4030
4031 /* one group of iovecs per pass over the brigade */
4032 apr_size_t nvec = 0;
which may reset the counter when it shouldn't. :(
BTW, yes, we are dealing with extremely large files (~1G+) when we hit
our problem.
Could someone who really understands this code figure out whether this is
actually possible, or am I barking up the wrong tree?
Here is the pstack for one such stuck thread:
----- Thread 18842 -----
0x4021fc28: apr_pool_cleanup_kill + 0x1e (42e06048, 42e064f0, 401203b0,
8f20c40, 8f20c40, 42e064f4)
0x40120458: apr_brigade_destroy + 0x32 (42e064f0, 412126fc, 1, 1f40, 41212654,
b7fdc48) + 1a0
0x08089e23: core_output_filter + 0x33f (42e06488, 42e064f0)
0x0808173e: ap_pass_brigade + 0x2c (42e06488, 42e064f0, 412127f4, 0, 8f2a5f8,
0) + 20
0x08083ca8: ap_content_length_filter + 0x8e (8f2b260, 42e064f0)
0x0808173e: ap_pass_brigade + 0x2c (8f2b260, 42e064f0, 0, 0, 2000, 0) + 2050
0x08069c8f: ap_proxy_http_process_response + 0x3d9 (42e06048, 8f2a5f8,
42e06540, 42e06758, 42e06558, 80f0130) + 70
0x0806a59d: ap_proxy_http_handler + 0x273 (8f2a5f8, 80f0130, 42e06650, 0, 0,
0) + 10
0x080665e5: proxy_run_scheme_handler + 0x4b (8f2a5f8, 80f0130, 8f2bbde, 0, 0,
8096caf) + 30
0x08065157: proxy_handler + 0x20d (8f2a5f8, 8120978, 41214994, 8f2a5f8,
8f2a5f8, 8096c70)
0x08075a5a: ap_run_handler + 0x32 (8f2a5f8, 80a3220, 8f2a5f8, 8f2a5f8, 0,
42e06150) + 20
0x08075fdc: ap_invoke_handler + 0xba (8f2a5f8, 0, 4, 8f2a5f8, 42e06150,
8f2a5f8)
0x0806f88a: ap_process_request + 0x8e (8f2a5f8, 4, 8f2a5f8, 807f578, 42e06150,
42e06080) + 10
0x0806abc1: ap_process_http_connection + 0x117 (42e06150, 42e06080, 41214a44,
42e06150, 42e06048, 42e06080)
0x0807f458: ap_run_process_connection + 0x32 (42e06150, 42e06080, 42e06080,
18c, 42e06148, 8f20598) + 20
0x08072249: process_socket + 0x7f (42e06048, 42e06080, 6, c, 8f20598, 0) + 20
0x080729d1: worker_thread + 0x195 (8123a48, 815d798)
0x4021abec: dummy_worker + 0x16 (8123a48, 0, 41214cf8, 0, 0, 0) + e0
0x402a2881: pthread_start_thread + 0x1b1 (41214be0, 41214be0, 0, 41214be0, 1,
0) + bedeb40c
Here is a GDB backtrace of the problem thread:
#0 0x402a4a35 in __pthread_sigsuspend () from /lib/i686/libpthread.so.0
#1 0x402a3db8 in __pthread_wait_for_restart_signal ()
from /lib/i686/libpthread.so.0
#2 0x402a0c8b in [EMAIL PROTECTED] () from /lib/i686/libpthread.so.0
#3 0x4021d1b6 in apr_thread_cond_wait (cond=0xfffffffc, mutex=0x8123588)
at thread_cond.c:79
#4 0x0807515f in ap_queue_pop (queue=0x4022435c, sd=0x41914ab4, p=0x41914ab0)
at fdqueue.c:257
#5 0x080728fb in worker_thread (thd=0xfffffffc, dummy=0xfffffffc)
at worker.c:801
#6 0x4021abec in dummy_worker (opaque=0xfffffffc) at thread.c:88
#7 0x402a2881 in pthread_start_thread () from /lib/i686/libpthread.so.0
Thanks,
Ron
--
Ronald Park <[EMAIL PROTECTED]>