Hi All,

I'm investigating a difficult-to-diagnose problem, and I came across the
following code in core_output_filter():


4152                             nvec = 0;
4153                             nbytes = 0;
4154                             temp = APR_BRIGADE_FIRST(temp_brig);
4155                             APR_BUCKET_REMOVE(temp);
4156                             APR_BRIGADE_INSERT_HEAD(b, temp);
4157                             apr_bucket_read(temp, &str, &n, 
APR_BLOCK_READ);
4158                             vec[nvec].iov_base = (char*) str;
4159                             vec[nvec].iov_len = n;
4160                             nvec++;
4161 
4162                             /* Just in case the temporary brigade has
4163                              * multiple buckets, recover the rest of
4164                              * them and put them in the brigade that
4165                              * we're sending.
4166                              */
4167                             for (next = APR_BRIGADE_FIRST(temp_brig);
4168                                  next != APR_BRIGADE_SENTINEL(temp_brig);
4169                                  next = APR_BRIGADE_FIRST(temp_brig)) {
4170                                 APR_BUCKET_REMOVE(next);
4171                                 APR_BUCKET_INSERT_AFTER(temp, next);
4172                                 temp = next;
4173                                 apr_bucket_read(next, &str, &n,
4174                                                 APR_BLOCK_READ);
4175                                 vec[nvec].iov_base = (char*) str;
4176                                 vec[nvec].iov_len = n;
4177                                 nvec++;
4178                             }
4179 
4180                             apr_brigade_destroy(temp_brig);

where

4034         struct iovec vec[MAX_IOVEC_TO_WRITE];

and

3995 #define MAX_IOVEC_TO_WRITE 16

but there's no explicit check here to ensure that nvec doesn't run past the
end of the array. :/  Our problem occurs in apr_brigade_destroy(temp_brig).

Now, it may be that temp_brig is built in such a way that this can never
happen, but I don't see how that is guaranteed, based on this code:

4116                             /* Create a temporary brigade as a means
4117                              * of concatenating a bunch of buckets together
4118                              */
4119                             if (last_merged_bucket) {
4120                                 /* If we've concatenated together small
4121                                  * buckets already in a previous pass,
4122                                  * the initial buckets in this brigade
4123                                  * are heap buckets that may have extra
4124                                  * space left in them (because they
4125                                  * were created by apr_brigade_write()).
4126                                  * We can take advantage of this by
4127                                  * building the new temp brigade out of
4128                                  * these buckets, so that the content
4129                                  * in them doesn't have to be copied again.
4130                                  */
4131                                 apr_bucket_brigade *bb;
4132                                 bb = apr_brigade_split(b,
4133                                          
APR_BUCKET_NEXT(last_merged_bucket));
4134                                 temp_brig = b;
4135                                 b = bb;
4136                             }
4137                             else {
4138                                 temp_brig = apr_brigade_create(f->c->pool,
4139                                                            
f->c->bucket_alloc);
4140                             }
4141 
4142                             temp = APR_BRIGADE_FIRST(b);
4143                             while (temp != e) {
4144                                 apr_bucket *d;
4145                                 rv = apr_bucket_read(temp, &str, &n, 
APR_BLOCK_READ);
4146                                 apr_brigade_write(temp_brig, NULL, NULL, 
str, n);
4147                                 d = temp;
4148                                 temp = APR_BUCKET_NEXT(temp);
4149                                 apr_bucket_delete(d);
4150                             }

Now, all this code is inside a big if:

4101                 if (n) {
4102                     if (!fd) {
4103                         if (nvec == MAX_IOVEC_TO_WRITE) {
4104                             /* woah! too many. buffer them up, for use 
later. */
4105                             apr_bucket *temp, *next;
4106                             apr_bucket_brigade *temp_brig;

which implies nvec could be carrying information over from previous
iterations through the code... but all of this is inside a huge
while loop where

4026     while (b && !APR_BRIGADE_EMPTY(b)) {
4027         apr_size_t nbytes = 0;
4028         apr_bucket *last_e = NULL; /* initialized for debugging */
4029         apr_bucket *e;
4030 
4031         /* one group of iovecs per pass over the brigade */
4032         apr_size_t nvec = 0;

which may reset the counter when it shouldn't. :(

BTW, yes, we are dealing with extremely large files (~1G+) when we hit
our problem.

Could someone who really understands this code figure out whether this is
possible, or am I barking up the wrong tree?

Here is the pstack for one such stuck thread:
----- Thread 18842 -----
0x4021fc28: apr_pool_cleanup_kill + 0x1e (42e06048, 42e064f0, 401203b0, 
8f20c40, 8f20c40, 42e064f4)
0x40120458: apr_brigade_destroy + 0x32 (42e064f0, 412126fc, 1, 1f40, 41212654, 
b7fdc48) + 1a0
0x08089e23: core_output_filter + 0x33f (42e06488, 42e064f0)
0x0808173e: ap_pass_brigade + 0x2c (42e06488, 42e064f0, 412127f4, 0, 8f2a5f8, 
0) + 20
0x08083ca8: ap_content_length_filter + 0x8e (8f2b260, 42e064f0)
0x0808173e: ap_pass_brigade + 0x2c (8f2b260, 42e064f0, 0, 0, 2000, 0) + 2050
0x08069c8f: ap_proxy_http_process_response + 0x3d9 (42e06048, 8f2a5f8, 
42e06540, 42e06758, 42e06558, 80f0130) + 70
0x0806a59d: ap_proxy_http_handler + 0x273 (8f2a5f8, 80f0130, 42e06650, 0, 0, 
0) + 10
0x080665e5: proxy_run_scheme_handler + 0x4b (8f2a5f8, 80f0130, 8f2bbde, 0, 0, 
8096caf) + 30
0x08065157: proxy_handler + 0x20d (8f2a5f8, 8120978, 41214994, 8f2a5f8, 
8f2a5f8, 8096c70)
0x08075a5a: ap_run_handler + 0x32 (8f2a5f8, 80a3220, 8f2a5f8, 8f2a5f8, 0, 
42e06150) + 20
0x08075fdc: ap_invoke_handler + 0xba (8f2a5f8, 0, 4, 8f2a5f8, 42e06150, 
8f2a5f8)
0x0806f88a: ap_process_request + 0x8e (8f2a5f8, 4, 8f2a5f8, 807f578, 42e06150, 
42e06080) + 10
0x0806abc1: ap_process_http_connection + 0x117 (42e06150, 42e06080, 41214a44, 
42e06150, 42e06048, 42e06080)
0x0807f458: ap_run_process_connection + 0x32 (42e06150, 42e06080, 42e06080, 
18c, 42e06148, 8f20598) + 20
0x08072249: process_socket + 0x7f (42e06048, 42e06080, 6, c, 8f20598, 0) + 20
0x080729d1: worker_thread + 0x195 (8123a48, 815d798)
0x4021abec: dummy_worker + 0x16 (8123a48, 0, 41214cf8, 0, 0, 0) + e0
0x402a2881: pthread_start_thread + 0x1b1 (41214be0, 41214be0, 0, 41214be0, 1, 
0) + bedeb40c


Here is a GDB backtrace of the problem thread:

#0  0x402a4a35 in __pthread_sigsuspend () from /lib/i686/libpthread.so.0
#1  0x402a3db8 in __pthread_wait_for_restart_signal ()
   from /lib/i686/libpthread.so.0
#2  0x402a0c8b in [EMAIL PROTECTED] () from /lib/i686/libpthread.so.0
#3  0x4021d1b6 in apr_thread_cond_wait (cond=0xfffffffc, mutex=0x8123588)
    at thread_cond.c:79
#4  0x0807515f in ap_queue_pop (queue=0x4022435c, sd=0x41914ab4, p=0x41914ab0)
    at fdqueue.c:257
#5  0x080728fb in worker_thread (thd=0xfffffffc, dummy=0xfffffffc)
    at worker.c:801
#6  0x4021abec in dummy_worker (opaque=0xfffffffc) at thread.c:88
#7  0x402a2881 in pthread_start_thread () from /lib/i686/libpthread.so.0


Thanks,
Ron

-- 
Ronald Park <[EMAIL PROTECTED]>

Reply via email to