On 7/15/22 12:36 PM, yla...@apache.org wrote:
> Author: ylavic
> Date: Fri Jul 15 10:36:24 2022
> New Revision: 1902731
> 
> URL: http://svn.apache.org/viewvc?rev=1902731&view=rev
> Log:
> util_pcre: Add a thread local subpool cache for when stack does not suffice.
> 
> When AP_HAS_THREAD_LOCAL is available, use a thread-local match_thread_state 
> to
> save per-thread data in a subpool of the thread's pool.
> 
> If private_malloc() gets out of the stack buffer and the current thread has a
> pool (i.e. ap_thread_current() != NULL), it will apr_palloc()ate and return
> memory from the subpool.
> 
> When the match is complete and the match_data are freed, the thread subpool is
> cleared thus giving back the memory to the allocator, which itself will give
> back the memory or recycle it depending on its max_free setting.
> 
> * util_pcre.c:
>   Restore POSIX_MALLOC_THRESHOLD since this is part of the user API.
> 
> * util_pcre.c(match_data_pt):
>   Type not used (explicitly) anymore, axe.
>   
> * util_pcre.c(struct match_data_state):
>   Put the stack buffer there to simplify code (the state is allocated on
>   stack anyway).
>   If APREG_USE_THREAD_LOCAL, add the apr_thread_t* and match_thread_state*
>   fields that track the thread local data for the match.
> 
> * util_pcre.c(alloc_match_data, free_match):
>   Renamed to setup_state() and cleanup_state(), simplified (no stack buffer
>   parameters anymore).
>   cleanup_state() now clears the thread local subpool if used during the 
> match.
>   setup_state() sets state->thd to ap_thread_current(), thus NULL if it's not a
>   suitable thread for using thread local data.
> 
> * util_pcre.c(private_malloc):
>   Fix a possible buf_used overflow (size <= avail < APR_ALIGN_DEFAULT(size)).
>   Create the thread local subpool (once per thread) and allocate from there
>   when stack space is missing and state->thd != NULL, otherwise fall back to
>   malloc() still.
> 
> * util_pcre.c(private_free):
>   Do nothing for thread local subpool memory, will be freed in cleanup_state
>   eventually.
> 
> 
> Modified:
>     httpd/httpd/trunk/server/util_pcre.c
> 
> Modified: httpd/httpd/trunk/server/util_pcre.c
> URL: 
> http://svn.apache.org/viewvc/httpd/httpd/trunk/server/util_pcre.c?rev=1902731&r1=1902730&r2=1902731&view=diff
> ==============================================================================
> --- httpd/httpd/trunk/server/util_pcre.c (original)
> +++ httpd/httpd/trunk/server/util_pcre.c Fri Jul 15 10:36:24 2022

> @@ -257,105 +268,201 @@ AP_DECLARE(int) ap_regcomp(ap_regex_t *
>   *              Match a regular expression       *
>   *************************************************/
>  
> -/* Unfortunately, PCRE requires 3 ints of working space for each captured
> +/* Unfortunately, PCRE1 requires 3 ints of working space for each captured
>   * substring, so we have to get and release working store instead of just 
> using
>   * the POSIX structures as was done in earlier releases when PCRE needed 
> only 2
>   * ints. However, if the number of possible capturing brackets is small, use 
> a
>   * block of store on the stack, to reduce the use of malloc/free. The 
> threshold
> - * is in a macro that can be changed at configure time.
> - * Yet more unfortunately, PCRE2 wants an opaque context by providing the API
> - * to allocate and free it, so to minimize these calls we maintain one opaque
> - * context per thread (in Thread Local Storage, TLS) grown as needed, and 
> while
> - * at it we do the same for PCRE1 ints vectors. Note that this requires a 
> fast
> - * TLS mechanism to be worth it, which is the case of 
> apr_thread_data_get/set()
> - * from/to ap_thread_current() when AP_HAS_THREAD_LOCAL; otherwise we'll do
> - * the allocation and freeing for each ap_regexec().
> + * is in POSIX_MALLOC_THRESHOLD macro that can be changed at configure time.
> + * PCRE2 takes an opaque match context and lets us provide the callbacks to
> + * manage the memory needed during the match, so we can still use a small 
> stack
> + * space that'll suffice for regexes up to POSIX_MALLOC_THRESHOLD captures 
> (and
> + * not too complex), above that either use some thread local subpool cache 
> (if
> + * AP_HAS_THREAD_LOCAL) or fall back to malloc()/free().
>   */
>  
> +#if AP_HAS_THREAD_LOCAL && !defined(APREG_NO_THREAD_LOCAL)
> +#define APREG_USE_THREAD_LOCAL 1
> +#else
> +#define APREG_USE_THREAD_LOCAL 0
> +#endif
> +
>  #ifdef HAVE_PCRE2
> -typedef pcre2_match_data* match_data_pt;
> -typedef size_t*           match_vector_pt;
> +typedef PCRE2_SIZE* match_vector_pt;
>  #else
> -typedef int*              match_data_pt;
> -typedef int*              match_vector_pt;
> +typedef int*        match_vector_pt;
>  #endif
>  
> -struct match_data_state
> -{
> -    match_data_pt data;
> -    char *buf;
> -    apr_size_t buf_len;
> +#if APREG_USE_THREAD_LOCAL
> +struct match_thread_state {
> +    char *heap;
> +    apr_size_t heap_size;
> +    apr_size_t heap_used;
> +    apr_pool_t *pool;
> +};
> +
> +AP_THREAD_LOCAL static struct match_thread_state *thread_state;
> +#endif
> +
> +struct match_data_state {
> +    /* keep first, struct aligned */
> +    char buf[AP_PCRE_STACKBUF_SIZE];
>      apr_size_t buf_used;
> +
> +#if APREG_USE_THREAD_LOCAL
> +    apr_thread_t *thd;
> +    struct match_thread_state *ts;
> +#endif
> +
>  #ifdef HAVE_PCRE2
>      pcre2_general_context *pcre2_ctx;
> +    pcre2_match_data* match_data;
> +#else
> +    int *match_data;
>  #endif
>  };
>  
>  static void * private_malloc(size_t size, void *ctx)
>  {
>      struct match_data_state *state = ctx;
> +    apr_size_t avail;
>  
> -    if(size <= (state->buf_len - state->buf_used)) {
> +    avail = sizeof(state->buf) - state->buf_used;
> +    if (avail >= size) {
>          void *p = state->buf + state->buf_used;
> +        size = APR_ALIGN_DEFAULT(size);
> +        if (size > avail) {
> +            size = avail;
> +        }
> +        state->buf_used += size;
> +        return p;
> +    }
>  
> -        state->buf_used += APR_ALIGN_DEFAULT(size);
> +#if APREG_USE_THREAD_LOCAL
> +    if (state->thd) {
> +        struct match_thread_state *ts = thread_state;
> +        void *p;
> +
> +        if (!ts) {
> +            apr_pool_t *tp = apr_thread_pool_get(state->thd);
> +            ts = apr_pcalloc(tp, sizeof(*ts));
> +            apr_pool_create(&ts->pool, tp);
> +            thread_state = state->ts = ts;
> +        }
> +        else if (!state->ts) {
> +            ts->heap_used = 0;
> +            state->ts = ts;
> +        }
>  
> +        avail = ts->heap_size - ts->heap_used;
> +        if (avail >= size) {
> +            size = APR_ALIGN_DEFAULT(size);
> +            if (size > avail) {
> +                size = avail;
> +            }
> +            p = ts->heap + ts->heap_used;
> +        }
> +        else {
> +            ts->heap_size *= 2;
> +            size = APR_ALIGN_DEFAULT(size);
> +            if (ts->heap_size < size) {
> +                ts->heap_size = size;
> +            }
> +            if (ts->heap_size < AP_PCRE_STACKBUF_SIZE * 2) {
> +                ts->heap_size = AP_PCRE_STACKBUF_SIZE * 2;
> +            }
> +            ts->heap = apr_palloc(ts->pool, ts->heap_size);
> +            ts->heap_used = 0;
> +            p = ts->heap;

I think that apr_palloc should be efficient enough for allocating memory
quickly, and that we don't need to reserve bigger memory chunks and manage
them here locally — that looks like what apr_palloc already does.

> +        }
> +
> +        ts->heap_used += size;
>          return p;
>      }
> -    else {
> -        return malloc(size);
> -    }
> +#endif
> +
> +    return malloc(size);
>  }
>  
>  static void private_free(void *block, void *ctx)
>  {
>      struct match_data_state *state = ctx;
> -    void *buf_start = state->buf;
> -    void *buf_end = state->buf + state->buf_len;
> +    char *p = block;
>  
> -    if (block >= buf_start && block <= buf_end) {
> +    if (p >= state->buf && p < state->buf + sizeof(state->buf)) {
>          /* This block allocated from stack buffer. Do nothing. */
> +        return;
>      }
> -    else {
> -        free(block);
> +
> +#if APREG_USE_THREAD_LOCAL
> +    if (state->thd) {
> +        /* Freed in cleanup_state() eventually. */
> +        return;
>      }
> +#endif
> +
> +    free(block);
>  } 
>  
>  static APR_INLINE
> -match_data_pt alloc_match_data(apr_size_t size,
> -                               struct match_data_state *state,
> -                               void *stack_buf,
> -                               apr_size_t stack_buf_len)
> +int setup_state(struct match_data_state *state, apr_uint32_t ncaps)
>  {
> -    state->buf = stack_buf;
> -    state->buf_len = stack_buf_len;
>      state->buf_used = 0;
> +#if APREG_USE_THREAD_LOCAL
> +    state->thd = ap_thread_current();
> +    state->ts = NULL;
> +#endif
>  
>  #ifdef HAVE_PCRE2
> -    state->pcre2_ctx = pcre2_general_context_create(private_malloc, 
> private_free, state);
> +    state->pcre2_ctx = pcre2_general_context_create(private_malloc,
> +                                                    private_free, state);
>      if (!state->pcre2_ctx) { 
> -        return NULL;
> +        return 0;
>      }
> -    state->data = pcre2_match_data_create((int)size, state->pcre2_ctx);
> -    if (!state->data) {
> +
> +    state->match_data = pcre2_match_data_create(ncaps, state->pcre2_ctx);
> +    if (!state->match_data) {
>          pcre2_general_context_free(state->pcre2_ctx);
> -        return NULL;
> +        return 0;
>      }
>  #else
> -    state->data = private_malloc(size * sizeof(int) * 3, state);
> +    if (ncaps) {
> +        state->match_data = private_malloc(ncaps * sizeof(int) * 3, state);
> +        if (!state->match_data) {
> +            return 0;
> +        }
> +    }
> +    else {
> +        /* Fine with PCRE1 */
> +        state->match_data = NULL;
> +    }
>  #endif
>  
> -    return state->data;
> +    return 1;
>  }
>  
>  static APR_INLINE
> -void free_match_data(struct match_data_state *state)
> +void cleanup_state(struct match_data_state *state)
>  {
>  #ifdef HAVE_PCRE2
> -    pcre2_match_data_free(state->data);
> +    pcre2_match_data_free(state->match_data);
>      pcre2_general_context_free(state->pcre2_ctx);
>  #else
> -    private_free(state->data, state);
> +    if (state->match_data) {
> +        private_free(state->match_data, state);
> +    }
> +#endif
> +
> +#if APREG_USE_THREAD_LOCAL
> +    if (state->ts) {
> +        /* Let the thread's pool allocator recycle or free according
> +         * to its max_free setting.
> +         */
> +        struct match_thread_state *ts = state->ts;
> +        apr_pool_clear(ts->pool);
> +        ts->heap_size = 0;
> +        ts->heap = NULL;
> +    }
>  #endif
>  }
>  

Regards

Rüdiger

Reply via email to