> > > I think that we could try an alternative approach for that part of the 
> > > problem. The alternative approach would have the same characteristics as 
> > > the approach that had been used for PCRE1:
> > >
> > > -- Supply PCRE2 with a custom context with malloc and free 
> > > implementations.
> > > -- Those implementations would work by either using a stack buffer for 
> > > small allocations or by doing a malloc().
> > > -- This allocation scheme would have the same properties as the existing 
> > > scheme used when compiling with PCRE1.
> >
> > For PCRE2, you would malloc()/free() for every ap_regexec(), which can
> > be quite costly depending on the configuration/modules. This work was
> > done precisely for this concern, for the switch from PCRE1 to PCRE2 to
> > be as little costly as possible. The current implementation reuses the
> > same PCRE2 context per thread for the lifetime of httpd..
> > Same for PCRE1, besides the stack buffer for small vectors (which is
> > still there currently btw).
> >
> I was thinking about having a custom malloc()/free() implementation
> that uses stack memory for first N bytes and then falls back to
> ordinary malloc/free(). So for cases where the allocation fits into
> the stack threshold there would be no malloc()/free() calls, as it was
> with PCRE1 and POSIX_MALLOC_THRESHOLD.
>
> I'll try to prepare a patch with this approach, to illustrate it better.

Here is a patch with the described alternative approach with custom
malloc() and free() implementations that use a stack buffer for first
N bytes and then fall back to an ordinary malloc/free().

Its key properties are:

1) Allocations with PCRE2 happen the same way as they were happening
with PCRE1 in httpd 2.4.52 and earlier.

2) There are no malloc()/free() calls for typical cases where the
match data can be kept on stack.

3) The patch avoids a malloc() for the match_data structure itself,
because the match data is allocated with the provided custom malloc()
function.

4) Using custom allocation functions should ensure that PCRE is not
going to use malloc() for any auxiliary allocations, if they are
necessary.

5) There is no per-thread state.

NOTE: Current behavior in trunk is that we allocate for the number of
captures in the regular expression (preg->re_nsub). An additional
improvement would be to cap the allocation size based on the passed-in
limit (nmatch). I'll try to handle that separately.

Thoughts?

--
Ivan Zhakov
Index: server/util_pcre.c
===================================================================
--- server/util_pcre.c  (revision 1902366)
+++ server/util_pcre.c  (working copy)
@@ -75,7 +75,7 @@
 #include "apr_want.h"
 
 #ifndef POSIX_MALLOC_THRESHOLD
-#define POSIX_MALLOC_THRESHOLD (10)
+#define POSIX_MALLOC_THRESHOLD (256)
 #endif
 
 /* Table of error strings corresponding to POSIX error codes; must be
@@ -281,117 +281,85 @@
 typedef int*              match_vector_pt;
 #endif
 
-static APR_INLINE
-match_data_pt alloc_match_data(apr_size_t size,
-                               match_vector_pt small_vector)
+struct match_data_state
 {
     match_data_pt data;
-
+    char *buf;
+    apr_size_t buf_len;
+    apr_size_t buf_used;
 #ifdef HAVE_PCRE2
-    data = pcre2_match_data_create(size, NULL);
-#else
-    if (size > POSIX_MALLOC_THRESHOLD) {
-        data = malloc(size * sizeof(int) * 3);
-    }
-    else {
-        data = small_vector;
-    }
+    pcre2_general_context *pcre2_ctx;
 #endif
+};
 
-    return data;
-}
-
-static APR_INLINE
-void free_match_data(match_data_pt data, apr_size_t size)
+static void * private_malloc(size_t size, void *ctx)
 {
-#ifdef HAVE_PCRE2
-    pcre2_match_data_free(data);
-#else
-    if (size > POSIX_MALLOC_THRESHOLD) {
-        free(data);
-    }
-#endif
-}
+    struct match_data_state *state = ctx;
 
-#if AP_HAS_THREAD_LOCAL && !defined(APREG_NO_THREAD_LOCAL)
+    if(size <= (state->buf_len - state->buf_used)) {
+        void *p = state->buf + state->buf_used;
 
-struct apreg_tls {
-    match_data_pt data;
-    apr_size_t size;
-};
+        state->buf_used += APR_ALIGN_DEFAULT(size);
 
-#ifdef HAVE_PCRE2
-static apr_status_t apreg_tls_cleanup(void *arg)
-{
-    struct apreg_tls *tls = arg;
-    pcre2_match_data_free(tls->data); /* NULL safe */
-    return APR_SUCCESS;
+        return p;
+    }
+    else {
+        return malloc(size);
+    }
 }
-#endif
 
-static match_data_pt get_match_data(apr_size_t size,
-                                    match_vector_pt small_vector,
-                                    int *to_free)
+static void private_free(void *block, void *ctx)
 {
-    apr_thread_t *current;
-    struct apreg_tls *tls = NULL;
+    struct match_data_state *state = ctx;
+    void *buf_start = state->buf;
+    void *buf_end = state->buf + state->buf_len;
 
-    /* Even though AP_HAS_THREAD_LOCAL, we may still be called by a
-     * native/non-apr thread, let's fall back to alloc/free in this case.
-     */
-    current = ap_thread_current();
-    if (!current) {
-        *to_free = 1;
-        return alloc_match_data(size, small_vector);
+    if (block >= buf_start && block <= buf_end) {
+        /* This block allocated from stack buffer. Do nothing. */
     }
+    else {
+        free(block);
+    }
+} 
 
-    apr_thread_data_get((void **)&tls, "apreg", current);
-    if (!tls || tls->size < size) {
-        apr_pool_t *tp = apr_thread_pool_get(current);
-        if (!tls) {
-            tls = apr_pcalloc(tp, sizeof(*tls));
+static APR_INLINE
+match_data_pt alloc_match_data(apr_size_t size,
+                               struct match_data_state *state,
+                               void *stack_buf,
+                               apr_size_t stack_buf_len)
+{
+    state->buf = stack_buf;
+    state->buf_len = stack_buf_len;
+    state->buf_used = 0;
+
 #ifdef HAVE_PCRE2
-            apr_thread_data_set(tls, "apreg", apreg_tls_cleanup, current);
+    state->pcre2_ctx = pcre2_general_context_create(private_malloc, 
private_free, state);
+    if (!state->pcre2_ctx) { 
+        return NULL;
+    }
+    state->data = pcre2_match_data_create(size, state->pcre2_ctx);
+    if (!state->data) {
+        pcre2_general_context_free(state->pcre2_ctx);
+        return NULL;
+    }
 #else
-            apr_thread_data_set(tls, "apreg", NULL, current);
+    state->data = private_malloc(size * sizeof(int) * 3, state);
 #endif
-        }
 
-        tls->size *= 2;
-        if (tls->size < size) {
-            tls->size = size;
-            if (tls->size < POSIX_MALLOC_THRESHOLD) {
-                tls->size = POSIX_MALLOC_THRESHOLD;
-            }
-        }
+    return state->data;
+}
 
+static APR_INLINE
+void free_match_data(struct match_data_state *state)
+{
 #ifdef HAVE_PCRE2
-        pcre2_match_data_free(tls->data); /* NULL safe */
-        tls->data = pcre2_match_data_create(tls->size, NULL);
-        if (!tls->data) {
-            tls->size = 0;
-            return NULL;
-        }
+    pcre2_match_data_free(state->data);
+    pcre2_general_context_free(state->pcre2_ctx);
 #else
-        tls->data = apr_palloc(tp, tls->size * sizeof(int) * 3);
+    private_free(state->data, state);
 #endif
-    }
-
-    return tls->data;
 }
 
-#else /* AP_HAS_THREAD_LOCAL && !defined(APREG_NO_THREAD_LOCAL) */
-
-static APR_INLINE match_data_pt get_match_data(apr_size_t size,
-                                               match_vector_pt small_vector,
-                                               int *to_free)
-{
-    *to_free = 1;
-    return alloc_match_data(size, small_vector);
-}
-
-#endif /* AP_HAS_THREAD_LOCAL && !defined(APREG_NO_THREAD_LOCAL) */
-
 AP_DECLARE(int) ap_regexec(const ap_regex_t *preg, const char *string,
                            apr_size_t nmatch, ap_regmatch_t *pmatch,
                            int eflags)
@@ -408,13 +376,12 @@
     int options = 0, to_free = 0;
     match_vector_pt ovector = NULL;
     apr_size_t ncaps = (apr_size_t)preg->re_nsub + 1;
-#ifdef HAVE_PCRE2
-    match_data_pt data = get_match_data(ncaps, NULL, &to_free);
-#else
-    int small_vector[POSIX_MALLOC_THRESHOLD * 3];
-    match_data_pt data = get_match_data(ncaps, small_vector, &to_free);
-#endif
+    struct match_data_state state;
+    match_data_pt data;
+    /* Use apr_uint64_t to get proper alignment. */
+    apr_uint64_t stack_buf[(POSIX_MALLOC_THRESHOLD + sizeof(apr_uint64_t) - 1) 
/ sizeof(apr_uint64_t)];
 
+    data = alloc_match_data(ncaps, &state, stack_buf, sizeof(stack_buf));
     if (!data) {
         return AP_REG_ESPACE;
     }
@@ -449,15 +416,11 @@
         }
         for (; i < nmatch; i++)
             pmatch[i].rm_so = pmatch[i].rm_eo = -1;
-        if (to_free) {
-            free_match_data(data, ncaps);
-        }
+        free_match_data(&state);
         return 0;
     }
     else {
-        if (to_free) {
-            free_match_data(data, ncaps);
-        }
+        free_match_data(&state);
 #ifdef HAVE_PCRE2
         if (rc <= PCRE2_ERROR_UTF8_ERR1 && rc >= PCRE2_ERROR_UTF8_ERR21)
             return AP_REG_INVARG;

Reply via email to