https://github.com/python/cpython/commit/8eb25faed706de41bd6965efbb15d04b5715ed48
commit: 8eb25faed706de41bd6965efbb15d04b5715ed48
branch: 3.12
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2024-11-18T11:24:13Z
summary:
[3.12] gh-67877: Fix memory leaks in terminated RE matching (GH-126840)
(GH-126961)
If the SRE(match) function terminates abruptly, either because of a signal
or because memory allocation fails, allocated SRE_REPEAT blocks might
never be released.
(cherry picked from commit 7538e7f5696408fa0aa02fce8a413a7dfac76a04)
files:
A Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst
M Lib/test/test_re.py
M Modules/_sre/clinic/sre.c.h
M Modules/_sre/sre.c
M Modules/_sre/sre.h
M Modules/_sre/sre_lib.h
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 756a7ccd506be4..130e1e1d01dec5 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -2621,6 +2621,50 @@ def test_regression_gh94675(self):
p.terminate()
p.join()
+ def test_fail(self):
+ self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
+
+ def test_character_set_any(self):
+ # The union of complementary character sets matches any character
+ # and is equivalent to "(?s:.)".
+ s = '1x\n'
+ for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
+ with self.subTest(pattern=p):
+ self.assertEqual(re.findall(p, s), list(s))
+ self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)
+
+ def test_character_set_none(self):
+ # Negation of the union of complementary character sets does not match
+ # any character.
+ s = '1x\n'
+ for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
+ with self.subTest(pattern=p):
+ self.assertIsNone(re.search(p, s))
+ self.assertIsNone(re.search('(?s:.)' + p, s))
+
+ def check_interrupt(self, pattern, string, maxcount):
+ class Interrupt(Exception):
+ pass
+ p = re.compile(pattern)
+ for n in range(maxcount):
+ try:
+ p._fail_after(n, Interrupt)
+ p.match(string)
+ return n
+ except Interrupt:
+ pass
+ finally:
+ p._fail_after(-1, None)
+
+ @unittest.skipUnless(hasattr(re.Pattern, '_fail_after'), 'requires debug
build')
+ def test_memory_leaks(self):
+ self.check_interrupt(r'(.)*:', 'abc:', 100)
+ self.check_interrupt(r'([^:])*?:', 'abc:', 100)
+ self.check_interrupt(r'([^:])*+:', 'abc:', 100)
+ self.check_interrupt(r'(.){2,4}:', 'abc:', 100)
+ self.check_interrupt(r'([^:]){2,4}?:', 'abc:', 100)
+ self.check_interrupt(r'([^:]){2,4}+:', 'abc:', 100)
+
def get_debug_out(pat):
with captured_stdout() as out:
diff --git
a/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst
b/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst
new file mode 100644
index 00000000000000..021b4ae2e100bc
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst
@@ -0,0 +1,2 @@
+Fix memory leaks when :mod:`regular expression <re>` matching terminates
+abruptly, either because of a signal or because memory allocation fails.
diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h
index 529c634e76d63c..56a4e6048fa8ef 100644
--- a/Modules/_sre/clinic/sre.c.h
+++ b/Modules/_sre/clinic/sre.c.h
@@ -975,6 +975,44 @@ PyDoc_STRVAR(_sre_SRE_Pattern___deepcopy____doc__,
#define _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF \
{"__deepcopy__", (PyCFunction)_sre_SRE_Pattern___deepcopy__, METH_O,
_sre_SRE_Pattern___deepcopy____doc__},
+#if defined(Py_DEBUG)
+
+PyDoc_STRVAR(_sre_SRE_Pattern__fail_after__doc__,
+"_fail_after($self, count, exception, /)\n"
+"--\n"
+"\n"
+"For debugging.");
+
+#define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF \
+ {"_fail_after", _PyCFunction_CAST(_sre_SRE_Pattern__fail_after),
METH_FASTCALL, _sre_SRE_Pattern__fail_after__doc__},
+
+static PyObject *
+_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
+ PyObject *exception);
+
+static PyObject *
+_sre_SRE_Pattern__fail_after(PatternObject *self, PyObject *const *args,
Py_ssize_t nargs)
+{
+ PyObject *return_value = NULL;
+ int count;
+ PyObject *exception;
+
+ if (!_PyArg_CheckPositional("_fail_after", nargs, 2, 2)) {
+ goto exit;
+ }
+ count = _PyLong_AsInt(args[0]);
+ if (count == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ exception = args[1];
+ return_value = _sre_SRE_Pattern__fail_after_impl(self, count, exception);
+
+exit:
+ return return_value;
+}
+
+#endif /* defined(Py_DEBUG) */
+
PyDoc_STRVAR(_sre_compile__doc__,
"compile($module, /, pattern, flags, code, groups, groupindex,\n"
" indexgroup)\n"
@@ -1460,4 +1498,8 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject
*cls, PyObject *const
}
return _sre_SRE_Scanner_search_impl(self, cls);
}
-/*[clinic end generated code: output=045de53cfe02dee0 input=a9049054013a1b77]*/
+
+#ifndef _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
+ #define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
+#endif /* !defined(_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF) */
+/*[clinic end generated code: output=2165ecf43a7c20e8 input=a9049054013a1b77]*/
diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c
index 6d9843bb76d791..35c6058dac4eae 100644
--- a/Modules/_sre/sre.c
+++ b/Modules/_sre/sre.c
@@ -218,6 +218,85 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size)
return 0;
}
+/* Memory pool functions for SRE_REPEAT.  Every SRE_REPEAT block is
+ tracked by the SRE_STATE that handed it out, so the blocks can still
+ be released by repeat_pool_clear() when the SRE(match) function
+ terminates abruptly and never frees them itself.
+ state->repeat_pool_used is a doubly-linked list (pool_prev/pool_next),
+ so that an arbitrary SRE_REPEAT node can be removed from it.
+ state->repeat_pool_unused is a singly-linked list (pool_next only);
+ nodes are put and taken at the head.
+
+ repeat_pool_malloc() returns a node that has been linked at the head
+ of the used list, reusing a node from the unused list when one is
+ available and calling PyObject_Malloc() otherwise.  It returns NULL
+ when that allocation fails; no Python exception is set here.  Only
+ the pool_prev/pool_next fields are written, the remaining fields are
+ left for the caller to fill in. */
+static SRE_REPEAT *
+repeat_pool_malloc(SRE_STATE *state)
+{
+ SRE_REPEAT *repeat;
+
+ if (state->repeat_pool_unused) {
+ /* reuse a node: pop the head of the unused pool
+ (singly-linked list) */
+ repeat = state->repeat_pool_unused;
+ state->repeat_pool_unused = repeat->pool_next;
+ }
+ else {
+ repeat = PyObject_Malloc(sizeof(SRE_REPEAT));
+ if (!repeat) {
+ return NULL;
+ }
+ }
+
+ /* add to used pool (doubly-linked list): the node becomes the new
+ head, and the previous head, if any, gets it as its predecessor */
+ SRE_REPEAT *temp = state->repeat_pool_used;
+ if (temp) {
+ temp->pool_prev = repeat;
+ }
+ repeat->pool_prev = NULL;
+ repeat->pool_next = temp;
+ state->repeat_pool_used = repeat;
+
+ return repeat;
+}
+
+static void
+repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
+{
+ SRE_REPEAT *prev = repeat->pool_prev;
+ SRE_REPEAT *next = repeat->pool_next;
+
+ /* Give `repeat` back to the pool: unlink it from the used list and
+ push it onto the unused list, where repeat_pool_malloc() can pick
+ it up again.  The memory itself is not freed here.
+ NOTE: `repeat` is expected to be a node that is currently on the
+ used list; its pool_prev/pool_next are read as used-list links.
+
+ remove from used pool (doubly-linked list); a node without a
+ predecessor is the list head, so the head pointer moves to `next` */
+ if (prev) {
+ prev->pool_next = next;
+ }
+ else {
+ state->repeat_pool_used = next;
+ }
+ if (next) {
+ next->pool_prev = prev;
+ }
+
+ /* add to unused pool (singly-linked list) at the head; pool_prev is
+ not updated, the unused list is linked through pool_next only */
+ repeat->pool_next = state->repeat_pool_unused;
+ state->repeat_pool_unused = repeat;
+}
+
+static void
+repeat_pool_clear(SRE_STATE *state)
+{
+ /* Free every SRE_REPEAT node held by `state`, from both pools, and
+ leave both list heads NULL.  Each head is reset before its list is
+ walked.
+
+ clear used pool: nodes that were handed out by repeat_pool_malloc()
+ and not given back through repeat_pool_free() */
+ SRE_REPEAT *next = state->repeat_pool_used;
+ state->repeat_pool_used = NULL;
+ while (next) {
+ SRE_REPEAT *temp = next;
+ next = temp->pool_next;
+ PyObject_Free(temp);
+ }
+
+ /* clear unused pool: nodes that were given back and kept for reuse */
+ next = state->repeat_pool_unused;
+ state->repeat_pool_unused = NULL;
+ while (next) {
+ SRE_REPEAT *temp = next;
+ next = temp->pool_next;
+ PyObject_Free(temp);
+ }
+}
+
/* generate 8-bit version */
#define SRE_CHAR Py_UCS1
@@ -463,6 +542,11 @@ state_init(SRE_STATE* state, PatternObject* pattern,
PyObject* string,
state->pos = start;
state->endpos = end;
+#ifdef Py_DEBUG
+ state->fail_after_count = pattern->fail_after_count;
+ state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
+#endif
+
return string;
err:
/* We add an explicit cast here because MSVC has a bug when
@@ -485,6 +569,8 @@ state_fini(SRE_STATE* state)
/* See above PyMem_Del for why we explicitly cast here. */
PyMem_Free((void*) state->mark);
state->mark = NULL;
+ /* SRE_REPEAT pool */
+ repeat_pool_clear(state);
}
/* calculate offset from start of string */
@@ -571,6 +657,9 @@ pattern_traverse(PatternObject *self, visitproc visit, void
*arg)
Py_VISIT(self->groupindex);
Py_VISIT(self->indexgroup);
Py_VISIT(self->pattern);
+#ifdef Py_DEBUG
+ Py_VISIT(self->fail_after_exc);
+#endif
return 0;
}
@@ -580,6 +669,9 @@ pattern_clear(PatternObject *self)
Py_CLEAR(self->groupindex);
Py_CLEAR(self->indexgroup);
Py_CLEAR(self->pattern);
+#ifdef Py_DEBUG
+ Py_CLEAR(self->fail_after_exc);
+#endif
return 0;
}
@@ -642,7 +734,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self,
PyTypeObject *cls,
Py_ssize_t status;
PyObject *match;
- if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
+ if (!state_init(&state, self, string, pos, endpos))
return NULL;
state.ptr = state.start;
@@ -1330,6 +1422,29 @@ _sre_SRE_Pattern___deepcopy__(PatternObject *self,
PyObject *memo)
return Py_NewRef(self);
}
+#ifdef Py_DEBUG
+/*[clinic input]
+_sre.SRE_Pattern._fail_after
+
+ count: int
+ exception: object
+ /
+
+For debugging.
+[clinic start generated code]*/
+
+static PyObject *
+_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
+ PyObject *exception)
+/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
+{
+ self->fail_after_count = count; /* countdown that state_init() copies
+ into each new SRE_STATE; MAYBE_CHECK_SIGNALS only acts on it
+ while it is >= 0, so a negative value turns the hook off */
+ Py_INCREF(exception); /* the pattern keeps its own reference */
+ Py_XSETREF(self->fail_after_exc, exception); /* replaces the stored
+ object, which may still be NULL from _sre_compile_impl() */
+ Py_RETURN_NONE;
+}
+#endif /* Py_DEBUG */
+
static PyObject *
pattern_repr(PatternObject *obj)
{
@@ -1456,6 +1571,10 @@ _sre_compile_impl(PyObject *module, PyObject *pattern,
int flags,
self->pattern = NULL;
self->groupindex = NULL;
self->indexgroup = NULL;
+#ifdef Py_DEBUG
+ self->fail_after_count = -1;
+ self->fail_after_exc = NULL;
+#endif
self->codesize = n;
@@ -2552,7 +2671,8 @@ pattern_new_match(_sremodulestate* module_state,
if (!match)
return NULL;
- match->pattern = (PatternObject*)Py_NewRef(pattern);
+ Py_INCREF(pattern);
+ match->pattern = pattern;
match->string = Py_NewRef(state->string);
@@ -2688,7 +2808,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self,
PyTypeObject *cls)
return NULL;
}
- match = pattern_new_match(module_state, (PatternObject*) self->pattern,
+ match = pattern_new_match(module_state, self->pattern,
state, status);
if (status == 0)
@@ -2738,7 +2858,7 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self,
PyTypeObject *cls)
return NULL;
}
- match = pattern_new_match(module_state, (PatternObject*) self->pattern,
+ match = pattern_new_match(module_state, self->pattern,
state, status);
if (status == 0)
@@ -2774,7 +2894,8 @@ pattern_scanner(_sremodulestate *module_state,
return NULL;
}
- scanner->pattern = Py_NewRef(self);
+ Py_INCREF(self);
+ scanner->pattern = self;
PyObject_GC_Track(scanner);
return (PyObject*) scanner;
@@ -2968,6 +3089,7 @@ static PyMethodDef pattern_methods[] = {
_SRE_SRE_PATTERN_SCANNER_METHODDEF
_SRE_SRE_PATTERN___COPY___METHODDEF
_SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
+ _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
{"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
PyDoc_STR("See PEP 585")},
{NULL, NULL}
diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h
index a0f235606e290e..b8c6f8e3e660d1 100644
--- a/Modules/_sre/sre.h
+++ b/Modules/_sre/sre.h
@@ -34,6 +34,11 @@ typedef struct {
int flags; /* flags used when compiling pattern source */
PyObject *weakreflist; /* List of weak references */
int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */
+#ifdef Py_DEBUG
+ /* for simulation of user interruption */
+ int fail_after_count;
+ PyObject *fail_after_exc;
+#endif
/* pattern code */
Py_ssize_t codesize;
SRE_CODE code[1];
@@ -68,6 +73,9 @@ typedef struct SRE_REPEAT_T {
const SRE_CODE* pattern; /* points to REPEAT operator arguments */
const void* last_ptr; /* helper to check for infinite loops */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
+ /* for SRE_REPEAT pool */
+ struct SRE_REPEAT_T *pool_prev;
+ struct SRE_REPEAT_T *pool_next;
} SRE_REPEAT;
typedef struct {
@@ -94,12 +102,19 @@ typedef struct {
size_t data_stack_base;
/* current repeat context */
SRE_REPEAT *repeat;
+ /* SRE_REPEAT pool */
+ SRE_REPEAT *repeat_pool_used;
+ SRE_REPEAT *repeat_pool_unused;
unsigned int sigcount;
+#ifdef Py_DEBUG
+ int fail_after_count;
+ PyObject *fail_after_exc;
+#endif
} SRE_STATE;
typedef struct {
PyObject_HEAD
- PyObject* pattern;
+ PatternObject* pattern;
SRE_STATE state;
int executing;
} ScannerObject;
diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h
index 95c1ada908d222..d82ba7aa3c8b83 100644
--- a/Modules/_sre/sre_lib.h
+++ b/Modules/_sre/sre_lib.h
@@ -524,13 +524,28 @@ typedef struct {
Py_ssize_t last_ctx_pos;
} SRE(match_context);
-#define MAYBE_CHECK_SIGNALS \
+#define _MAYBE_CHECK_SIGNALS \
do { \
if ((0 == (++sigcount & 0xfff)) && PyErr_CheckSignals()) { \
RETURN_ERROR(SRE_ERROR_INTERRUPTED); \
} \
} while (0)
+#ifdef Py_DEBUG
+# define MAYBE_CHECK_SIGNALS \
+ do { \
+ _MAYBE_CHECK_SIGNALS; \
+ if (state->fail_after_count >= 0) { \
+ if (state->fail_after_count-- == 0) { \
+ PyErr_SetNone(state->fail_after_exc); \
+ RETURN_ERROR(SRE_ERROR_INTERRUPTED); \
+ } \
+ } \
+ } while (0)
+#else
+# define MAYBE_CHECK_SIGNALS _MAYBE_CHECK_SIGNALS
+#endif /* Py_DEBUG */
+
#ifdef HAVE_COMPUTED_GOTOS
#ifndef USE_COMPUTED_GOTOS
#define USE_COMPUTED_GOTOS 1
@@ -1083,12 +1098,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern,
int toplevel)
pattern[1], pattern[2]));
/* install new repeat context */
- /* TODO(https://github.com/python/cpython/issues/67877): Fix this
- * potential memory leak. */
- ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep));
+ ctx->u.rep = repeat_pool_malloc(state);
if (!ctx->u.rep) {
- PyErr_NoMemory();
- RETURN_FAILURE;
+ RETURN_ERROR(SRE_ERROR_MEMORY);
}
ctx->u.rep->count = -1;
ctx->u.rep->pattern = pattern;
@@ -1099,7 +1111,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int
toplevel)
state->ptr = ptr;
DO_JUMP(JUMP_REPEAT, jump_repeat, pattern+pattern[0]);
state->repeat = ctx->u.rep->prev;
- PyObject_Free(ctx->u.rep);
+ repeat_pool_free(state, ctx->u.rep);
if (ret) {
RETURN_ON_ERROR(ret);
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]