https://github.com/python/cpython/commit/bb09ba679223666e01f8da780f97888a29d07131
commit: bb09ba679223666e01f8da780f97888a29d07131
branch: main
author: Petr Viktorin <[email protected]>
committer: encukou <[email protected]>
date: 2024-07-27T10:27:06+02:00
summary:

gh-122291: Intern latin-1 one-byte strings at startup (GH-122303)

files:
M InternalDocs/string_interning.md
M Objects/unicodeobject.c

diff --git a/InternalDocs/string_interning.md b/InternalDocs/string_interning.md
index 930ea110d857ac..358e2c070cd5fa 100644
--- a/InternalDocs/string_interning.md
+++ b/InternalDocs/string_interning.md
@@ -8,51 +8,50 @@
 
 This is used to optimize dict and attribute lookups, among other things.
 
-Python uses three different mechanisms to intern strings:
+Python uses two different mechanisms to intern strings: singletons and
+dynamic interning.
 
-- Singleton strings marked in C source with `_Py_STR` and `_Py_ID` macros.
-  These are statically allocated, and collected using `make regen-global-objects`
-  (`Tools/build/generate_global_objects.py`), which generates code
-  for declaration, initialization and finalization.
+## Singletons
 
-  The difference between the two kinds is not important. (A `_Py_ID` string is
-  a valid C name, with which we can refer to it; a `_Py_STR` may e.g. contain
-  non-identifier characters, so it needs a separate C-compatible name.)
+The 256 possible one-character latin-1 strings, which can be retrieved with
+`_Py_LATIN1_CHR(c)`, are stored in statically allocated arrays,
+`_PyRuntime.static_objects.strings.ascii` and
+`_PyRuntime.static_objects.strings.latin1`.
 
-  The empty string is in this category (as `_Py_STR(empty)`).
+Longer singleton strings are marked in C source with `_Py_ID` (if the string
+is a valid C identifier fragment) or `_Py_STR` (if it needs a separate
+C-compatible name.)
+These are also stored in statically allocated arrays.
+They are collected from CPython sources using `make regen-global-objects`
+(`Tools/build/generate_global_objects.py`), which generates code
+for declaration, initialization and finalization.
 
-  These singletons are interned in a runtime-global lookup table,
-  `_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`),
-  at runtime initialization.
+The empty string is one of the singletons: `_Py_STR(empty)`.
 
-- The 256 possible one-character latin-1 strings are singletons,
-  which can be retrieved with `_Py_LATIN1_CHR(c)`, are stored in runtime-global
-  arrays, `_PyRuntime.static_objects.strings.ascii` and
-  `_PyRuntime.static_objects.strings.latin1`.
+The three sets of singletons (`_Py_LATIN1_CHR`, `_Py_ID`, `_Py_STR`)
+are disjoint.
+If you have such a singleton, it (and no other copy) will be interned.
 
-  These are NOT interned at startup in the normal build.
-  In the free-threaded build, they are; this avoids modifying the
-  global lookup table after threads are started.
+These singletons are interned in a runtime-global lookup table,
+`_PyRuntime.cached_objects.interned_strings` (`INTERNED_STRINGS`),
+at runtime initialization, and immutable until it's torn down
+at runtime finalization.
+It is shared across threads and interpreters without any synchronization.
 
-  Interning a one-char latin-1 string will always intern the corresponding
-  singleton.
 
-- All other strings are allocated dynamically, and have their
-  `_PyUnicode_STATE(s).statically_allocated` flag set to zero.
-  When interned, such strings are added to an interpreter-wide dict,
-  `PyInterpreterState.cached_objects.interned_strings`.
+## Dynamically allocated strings
 
-  The key and value of each entry in this dict reference the same object.
+All other strings are allocated dynamically, and have their
+`_PyUnicode_STATE(s).statically_allocated` flag set to zero.
+When interned, such strings are added to an interpreter-wide dict,
+`PyInterpreterState.cached_objects.interned_strings`.
 
-The three sets of singletons (`_Py_STR`, `_Py_ID`, `_Py_LATIN1_CHR`)
-are disjoint.
-If you have such a singleton, it (and no other copy) will be interned.
+The key and value of each entry in this dict reference the same object.
 
 
 ## Immortality and reference counting
 
-Invariant: Every immortal string is interned, *except* the one-char latin-1
-singletons (which might but might not be interned).
+Invariant: Every immortal string is interned.
 
 In practice, this means that you must not use `_Py_SetImmortal` on
 a string. (If you know it's already immortal, don't immortalize it;
@@ -115,8 +114,5 @@ The valid transitions between these states are:
   Using `_PyUnicode_InternStatic` on these is an error; the other cases
   don't change the state.
 
-- One-char latin-1 singletons can be interned (0 -> 3) using any interning
-  function; after that the functions don't change the state.
-
-- Other statically allocated strings are interned (0 -> 3) at runtime init;
+- Singletons are interned (0 -> 3) at runtime init;
   after that all interning functions don't change the state.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 6196a8e766a15b..ffb879a68745b1 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -325,7 +325,8 @@ init_global_interned_strings(PyInterpreterState *interp)
         return _PyStatus_ERR("failed to create global interned dict");
     }
 
-    /* Intern statically allocated string identifiers and deepfreeze strings.
+    /* Intern statically allocated string identifiers, deepfreeze strings,
+        * and one-byte latin-1 strings.
         * This must be done before any module initialization so that statically
         * allocated string identifiers are used instead of heap allocated strings.
         * Deepfreeze uses the interned identifiers if present to save space
@@ -333,14 +334,11 @@ init_global_interned_strings(PyInterpreterState *interp)
     */
     _PyUnicode_InitStaticStrings(interp);
 
-#ifdef Py_GIL_DISABLED
-// In the free-threaded build, intern the 1-byte strings as well
     for (int i = 0; i < 256; i++) {
         PyObject *s = LATIN1(i);
         _PyUnicode_InternStatic(interp, &s);
         assert(s == LATIN1(i));
     }
-#endif
 #ifdef Py_DEBUG
     assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
 
@@ -15355,26 +15353,14 @@ intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
     assert(s != NULL);
     assert(_PyUnicode_CHECK(s));
     assert(_PyUnicode_STATE(s).statically_allocated);
-
-    switch (PyUnicode_CHECK_INTERNED(s)) {
-        case SSTATE_NOT_INTERNED:
-            break;
-        case SSTATE_INTERNED_IMMORTAL_STATIC:
-            return s;
-        default:
-            Py_FatalError("_PyUnicode_InternStatic called on wrong string");
-    }
+    assert(!PyUnicode_CHECK_INTERNED(s));
 
 #ifdef Py_DEBUG
     /* We must not add process-global interned string if there's already a
      * per-interpreter interned_dict, which might contain duplicates.
-     * Except "short string" singletons: those are special-cased. */
+     */
     PyObject *interned = get_interned_dict(interp);
-    assert(interned == NULL || unicode_is_singleton(s));
-#ifdef Py_GIL_DISABLED
-    // In the free-threaded build, don't allow even the short strings.
     assert(interned == NULL);
-#endif
 #endif
 
     /* Look in the global cache first. */
@@ -15446,11 +15432,6 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
         return s;
     }
 
-    /* Handle statically allocated strings. */
-    if (_PyUnicode_STATE(s).statically_allocated) {
-        return intern_static(interp, s);
-    }
-
     /* Is it already interned? */
     switch (PyUnicode_CHECK_INTERNED(s)) {
         case SSTATE_NOT_INTERNED:
@@ -15467,6 +15448,9 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
             return s;
     }
 
+    /* Statically allocated strings must be already interned. */
+    assert(!_PyUnicode_STATE(s).statically_allocated);
+
 #if Py_GIL_DISABLED
     /* In the free-threaded build, all interned strings are immortal */
     immortalize = 1;
@@ -15477,13 +15461,11 @@ intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
         immortalize = 1;
     }
 
-    /* if it's a short string, get the singleton -- and intern it */
+    /* if it's a short string, get the singleton */
     if (PyUnicode_GET_LENGTH(s) == 1 &&
                 PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
         PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
-        if (!PyUnicode_CHECK_INTERNED(r)) {
-            r = intern_static(interp, r);
-        }
+        assert(PyUnicode_CHECK_INTERNED(r));
         Py_DECREF(s);
         return r;
     }

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

Reply via email to