wingo pushed a commit to branch wip-whippet
in repository guile.

commit 4138d3c646c72845ac4946fc6e22c155c067b12b
Author: Andy Wingo <wi...@pobox.com>
AuthorDate: Wed May 14 16:05:29 2025 +0200

    The symbol table is an ephemeron table
    
    * libguile/symbols.c: Rework the symbol table to be an ephemeron table
    instead of a weak set.  It is no longer resizeable; getting that to work
    will involve some GC cooperation.
---
 libguile/symbols.c | 254 +++++++++++++++++++++++++----------------------------
 1 file changed, 119 insertions(+), 135 deletions(-)

diff --git a/libguile/symbols.c b/libguile/symbols.c
index 292941e9d..1cec76f26 100644
--- a/libguile/symbols.c
+++ b/libguile/symbols.c
@@ -1,4 +1,4 @@
-/* Copyright 
1995-1998,2000-2001,2003-2004,2006,2009,2011,2013,2015,2018,2022,2023
+/* Copyright 
1995-1998,2000-2001,2003-2004,2006,2009,2011,2013,2015,2018,2022,2023,2025
      Free Software Foundation, Inc.
 
    This file is part of Guile.
@@ -29,6 +29,7 @@
 #include "alist.h"
 #include "boolean.h"
 #include "chars.h"
+#include "ephemerons.h"
 #include "eval.h"
 #include "fluids.h"
 #include "gsubr.h"
@@ -46,14 +47,13 @@
 #include "threads.h"
 #include "variable.h"
 #include "vectors.h"
-#include "weak-set.h"
 
 #include "symbols.h"
 
 
 
 
-static SCM symbols;
+static struct scm_ephemeron_table *symbols;
 
 #ifdef GUILE_DEBUG
 SCM_DEFINE (scm_sys_symbols, "%symbols", 0, 0, 0,
@@ -61,7 +61,7 @@ SCM_DEFINE (scm_sys_symbols, "%symbols", 0, 0, 0,
            "Return the system symbol obarray.")
 #define FUNC_NAME s_scm_sys_symbols
 {
-  return symbols;
+  return scm_from_ephemeron_table (symbols);
 }
 #undef FUNC_NAME
 #endif
@@ -77,187 +77,171 @@ scm_i_hash_symbol (SCM obj, unsigned long n, void 
*closure)
   return scm_i_symbol_hash (obj) % n;
 }
 
-struct string_lookup_data
-{
-  SCM string;
-  unsigned long string_hash;
-};
-
 static int
-string_lookup_predicate_fn (SCM sym, void *closure)
+symbol_equals_string (SCM sym, SCM str, size_t len, unsigned long hash)
 {
-  struct string_lookup_data *data = closure;
-
-  if (scm_i_symbol_hash (sym) == data->string_hash
-      && scm_i_symbol_length (sym) == scm_i_string_length (data->string))
-    {
-      size_t n = scm_i_symbol_length (sym);
-      while (n--)
-        if (scm_i_symbol_ref (sym, n) != scm_i_string_ref (data->string, n))
-          return 0;
-      return 1;
-    }
-  else
+  if (scm_i_symbol_hash (sym) != hash)
+    return 0;
+  if (scm_i_symbol_length (sym) != len)
     return 0;
-}
-
-static SCM
-lookup_interned_symbol (SCM name, unsigned long raw_hash)
-{
-  struct string_lookup_data data;
-
-  data.string = name;
-  data.string_hash = raw_hash;
   
-  return scm_c_weak_set_lookup (symbols, raw_hash,
-                                string_lookup_predicate_fn,
-                                &data, SCM_BOOL_F);
-}
+  for (size_t i = 0; i < len; i++)
+    if (scm_i_symbol_ref (sym, i) != scm_i_string_ref (str, i))
+      return 0;
 
-struct latin1_lookup_data
-{
-  const char *str;
-  size_t len;
-  unsigned long string_hash;
-};
+  return 1;
+}
 
 static int
-latin1_lookup_predicate_fn (SCM sym, void *closure)
+symbol_equals_latin1_string (SCM sym, const char *str, size_t len,
+                             unsigned long hash)
 {
-  struct latin1_lookup_data *data = closure;
-
-  return scm_i_symbol_hash (sym) == data->string_hash
-    && scm_i_is_narrow_symbol (sym)
-    && scm_i_symbol_length (sym) == data->len
-    && strncmp (scm_i_symbol_chars (sym), data->str, data->len) == 0;
+  if (scm_i_symbol_hash (sym) != hash)
+    return 0;
+  if (scm_i_symbol_length (sym) != len)
+    return 0;
+  if (!scm_i_is_narrow_symbol (sym))
+    return 0;
+  
+  return strncmp (scm_i_symbol_chars (sym), str, len) == 0;
 }
 
 static SCM
 lookup_interned_latin1_symbol (const char *str, size_t len,
                                unsigned long raw_hash)
 {
-  struct latin1_lookup_data data;
-
-  data.str = str;
-  data.len = len;
-  data.string_hash = raw_hash;
-  
-  return scm_c_weak_set_lookup (symbols, raw_hash,
-                                latin1_lookup_predicate_fn,
-                                &data, SCM_BOOL_F);
+  size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols);
+  for (struct gc_ephemeron *e = scm_c_ephemeron_table_ref (symbols, bucket);
+       e;
+       e = scm_c_ephemeron_next (e))
+    {
+      SCM sym = scm_c_ephemeron_key (e);
+      if (scm_is_true (sym)
+          && symbol_equals_latin1_string (sym, str, len, raw_hash))
+        return sym;
+    }
+  return SCM_BOOL_F;
 }
 
-struct utf8_lookup_data
+static int
+utf8_string_equals_narrow_string (const uint8_t *utf8, size_t ulen,
+                                  const char *narrow)
 {
-  const char *str;
-  size_t len;
-  unsigned long string_hash;
-};
+  /* Precondition: utf8,ulen is valid UTF-8.  */
+  size_t byte_idx = 0;
+  
+  while (byte_idx < ulen)
+    {
+      ucs4_t c = -1;
+      byte_idx += u8_mbtoucr (&c, utf8 + byte_idx, ulen - byte_idx);
+      if (c != *narrow)
+        return 0;
+      narrow++;
+    }
+
+  return 1;
+}
 
 static int
-utf8_string_equals_wide_string (const uint8_t *narrow, size_t nlen,
-                                const scm_t_wchar *wide, size_t wlen)
+utf8_string_equals_wide_string (const uint8_t *utf8, size_t ulen,
+                                const scm_t_wchar *wide)
 {
-  size_t byte_idx = 0, char_idx = 0;
+  /* Precondition: utf8,ulen is valid UTF-8.  */
+  size_t byte_idx = 0;
   
-  while (byte_idx < nlen && char_idx < wlen)
+  while (byte_idx < ulen)
     {
-      ucs4_t c;
-      int nbytes;
-
-      nbytes = u8_mbtoucr (&c, narrow + byte_idx, nlen - byte_idx);
-      if (nbytes == 0)
-        break;
-      else if (nbytes < 0)
-        /* Bad UTF-8.  */
-        return 0;
-      else if (c != wide[char_idx])
+      ucs4_t c = -1;
+      byte_idx += u8_mbtoucr (&c, utf8 + byte_idx, ulen - byte_idx);
+      if (c != *wide)
         return 0;
-
-      byte_idx += nbytes;
-      char_idx++;
+      wide++;
     }
 
-  return byte_idx == nlen && char_idx == wlen;
+  return 1;
 }
 
 static int
-utf8_lookup_predicate_fn (SCM sym, void *closure)
+symbol_equals_utf8_string (SCM sym, const uint8_t *str, size_t len,
+                           unsigned long hash, int codepoint_count)
 {
-  struct utf8_lookup_data *data = closure;
-
-  if (scm_i_symbol_hash (sym) != data->string_hash)
+  if (scm_i_symbol_hash (sym) != hash)
+    return 0;
+  if (scm_i_symbol_length (sym) != codepoint_count)
     return 0;
   
   if (scm_i_is_narrow_symbol (sym))
-    return (scm_i_symbol_length (sym) == data->len
-            && strncmp (scm_i_symbol_chars (sym), data->str, data->len) == 0);
+    return utf8_string_equals_narrow_string (str, len,
+                                             scm_i_symbol_chars (sym));
   else
-    return utf8_string_equals_wide_string ((const uint8_t *) data->str,
-                                           data->len,
-                                           scm_i_symbol_wide_chars (sym),
-                                           scm_i_symbol_length (sym));
+    return utf8_string_equals_wide_string (str, len,
+                                           scm_i_symbol_wide_chars (sym));
 }
 
 static SCM
-lookup_interned_utf8_symbol (const char *str, size_t len,
+lookup_interned_utf8_symbol (const uint8_t *str, size_t len,
                              unsigned long raw_hash)
 {
-  struct utf8_lookup_data data;
-
-  data.str = str;
-  data.len = len;
-  data.string_hash = raw_hash;
-  
-  return scm_c_weak_set_lookup (symbols, raw_hash,
-                                utf8_lookup_predicate_fn,
-                                &data, SCM_BOOL_F);
-}
+  int codepoint_count = u8_mbsnlen (str, len);
+  if (codepoint_count == -1)
+    /* Bad UTF-8.  */
+    return SCM_BOOL_F;
 
-static int
-symbol_lookup_predicate_fn (SCM sym, void *closure)
-{
-  SCM other = SCM_PACK_POINTER (closure);
+  if (codepoint_count == len)
+    return lookup_interned_latin1_symbol ((const char *) str, len, raw_hash);
 
-  if (scm_i_symbol_hash (sym) == scm_i_symbol_hash (other)
-      && scm_i_symbol_length (sym) == scm_i_symbol_length (other))
+  size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols);
+  for (struct gc_ephemeron *e = scm_c_ephemeron_table_ref (symbols, bucket);
+       e;
+       e = scm_c_ephemeron_next (e))
     {
-      if (scm_i_is_narrow_symbol (sym))
-        return scm_i_is_narrow_symbol (other)
-          && (strncmp (scm_i_symbol_chars (sym),
-                       scm_i_symbol_chars (other),
-                       scm_i_symbol_length (other)) == 0);
-      else
-        return scm_is_true
-          (scm_string_equal_p (scm_symbol_to_string (sym),
-                               scm_symbol_to_string (other)));
+      SCM sym = scm_c_ephemeron_key (e);
+      if (scm_is_true (sym)
+          && symbol_equals_utf8_string (sym, str, len, raw_hash,
+                                        codepoint_count))
+        return sym;
     }
-  return 0;
+  return SCM_BOOL_F;
 }
- 
+
 static SCM
 scm_i_str2symbol (SCM str)
 {
-  SCM symbol;
   unsigned long raw_hash = scm_i_string_hash (str);
+  size_t bucket = raw_hash % scm_c_ephemeron_table_length (symbols);
+  size_t len = scm_i_string_length (str);
 
-  symbol = lookup_interned_symbol (str, raw_hash);
-  if (scm_is_true (symbol))
-    return symbol;
-  else
+  struct gc_ephemeron *chain = scm_c_ephemeron_table_ref (symbols, bucket);
+  /* First see if a symbol with this name is already interned.  */
+  for (struct gc_ephemeron *e = chain; e; e = scm_c_ephemeron_next (e))
     {
-      /* The symbol was not found, create it.  */
-      symbol = scm_i_make_symbol (str, 0, raw_hash);
-
-      /* Might return a different symbol, if another one was interned at
-         the same time.  */
-      return scm_c_weak_set_add_x (symbols, raw_hash,
-                                   symbol_lookup_predicate_fn,
-                                   SCM_UNPACK_POINTER (symbol), symbol);
+      SCM sym = scm_c_ephemeron_key (e);
+      if (scm_is_true (sym) && symbol_equals_string (sym, str, len, raw_hash))
+        return sym;
+    }
+      
+  /* The symbol was not found, create it.  */
+  SCM sym = scm_i_make_symbol (str, 0, raw_hash);
+  struct gc_ephemeron *link = scm_c_make_ephemeron (sym, SCM_BOOL_T);
+  while (1)
+    {
+      struct gc_ephemeron *prev =
+        scm_c_ephemeron_table_try_push_x (symbols, bucket, link, chain);
+      if (prev == chain)
+        return sym;
+      /* Lost a race, someone else added a symbol in this bucket.  Check
+         the chain and try again.  */
+      chain = prev;
+      for (struct gc_ephemeron *e = chain; e; e = scm_c_ephemeron_next (e))
+        {
+          SCM sym = scm_c_ephemeron_key (e);
+          if (scm_is_true (sym)
+              && symbol_equals_string (sym, str, len, raw_hash))
+            return sym;
+        }
     }
 }
 
-
 static SCM
 scm_i_str2uninterned_symbol (SCM str)
 {
@@ -449,7 +433,7 @@ scm_from_utf8_symboln (const char *sym, size_t len)
     len = strlen (sym);
   hash = scm_i_utf8_string_hash (sym, len);
 
-  ret = lookup_interned_utf8_symbol (sym, len, hash);
+  ret = lookup_interned_utf8_symbol ((const uint8_t *)sym, len, hash);
   if (scm_is_false (ret))
     {
       SCM str = scm_from_utf8_stringn (sym, len);
@@ -462,7 +446,7 @@ scm_from_utf8_symboln (const char *sym, size_t len)
 void
 scm_symbols_prehistory ()
 {
-  symbols = scm_c_make_weak_set (5000);
+  symbols = scm_c_make_ephemeron_table (5000);
 }
 
 

Reply via email to