wingo pushed a commit to branch wip-whippet
in repository guile.

commit e780d2795933ed836e4ee993ebc3f903e78b0fae
Author: Andy Wingo <wi...@igalia.com>
AuthorDate: Wed Mar 5 10:08:03 2025 +0100

    nofl: Refactor SWAR mark-matching routines
    
    We are going to try to use fewer bits for mark state.
---
 src/nofl-space.h | 23 +++++++++---------
 src/swar.h       | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/src/nofl-space.h b/src/nofl-space.h
index 66aa0ac62..f98530b02 100644
--- a/src/nofl-space.h
+++ b/src/nofl-space.h
@@ -150,7 +150,6 @@ struct nofl_block_stack {
 #define NOFL_PAGE_OUT_QUEUE_SIZE 4
 
 struct nofl_space {
-  uint64_t sweep_mask;
   uint8_t live_mask;
   uint8_t marked_mask;
   uint8_t evacuating;
@@ -558,7 +557,7 @@ nofl_clear_memory(uintptr_t addr, size_t size) {
 
 static size_t
 nofl_space_live_object_granules(uint8_t *metadata) {
-  return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1;
+  return scan_for_byte_with_bits(metadata, -1, NOFL_METADATA_BYTE_END) + 1;
 }
 
 static void
@@ -704,7 +703,7 @@ nofl_allocator_finish_hole(struct nofl_allocator *alloc) {
 // reached the end of the block.
 static size_t
 nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
-                                  uintptr_t sweep_mask) {
+                                  uint8_t live_mask) {
   GC_ASSERT(nofl_allocator_has_block(alloc));
   GC_ASSERT_EQ(alloc->alloc, alloc->sweep);
   uintptr_t sweep = alloc->sweep;
@@ -721,7 +720,7 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
   // right after a hole, which can point to either the end of the
   // block or to a live object.  Assume that a live object is more
   // common.
-  while (limit_granules && (metadata[0] & sweep_mask)) {
+  while (limit_granules && (metadata[0] & live_mask)) {
     // Object survived collection; skip over it and continue sweeping.
     size_t object_granules = nofl_space_live_object_granules(metadata);
     sweep += object_granules * NOFL_GRANULE_SIZE;
@@ -734,7 +733,8 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
     return 0;
   }
 
-  size_t hole_granules = scan_for_byte(metadata, limit_granules, sweep_mask);
+  size_t hole_granules = scan_for_byte_with_bits(metadata, limit_granules,
+                                                 live_mask);
   size_t free_bytes = hole_granules * NOFL_GRANULE_SIZE;
   GC_ASSERT(hole_granules);
   GC_ASSERT(hole_granules <= limit_granules);
@@ -754,10 +754,10 @@ nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
 
 static void
 nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc,
-                                        uintptr_t sweep_mask) {
+                                        uint8_t live_mask) {
   do {
     nofl_allocator_finish_hole(alloc);
-  } while (nofl_allocator_next_hole_in_block(alloc, sweep_mask));
+  } while (nofl_allocator_next_hole_in_block(alloc, live_mask));
 }
 
 static void
@@ -771,7 +771,7 @@ nofl_allocator_release_block(struct nofl_allocator *alloc,
   } else if (space->evacuating) {
     nofl_allocator_release_full_evacuation_target(alloc, space);
   } else {
-    nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask);
+    nofl_allocator_finish_sweeping_in_block(alloc, space->live_mask);
     nofl_allocator_release_full_block(alloc, space);
   }
 }
@@ -801,7 +801,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc,
   // Sweep current block for a hole.
   if (nofl_allocator_has_block(alloc)) {
     size_t granules =
-      nofl_allocator_next_hole_in_block(alloc, space->sweep_mask);
+      nofl_allocator_next_hole_in_block(alloc, space->live_mask);
     if (granules)
       return granules;
     else
@@ -819,7 +819,7 @@ nofl_allocator_next_hole(struct nofl_allocator *alloc,
     alloc->block.summary->holes_with_fragmentation = 0;
     alloc->block.summary->fragmentation_granules = 0;
     size_t granules =
-      nofl_allocator_next_hole_in_block(alloc, space->sweep_mask);
+      nofl_allocator_next_hole_in_block(alloc, space->live_mask);
     if (granules)
       return granules;
     nofl_allocator_release_full_block(alloc, space);
@@ -1134,7 +1134,6 @@ nofl_space_update_mark_patterns(struct nofl_space *space,
   if (advance_mark_mask)
     space->marked_mask = next_marked_mask;
   space->live_mask = survivor_mask | next_marked_mask;
-  space->sweep_mask = broadcast_byte(space->live_mask);
 }
 
 static void
@@ -1206,7 +1205,7 @@ nofl_space_promote_blocks(struct nofl_space *space) {
     block.summary->holes_with_fragmentation = 0;
     block.summary->fragmentation_granules = 0;
     struct nofl_allocator alloc = { block.addr, block.addr, block };
-    nofl_allocator_finish_sweeping_in_block(&alloc, space->sweep_mask);
+    nofl_allocator_finish_sweeping_in_block(&alloc, space->live_mask);
     atomic_fetch_add(&space->old_generation_granules,
                      NOFL_GRANULES_PER_BLOCK - block.summary->hole_granules);
     nofl_block_list_push(&space->old, block);
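
The nofl-space.h side is mechanical: the cached 64-bit sweep_mask field
goes away, callers pass the 8-bit live_mask directly, and broadcasting
that byte across all eight lanes of a word moves into the SWAR helpers
below.  As a minimal sketch of the broadcast step (this multiply-based
implementation is an assumption for illustration; the patch reuses the
existing broadcast_byte in swar.h):

    #include <stdint.h>

    /* Repeat one byte in every lane of a 64-bit word, so a single
       per-byte test can be applied to eight metadata bytes at once;
       e.g. 0x3f becomes 0x3f3f3f3f3f3f3f3f.  The removed sweep_mask
       field cached exactly this value. */
    static inline uint64_t
    broadcast_byte(uint8_t b) {
      return (uint64_t) b * 0x0101010101010101ULL;
    }
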
diff --git a/src/swar.h b/src/swar.h
index 293d99ec2..e516ed83f 100644
--- a/src/swar.h
+++ b/src/swar.h
@@ -26,23 +26,80 @@ load_eight_aligned_bytes(uint8_t *ptr) {
   return word;
 }
 
+static inline uint64_t
+match_bytes_against_bits(uint64_t bytes, uint8_t mask) {
+  return bytes & broadcast_byte(mask);
+}
+
 static size_t
-scan_for_byte(uint8_t *ptr, size_t limit, uint64_t mask) {
+scan_for_byte_with_bits(uint8_t *ptr, size_t limit, uint8_t mask) {
+  size_t n = 0;
+  size_t unaligned = ((uintptr_t) ptr) & 7;
+  if (unaligned) {
+    uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
+    uint64_t match = match_bytes_against_bits(bytes, mask);
+    if (match)
+      return count_zero_bytes(match);
+    n += 8 - unaligned;
+  }
+
+  for(; n < limit; n += 8) {
+    uint64_t bytes = load_eight_aligned_bytes(ptr + n);
+    uint64_t match = match_bytes_against_bits(bytes, mask);
+    if (match)
+      return n + count_zero_bytes(match);
+  }
+
+  return limit;
+}
+
+static inline uint64_t
+match_bytes_against_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1,
+                           uint8_t tag2)
+{
+  // Precondition: tags fall within the mask.
+  GC_ASSERT_EQ(tag1 & mask, tag1);
+  GC_ASSERT_EQ(tag2 & mask, tag2);
+  // Precondition: high bit of mask byte is empty, so that we can add without
+  // overflow.
+  GC_ASSERT_EQ(mask & 0x7f, mask);
+  // Precondition: mask is a nonzero, contiguous run of low bits.
+  GC_ASSERT(mask);
+  GC_ASSERT_EQ(mask & (mask + 1), 0);
+
+  uint64_t vmask = broadcast_byte(mask);
+  uint64_t vtest = broadcast_byte(mask + 1);
+  uint64_t vtag1 = broadcast_byte(tag1);
+  uint64_t vtag2 = broadcast_byte(tag2);
+
+  bytes &= vmask;
+  uint64_t m1 = (bytes ^ vtag1) + vmask;
+  uint64_t m2 = (bytes ^ vtag2) + vmask;
+  return ((m1 & m2) & vtest) ^ vtest;
+}
+
+static size_t
+scan_for_byte_with_tags(uint8_t *ptr, size_t limit, uint8_t mask,
+                        uint8_t tag1, uint8_t tag2) {
+  // The way we handle unaligned reads by padding high bytes with zeroes assumes
+  // that all-zeroes is not a matching byte.
+  GC_ASSERT(tag1 && tag2);
+
   size_t n = 0;
   size_t unaligned = ((uintptr_t) ptr) & 7;
   if (unaligned) {
     uint64_t bytes = load_eight_aligned_bytes(ptr - unaligned) >> (unaligned * 8);
-    bytes &= mask;
-    if (bytes)
-      return count_zero_bytes(bytes);
+    uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2);
+    if (match)
+      return count_zero_bytes(match);
     n += 8 - unaligned;
   }
 
   for(; n < limit; n += 8) {
     uint64_t bytes = load_eight_aligned_bytes(ptr + n);
-    bytes &= mask;
-    if (bytes)
-      return n + count_zero_bytes(bytes);
+    uint64_t match = match_bytes_against_2_tags(bytes, mask, tag1, tag2);
+    if (match)
+      return n + count_zero_bytes(match);
   }
 
   return limit;
 }
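
The interesting new routine is match_bytes_against_2_tags: in each byte
lane it keeps only the mark bits, XORs with a tag (yielding zero exactly
on a match), and adds the mask, so that only a non-matching lane carries
into the "test" bit just above the mask.  ANDing the results for the two
tags and XORing with the test pattern therefore sets the test bit
precisely in lanes that match either tag.  Below is a standalone sketch
that checks this arithmetic against a per-byte reference loop; the
harness, the chosen mask and tag values, and the little-endian lane
order are assumptions for illustration, not part of the patch:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static inline uint64_t
    broadcast_byte(uint8_t b) {
      return (uint64_t) b * 0x0101010101010101ULL;
    }

    /* Same arithmetic as match_bytes_against_2_tags in the patch,
       minus the GC_ASSERT preconditions. */
    static inline uint64_t
    match_2_tags(uint64_t bytes, uint8_t mask, uint8_t tag1, uint8_t tag2) {
      uint64_t vmask = broadcast_byte(mask);
      uint64_t vtest = broadcast_byte(mask + 1);
      bytes &= vmask;
      uint64_t m1 = (bytes ^ broadcast_byte(tag1)) + vmask;
      uint64_t m2 = (bytes ^ broadcast_byte(tag2)) + vmask;
      return ((m1 & m2) & vtest) ^ vtest;
    }

    int main(void) {
      /* Mark state lives in the low three bits of each metadata byte. */
      uint8_t meta[8] = { 0x00, 0x02, 0x01, 0x04, 0x03, 0x00, 0x09, 0x05 };
      uint64_t bytes;
      memcpy(&bytes, meta, sizeof bytes);  /* little-endian lane order */
      uint64_t match = match_2_tags(bytes, 0x07, 0x01, 0x03);
      for (int i = 0; i < 8; i++) {
        int swar = (match >> (i * 8)) & 0x08 ? 1 : 0;
        int ref = (meta[i] & 0x07) == 0x01 || (meta[i] & 0x07) == 0x03;
        assert(swar == ref);
        printf("byte %d: 0x%02x -> %s\n", i, meta[i],
               swar ? "match" : "no match");
      }
      return 0;
    }

With mask 0x07 the test bit is 0x08, and a matching lane is one whose
three low bits equal either tag; count_zero_bytes of the match word then
gives the index of the first matching byte, just as in the old
single-pattern scan.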
