wingo pushed a commit to branch wip-whippet
in repository guile.

commit b663e5878e031ac6c4ba25bdd4779543f2f32075
Author: Andy Wingo <wi...@igalia.com>
AuthorDate: Tue Aug 20 14:33:56 2024 +0200

    nofl: Refactor evacuation allocation to be thread-local
    
    This relaxes a "reliability" requirement, as in
    https://wingolog.org/archives/2024/07/10/copying-collectors-with-block-structured-heaps-are-unreliable.
---
 src/nofl-space.h | 1605 ++++++++++++++++++++++++++----------------------------
 src/whippet.c    |   63 ++-
 2 files changed, 815 insertions(+), 853 deletions(-)
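In broad strokes, the change below deletes the shared struct nofl_evacuation_allocator, whose single atomically bumped "allocated" counter was contended by every tracing thread and whose exhaustion ended evacuation for all of them at once, and instead reuses the ordinary per-thread struct nofl_allocator for evacuation, so each thread bumps within a block it owns and only touches shared state when it needs another block.  A rough before-and-after sketch of the two data shapes, copied from the diff with comments added here for orientation:

    // Before: one bump-pointer arena shared by all tracing threads.
    // Every evacuation allocation compare-and-swaps "allocated", a
    // separate cursor chases it to find the corresponding block, and
    // hitting "limit" ends evacuation for every thread.
    struct nofl_evacuation_allocator {
      size_t allocated;        // atomically
      size_t limit;
      uintptr_t block_cursor;  // atomically
    };

    // After: evacuation uses the regular thread-local allocator; the
    // only shared operations are popping a partly-full or empty block
    // when the current one fills up.
    struct nofl_allocator {
      uintptr_t alloc;   // bump pointer within the current hole
      uintptr_t sweep;   // end of the current hole
      uintptr_t block;   // base address of the block being allocated into
    };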

diff --git a/src/nofl-space.h b/src/nofl-space.h
index fd718c962..eba3cd386 100644
--- a/src/nofl-space.h
+++ b/src/nofl-space.h
@@ -34,55 +34,6 @@ STATIC_ASSERT_EQ(NOFL_GRANULE_SIZE, 1 << NOFL_GRANULE_SIZE_LOG_2);
 STATIC_ASSERT_EQ(NOFL_MEDIUM_OBJECT_THRESHOLD,
                  NOFL_MEDIUM_OBJECT_GRANULE_THRESHOLD * NOFL_GRANULE_SIZE);
 
-// Each granule has one mark byte stored in a side table.  A granule's
-// mark state is a whole byte instead of a bit to facilitate parallel
-// marking.  (Parallel markers are allowed to race.)  We also use this
-// byte to compute object extent, via a bit flag indicating
-// end-of-object.
-//
-// Because we want to allow for conservative roots, we need to know
-// whether an address indicates an object or not.  That means that when
-// an object is allocated, it has to set a bit, somewhere.  We use the
-// metadata byte for this purpose, setting the "young" bit.
-//
-// The "young" bit's name might make you think about generational
-// collection, and indeed all objects collected in a minor collection
-// will have this bit set.  However, the nofl space never needs to check
-// for the young bit; if it weren't for the need to identify
-// conservative roots, we wouldn't need a young bit at all.  Perhaps in
-// an all-precise system, we would be able to avoid the overhead of
-// initializing mark byte upon each fresh allocation.
-//
-// When an object becomes dead after a GC, it will still have a bit set
-// -- maybe the young bit, or maybe a survivor bit.  The sweeper has to
-// clear these bits before the next collection.  But, for concurrent
-// marking, we will also be marking "live" objects, updating their mark
-// bits.  So there are four object states concurrently observable:
-// young, dead, survivor, and marked.  (If we didn't have concurrent
-// marking we would still need the "marked" state, because marking
-// mutator roots before stopping is also a form of concurrent marking.)
-// Even though these states are mutually exclusive, we use separate bits
-// for them because we have the space.  After each collection, the dead,
-// survivor, and marked states rotate by one bit.
-enum nofl_metadata_byte {
-  NOFL_METADATA_BYTE_NONE = 0,
-  NOFL_METADATA_BYTE_YOUNG = 1,
-  NOFL_METADATA_BYTE_MARK_0 = 2,
-  NOFL_METADATA_BYTE_MARK_1 = 4,
-  NOFL_METADATA_BYTE_MARK_2 = 8,
-  NOFL_METADATA_BYTE_END = 16,
-  NOFL_METADATA_BYTE_EPHEMERON = 32,
-  NOFL_METADATA_BYTE_PINNED = 64,
-  NOFL_METADATA_BYTE_UNUSED_1 = 128
-};
-
-static uint8_t
-nofl_rotate_dead_survivor_marked(uint8_t mask) {
-  uint8_t all =
-    NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2;
-  return ((mask << 1) | (mask >> 2)) & all;
-}
-
 #define NOFL_SLAB_SIZE (4 * 1024 * 1024)
 #define NOFL_BLOCK_SIZE (64 * 1024)
 #define NOFL_METADATA_BYTES_PER_BLOCK (NOFL_BLOCK_SIZE / NOFL_GRANULE_SIZE)
@@ -144,7 +95,9 @@ struct nofl_block_summary {
       uint16_t hole_count;
       uint16_t free_granules;
       // Counters related to allocation since previous collection:
-      // wasted space due to fragmentation.
+      // wasted space due to fragmentation.  Also used by blocks on the
+      // "partly full" list, which have zero holes_with_fragmentation
+      // but nonzero fragmentation_granules.
       uint16_t holes_with_fragmentation;
       uint16_t fragmentation_granules;
       // After a block is swept, if it's empty it goes on the empties
@@ -173,6 +126,91 @@ struct nofl_slab {
 };
 STATIC_ASSERT_EQ(sizeof(struct nofl_slab), NOFL_SLAB_SIZE);
 
+// Lock-free block list.
+struct nofl_block_list {
+  size_t count;
+  uintptr_t blocks;
+};
+
+struct nofl_space {
+  uint64_t sweep_mask;
+  uint8_t live_mask;
+  uint8_t marked_mask;
+  uint8_t evacuating;
+  uintptr_t low_addr;
+  size_t extent;
+  size_t heap_size;
+  uint8_t last_collection_was_minor;
+  uintptr_t next_block;   // atomically
+  struct nofl_block_list empty;
+  struct nofl_block_list unavailable;
+  struct nofl_block_list partly_full;
+  struct nofl_block_list evacuation_targets;
+  double evacuation_minimum_reserve;
+  double evacuation_reserve;
+  double venerable_threshold;
+  ssize_t pending_unavailable_bytes; // atomically
+  struct nofl_slab *slabs;
+  size_t nslabs;
+  uintptr_t granules_freed_by_last_collection; // atomically
+  uintptr_t fragmentation_granules_since_last_collection; // atomically
+};
+
+struct nofl_allocator {
+  uintptr_t alloc;
+  uintptr_t sweep;
+  uintptr_t block;
+};
+
+// Each granule has one mark byte stored in a side table.  A granule's
+// mark state is a whole byte instead of a bit to facilitate parallel
+// marking.  (Parallel markers are allowed to race.)  We also use this
+// byte to compute object extent, via a bit flag indicating
+// end-of-object.
+//
+// Because we want to allow for conservative roots, we need to know
+// whether an address indicates an object or not.  That means that when
+// an object is allocated, it has to set a bit, somewhere.  We use the
+// metadata byte for this purpose, setting the "young" bit.
+//
+// The "young" bit's name might make you think about generational
+// collection, and indeed all objects collected in a minor collection
+// will have this bit set.  However, the nofl space never needs to check
+// for the young bit; if it weren't for the need to identify
+// conservative roots, we wouldn't need a young bit at all.  Perhaps in
+// an all-precise system, we would be able to avoid the overhead of
+// initializing the mark byte upon each fresh allocation.
+//
+// When an object becomes dead after a GC, it will still have a bit set
+// -- maybe the young bit, or maybe a survivor bit.  The sweeper has to
+// clear these bits before the next collection.  But, for concurrent
+// marking, we will also be marking "live" objects, updating their mark
+// bits.  So there are four object states concurrently observable:
+// young, dead, survivor, and marked.  (If we didn't have concurrent
+// marking we would still need the "marked" state, because marking
+// mutator roots before stopping is also a form of concurrent marking.)
+// Even though these states are mutually exclusive, we use separate bits
+// for them because we have the space.  After each collection, the dead,
+// survivor, and marked states rotate by one bit.
+enum nofl_metadata_byte {
+  NOFL_METADATA_BYTE_NONE = 0,
+  NOFL_METADATA_BYTE_YOUNG = 1,
+  NOFL_METADATA_BYTE_MARK_0 = 2,
+  NOFL_METADATA_BYTE_MARK_1 = 4,
+  NOFL_METADATA_BYTE_MARK_2 = 8,
+  NOFL_METADATA_BYTE_END = 16,
+  NOFL_METADATA_BYTE_EPHEMERON = 32,
+  NOFL_METADATA_BYTE_PINNED = 64,
+  NOFL_METADATA_BYTE_UNUSED_1 = 128
+};
+
+static uint8_t
+nofl_rotate_dead_survivor_marked(uint8_t mask) {
+  uint8_t all =
+    NOFL_METADATA_BYTE_MARK_0 | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2;
+  return ((mask << 1) | (mask >> 2)) & all;
+}
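A small illustration of the rotation described above, not part of the patch: the three mark bits cycle 2 -> 4 -> 8 -> 2, so a byte written with last cycle's mark is read as a survivor under this cycle's masks and, if never re-marked, as dead one cycle after that.

    uint8_t m = NOFL_METADATA_BYTE_MARK_0;    // 0b0010
    m = nofl_rotate_dead_survivor_marked(m);  // 0b0100 (NOFL_METADATA_BYTE_MARK_1)
    m = nofl_rotate_dead_survivor_marked(m);  // 0b1000 (NOFL_METADATA_BYTE_MARK_2)
    m = nofl_rotate_dead_survivor_marked(m);  // 0b0010 (back to MARK_0)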
+
 static struct nofl_slab*
 nofl_object_slab(void *obj) {
   uintptr_t addr = (uintptr_t) obj;
@@ -235,12 +273,6 @@ nofl_block_summary_set_next(struct nofl_block_summary *summary,
     (summary->next_and_flags & (NOFL_BLOCK_SIZE - 1)) | next;
 }
 
-// Lock-free block list.
-struct nofl_block_list {
-  size_t count;
-  uintptr_t blocks;
-};
-
 static void
 nofl_push_block(struct nofl_block_list *list, uintptr_t block) {
   atomic_fetch_add_explicit(&list->count, 1, memory_order_acq_rel);
@@ -267,85 +299,81 @@ nofl_pop_block(struct nofl_block_list *list) {
   return head;
 }
 
-static inline size_t
-nofl_size_to_granules(size_t size) {
-  return (size + NOFL_GRANULE_SIZE - 1) >> NOFL_GRANULE_SIZE_LOG_2;
+static size_t
+nofl_block_count(struct nofl_block_list *list) {
+  return atomic_load_explicit(&list->count, memory_order_acquire);
 }
 
-struct nofl_evacuation_allocator {
-  size_t allocated; // atomically
-  size_t limit;
-  uintptr_t block_cursor; // atomically
-};
-
-struct nofl_space {
-  uint64_t sweep_mask;
-  uint8_t live_mask;
-  uint8_t marked_mask;
-  uint8_t evacuating;
-  uintptr_t low_addr;
-  size_t extent;
-  size_t heap_size;
-  uint8_t last_collection_was_minor;
-  uintptr_t next_block;   // atomically
-  struct nofl_block_list empty;
-  struct nofl_block_list unavailable;
-  struct nofl_block_list evacuation_targets;
-  double evacuation_minimum_reserve;
-  double evacuation_reserve;
-  double venerable_threshold;
-  ssize_t pending_unavailable_bytes; // atomically
-  struct nofl_evacuation_allocator evacuation_allocator;
-  struct nofl_slab *slabs;
-  size_t nslabs;
-  uintptr_t granules_freed_by_last_collection; // atomically
-  uintptr_t fragmentation_granules_since_last_collection; // atomically
-};
-
-struct nofl_allocator {
-  uintptr_t alloc;
-  uintptr_t sweep;
-  uintptr_t block;
-};
+static void
+nofl_push_unavailable_block(struct nofl_space *space, uintptr_t block) {
+  struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
+  GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP));
+  GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE));
+  nofl_block_summary_set_flag(summary, NOFL_BLOCK_UNAVAILABLE);
+  madvise((void*)block, NOFL_BLOCK_SIZE, MADV_DONTNEED);
+  nofl_push_block(&space->unavailable, block);
+}
 
-static inline void
-nofl_clear_memory(uintptr_t addr, size_t size) {
-  memset((char*)addr, 0, size);
+static uintptr_t
+nofl_pop_unavailable_block(struct nofl_space *space) {
+  uintptr_t block = nofl_pop_block(&space->unavailable);
+  if (!block)
+    return 0;
+  struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
+  GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE));
+  nofl_block_summary_clear_flag(summary, NOFL_BLOCK_UNAVAILABLE);
+  return block;
 }
 
-static size_t
-nofl_space_live_object_granules(uint8_t *metadata) {
-  return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1;
+static uintptr_t
+nofl_pop_empty_block(struct nofl_space *space) {
+  return nofl_pop_block(&space->empty);
 }
 
-static inline int
-nofl_space_mark_object(struct nofl_space *space, struct gc_ref ref) {
-  uint8_t *loc = nofl_metadata_byte_for_object(ref);
-  uint8_t byte = *loc;
-  if (byte & space->marked_mask)
+static int
+nofl_maybe_push_evacuation_target(struct nofl_space *space,
+                                  uintptr_t block, double reserve) {
+  GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block),
+                                         NOFL_BLOCK_NEEDS_SWEEP));
+  size_t targets = nofl_block_count(&space->evacuation_targets);
+  size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB;
+  size_t unavailable = nofl_block_count(&space->unavailable);
+  if (targets >= (total - unavailable) * reserve)
     return 0;
-  uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0
-    | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2;
-  *loc = (byte & ~mask) | space->marked_mask;
+
+  nofl_push_block(&space->evacuation_targets, block);
   return 1;
 }
 
-static uintptr_t
-nofl_make_evacuation_allocator_cursor(uintptr_t block, size_t allocated) {
-  GC_ASSERT(allocated < (NOFL_BLOCK_SIZE - 1) * (uint64_t) NOFL_BLOCK_SIZE);
-  return align_down(block, NOFL_BLOCK_SIZE) | (allocated / NOFL_BLOCK_SIZE);
+static int
+nofl_push_evacuation_target_if_needed(struct nofl_space *space,
+                                      uintptr_t block) {
+  return nofl_maybe_push_evacuation_target(space, block,
+                                           space->evacuation_minimum_reserve);
+}
+
+static int
+nofl_push_evacuation_target_if_possible(struct nofl_space *space,
+                                        uintptr_t block) {
+  return nofl_maybe_push_evacuation_target(space, block,
+                                           space->evacuation_reserve);
 }
 
 static void
-nofl_prepare_evacuation_allocator(struct nofl_evacuation_allocator *alloc,
-                                  struct nofl_block_list *targets) {
-  uintptr_t first_block = targets->blocks;
-  atomic_store_explicit(&alloc->allocated, 0, memory_order_release);
-  alloc->limit =
-    atomic_load_explicit(&targets->count, memory_order_acquire) * NOFL_BLOCK_SIZE;
-  atomic_store_explicit(&alloc->block_cursor,
-                        nofl_make_evacuation_allocator_cursor(first_block, 0),
-                        memory_order_release);
+nofl_push_empty_block(struct nofl_space *space, uintptr_t block) {
+  GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block),
+                                         NOFL_BLOCK_NEEDS_SWEEP));
+  nofl_push_block(&space->empty, block);
+}
+
+static inline void
+nofl_clear_memory(uintptr_t addr, size_t size) {
+  memset((char*)addr, 0, size);
+}
+
+static size_t
+nofl_space_live_object_granules(uint8_t *metadata) {
+  return scan_for_byte(metadata, -1, broadcast_byte(NOFL_METADATA_BYTE_END)) + 1;
 }
 
 static void
@@ -360,440 +388,408 @@ nofl_clear_remaining_metadata_bytes_in_block(uintptr_t block,
 }
 
 static void
-nofl_finish_evacuation_allocator_block(uintptr_t block,
-                                       uintptr_t allocated) {
-  GC_ASSERT(allocated <= NOFL_BLOCK_SIZE);
-  struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
-  nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP);
-  size_t fragmentation = (NOFL_BLOCK_SIZE - allocated) >> NOFL_GRANULE_SIZE_LOG_2;
-  summary->hole_count = 1;
-  summary->free_granules = NOFL_GRANULES_PER_BLOCK;
-  summary->holes_with_fragmentation = fragmentation ? 1 : 0;
-  summary->fragmentation_granules = fragmentation;
-  if (fragmentation)
-    nofl_clear_remaining_metadata_bytes_in_block(block, allocated);
+nofl_allocator_reset(struct nofl_allocator *alloc) {
+  alloc->alloc = alloc->sweep = alloc->block = 0;
 }
 
 static void
-nofl_finish_evacuation_allocator(struct nofl_evacuation_allocator *alloc,
-                                 struct nofl_block_list *targets,
-                                 struct nofl_block_list *empties,
-                                 size_t reserve) {
-  // Blocks that we used for evacuation get returned to the mutator as
-  // sweepable blocks.  Blocks that we didn't get to use go to the
-  // empties.
-  size_t allocated = atomic_load_explicit(&alloc->allocated,
-                                          memory_order_acquire);
-  atomic_store_explicit(&alloc->allocated, 0, memory_order_release);
-  if (allocated > alloc->limit)
-    allocated = alloc->limit;
-  while (allocated >= NOFL_BLOCK_SIZE) {
-    uintptr_t block = nofl_pop_block(targets);
-    GC_ASSERT(block);
-    allocated -= NOFL_BLOCK_SIZE;
-  }
-  if (allocated) {
-    // Finish off the last partially-filled block.
-    uintptr_t block = nofl_pop_block(targets);
-    GC_ASSERT(block);
-    nofl_finish_evacuation_allocator_block(block, allocated);
-  }
-  size_t remaining = atomic_load_explicit(&targets->count, memory_order_acquire);
-  while (remaining-- > reserve)
-    nofl_push_block(empties, nofl_pop_block(targets));
-}
+nofl_allocator_release_full_block(struct nofl_allocator *alloc,
+                                  struct nofl_space *space,
+                                  struct nofl_block_summary *summary) {
+  GC_ASSERT(alloc->block);
+  GC_ASSERT(alloc->alloc == alloc->sweep);
+  GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE));
 
-static struct gc_ref
-nofl_evacuation_allocate(struct nofl_space *space, size_t granules) {
-  // All collector threads compete to allocate from what is logically a
-  // single bump-pointer arena, which is actually composed of a linked
-  // list of blocks.
-  struct nofl_evacuation_allocator *alloc = &space->evacuation_allocator;
-  uintptr_t cursor = atomic_load_explicit(&alloc->block_cursor,
-                                          memory_order_acquire);
-  size_t bytes = granules * NOFL_GRANULE_SIZE;
-  size_t prev = atomic_load_explicit(&alloc->allocated, memory_order_acquire);
-  size_t block_mask = (NOFL_BLOCK_SIZE - 1);
-  size_t next;
-  do {
-    if (prev >= alloc->limit)
-      // No more space.
-      return gc_ref_null();
-    next = prev + bytes;
-    if ((prev ^ next) & ~block_mask)
-      // Allocation straddles a block boundary; advance so it starts a
-      // fresh block.
-      next = (next & ~block_mask) + bytes;
-  } while (!atomic_compare_exchange_weak(&alloc->allocated, &prev, next));
-  // OK, we've claimed our memory, starting at next - bytes.  Now find
-  // the node in the linked list of evacuation targets that corresponds
-  // to this allocation pointer.
-  uintptr_t block = cursor & ~block_mask;
-  // This is the SEQ'th block to be allocated into.
-  uintptr_t seq = cursor & block_mask;
-  // Therefore this block handles allocations starting at SEQ*BLOCK_SIZE
-  // and continuing for NOFL_BLOCK_SIZE bytes.
-  uintptr_t base = seq * NOFL_BLOCK_SIZE;
-
-  while ((base ^ next) & ~block_mask) {
-    GC_ASSERT(base < next);
-    if (base + NOFL_BLOCK_SIZE > prev) {
-      // The allocation straddles a block boundary, and the cursor has
-      // caught up so that we identify the block for the previous
-      // allocation pointer.  Finish the previous block, probably
-      // leaving a small hole at the end.
-      nofl_finish_evacuation_allocator_block(block, prev - base);
-    }
-    // Cursor lags; advance it.
-    block = nofl_block_summary_next(nofl_block_summary_for_addr(block));
-    base += NOFL_BLOCK_SIZE;
-    if (base >= alloc->limit) {
-      // Ran out of blocks!
-      GC_ASSERT(!block);
-      return gc_ref_null();
-    }
-    GC_ASSERT(block);
-    // This store can race with other allocators, but that's OK as long
-    // as it never advances the cursor beyond the allocation pointer,
-    // which it won't because we updated the allocation pointer already.
-    atomic_store_explicit(&alloc->block_cursor,
-                          nofl_make_evacuation_allocator_cursor(block, base),
-                          memory_order_release);
-  }
-
-  uintptr_t addr = block + (next & block_mask) - bytes;
-  return gc_ref(addr);
-}
-
-static inline int
-nofl_space_evacuate_or_mark_object(struct nofl_space *space,
-                                   struct gc_edge edge,
-                                   struct gc_ref old_ref) {
-  uint8_t *metadata = nofl_metadata_byte_for_object(old_ref);
-  uint8_t byte = *metadata;
-  if (byte & space->marked_mask)
-    return 0;
-  if (space->evacuating &&
-      nofl_block_summary_has_flag(nofl_block_summary_for_addr(gc_ref_value(old_ref)),
-                                  NOFL_BLOCK_EVACUATE)) {
-    // This is an evacuating collection, and we are attempting to
-    // evacuate this block, and we are tracing this particular object
-    // for what appears to be the first time.
-    struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref);
+  atomic_fetch_add(&space->granules_freed_by_last_collection,
+                   summary->free_granules);
+  atomic_fetch_add(&space->fragmentation_granules_since_last_collection,
+                   summary->fragmentation_granules);
 
-    if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED)
-      gc_atomic_forward_acquire(&fwd);
+  // If this block has mostly survivors, we should avoid sweeping it and
+  // trying to allocate into it for a minor GC.  Sweep it next time to
+  // clear any garbage allocated in this cycle and mark it as
+  // "venerable" (i.e., old).
+  if (!nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP) &&
+      summary->free_granules < NOFL_GRANULES_PER_BLOCK * space->venerable_threshold)
+    nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP);
 
-    switch (fwd.state) {
-    case GC_FORWARDING_STATE_NOT_FORWARDED:
-    case GC_FORWARDING_STATE_ABORTED:
-      // Impossible.
-      GC_CRASH();
-    case GC_FORWARDING_STATE_ACQUIRED: {
-      // We claimed the object successfully; evacuating is up to us.
-      size_t object_granules = nofl_space_live_object_granules(metadata);
-      struct gc_ref new_ref = nofl_evacuation_allocate(space, object_granules);
-      if (gc_ref_is_heap_object(new_ref)) {
-        // Copy object contents before committing, as we don't know what
-        // part of the object (if any) will be overwritten by the
-        // commit.
-        memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref),
-               object_granules * NOFL_GRANULE_SIZE);
-        gc_atomic_forward_commit(&fwd, new_ref);
-        // Now update extent metadata, and indicate to the caller that
-        // the object's fields need to be traced.
-        uint8_t *new_metadata = nofl_metadata_byte_for_object(new_ref);
-        memcpy(new_metadata + 1, metadata + 1, object_granules - 1);
-        gc_edge_update(edge, new_ref);
-        metadata = new_metadata;
-        // Fall through to set mark bits.
-      } else {
-        // Well shucks; allocation failed, marking the end of
-        // opportunistic evacuation.  No future evacuation of this
-        // object will succeed.  Mark in place instead.
-        gc_atomic_forward_abort(&fwd);
-      }
-      break;
-    }
-    case GC_FORWARDING_STATE_BUSY:
-      // Someone else claimed this object first.  Spin until new address
-      // known, or evacuation aborts.
-      for (size_t spin_count = 0;; spin_count++) {
-        if (gc_atomic_forward_retry_busy(&fwd))
-          break;
-        yield_for_spin(spin_count);
-      }
-      if (fwd.state == GC_FORWARDING_STATE_ABORTED)
-        // Remove evacuation aborted; remote will mark and enqueue.
-        return 0;
-      ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED);
-      // Fall through.
-    case GC_FORWARDING_STATE_FORWARDED:
-      // The object has been evacuated already.  Update the edge;
-      // whoever forwarded the object will make sure it's eventually
-      // traced.
-      gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd)));
-      return 0;
-    }
-  }
+  nofl_allocator_reset(alloc);
+}
 
-  uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0
-    | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2;
-  *metadata = (byte & ~mask) | space->marked_mask;
-  return 1;
+static void
+nofl_allocator_release_partly_full_block(struct nofl_allocator *alloc,
+                                         struct nofl_space *space,
+                                         struct nofl_block_summary *summary) {
+  // A block can go on the partly full list if it has exactly one
+  // hole, located at the end of the block.
+  GC_ASSERT(alloc->alloc > alloc->block);
+  GC_ASSERT(alloc->sweep == alloc->block + NOFL_BLOCK_SIZE);
+  GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP));
+  size_t hole_size = alloc->sweep - alloc->alloc;
+  GC_ASSERT(hole_size);
+  summary->fragmentation_granules = hole_size >> NOFL_GRANULE_SIZE_LOG_2;
+  nofl_push_block(&space->partly_full, alloc->block);
+  nofl_allocator_reset(alloc);
 }
 
-static inline int
-nofl_space_contains_address(struct nofl_space *space, uintptr_t addr) {
-  return addr - space->low_addr < space->extent;
+static size_t
+nofl_allocator_acquire_partly_full_block(struct nofl_allocator *alloc,
+                                         struct nofl_space *space) {
+  uintptr_t block = nofl_pop_block(&space->partly_full);
+  if (!block) return 0;
+  struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
+  GC_ASSERT(summary->holes_with_fragmentation == 0);
+  alloc->block = block;
+  alloc->sweep = block + NOFL_BLOCK_SIZE;
+  size_t hole_granules = summary->fragmentation_granules;
+  summary->fragmentation_granules = 0;
+  alloc->alloc = alloc->sweep - (hole_granules << NOFL_GRANULE_SIZE_LOG_2);
+  return hole_granules;
 }
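Note how the two functions above cooperate: a partly-full block has exactly one hole, at its very end, so holes_with_fragmentation stays zero and fragmentation_granules temporarily records the size of that tail hole while the block sits on the partly_full list.  A worked example with illustrative numbers (a 64 kB block and, purely for the arithmetic, 16-byte granules, i.e. NOFL_GRANULE_SIZE_LOG_2 == 4; the real constants are defined elsewhere in this header):

    // release: the allocator stops with 1024 bytes left at the end of
    //          the block, so hole_size = sweep - alloc = 1024 and
    //          summary->fragmentation_granules = 1024 >> 4 = 64; the
    //          block goes onto space->partly_full.
    // acquire: a later thread pops the block, reads hole_granules = 64,
    //          zeroes the field, and sets
    //            alloc = block + NOFL_BLOCK_SIZE - (64 << 4)
    //            sweep = block + NOFL_BLOCK_SIZE,
    //          resuming bump allocation exactly where the previous
    //          owner stopped.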
 
-static inline int
-nofl_space_contains_conservative_ref(struct nofl_space *space,
-                                     struct gc_conservative_ref ref) {
-  return nofl_space_contains_address(space, gc_conservative_ref_value(ref));
+static size_t
+nofl_allocator_acquire_empty_block(struct nofl_allocator *alloc,
+                                   struct nofl_space *space) {
+  uintptr_t block = nofl_pop_empty_block(space);
+  if (!block) return 0;
+  struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
+  summary->hole_count = 1;
+  summary->free_granules = NOFL_GRANULES_PER_BLOCK;
+  summary->holes_with_fragmentation = 0;
+  summary->fragmentation_granules = 0;
+  nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP);
+  alloc->block = alloc->alloc = block;
+  alloc->sweep = block + NOFL_BLOCK_SIZE;
+  nofl_clear_memory(block, NOFL_BLOCK_SIZE);
+  return NOFL_GRANULES_PER_BLOCK;
 }
 
-static inline int
-nofl_space_contains(struct nofl_space *space, struct gc_ref ref) {
-  return nofl_space_contains_address(space, gc_ref_value(ref));
+static void
+nofl_allocator_finish_hole(struct nofl_allocator *alloc) {
+  size_t granules = (alloc->sweep - alloc->alloc) / NOFL_GRANULE_SIZE;
+  if (granules) {
+    struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block);
+    summary->holes_with_fragmentation++;
+    summary->fragmentation_granules += granules;
+    uint8_t *metadata = nofl_metadata_byte_for_addr(alloc->alloc);
+    memset(metadata, 0, granules);
+    alloc->alloc = alloc->sweep;
+  }
 }
 
-static int
-nofl_space_forward_or_mark_if_traced(struct nofl_space *space,
-                                     struct gc_edge edge,
-                                     struct gc_ref ref) {
-  uint8_t *metadata = nofl_metadata_byte_for_object(ref);
-  uint8_t byte = *metadata;
-  if (byte & space->marked_mask)
-    return 1;
+// Sweep some heap to reclaim free space, advancing alloc->alloc and
+// alloc->sweep.  Return the size of the hole in granules, or 0 if we
+// reached the end of the block.
+static size_t
+nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
+                                  uintptr_t sweep_mask) {
+  GC_ASSERT(alloc->block != 0);
+  GC_ASSERT_EQ(alloc->alloc, alloc->sweep);
+  uintptr_t sweep = alloc->sweep;
+  uintptr_t limit = alloc->block + NOFL_BLOCK_SIZE;
 
-  if (!space->evacuating)
-    return 0;
-  if (!nofl_block_summary_has_flag(nofl_block_summary_for_addr(gc_ref_value(ref)),
-                                   NOFL_BLOCK_EVACUATE))
+  if (sweep == limit)
     return 0;
 
-  struct gc_atomic_forward fwd = gc_atomic_forward_begin(ref);
-  switch (fwd.state) {
-  case GC_FORWARDING_STATE_NOT_FORWARDED:
-    return 0;
-  case GC_FORWARDING_STATE_BUSY:
-    // Someone else claimed this object first.  Spin until new address
-    // known, or evacuation aborts.
-    for (size_t spin_count = 0;; spin_count++) {
-      if (gc_atomic_forward_retry_busy(&fwd))
-        break;
-      yield_for_spin(spin_count);
-    }
-    if (fwd.state == GC_FORWARDING_STATE_ABORTED)
-      // Remote evacuation aborted; remote will mark and enqueue.
-      return 1;
-    ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED);
-    // Fall through.
-  case GC_FORWARDING_STATE_FORWARDED:
-    gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd)));
-    return 1;
-  default:
-    GC_CRASH();
+  GC_ASSERT((sweep & (NOFL_GRANULE_SIZE - 1)) == 0);
+  uint8_t* metadata = nofl_metadata_byte_for_addr(sweep);
+  size_t limit_granules = (limit - sweep) >> NOFL_GRANULE_SIZE_LOG_2;
+
+  // Except for when we first get a block, alloc->sweep is positioned
+  // right after a hole, which can point to either the end of the
+  // block or to a live object.  Assume that a live object is more
+  // common.
+  while (limit_granules && (metadata[0] & sweep_mask)) {
+    // Object survived collection; skip over it and continue sweeping.
+    size_t object_granules = nofl_space_live_object_granules(metadata);
+    sweep += object_granules * NOFL_GRANULE_SIZE;
+    limit_granules -= object_granules;
+    metadata += object_granules;
   }
-}
-
-static inline struct gc_ref
-nofl_space_mark_conservative_ref(struct nofl_space *space,
-                                 struct gc_conservative_ref ref,
-                                 int possibly_interior) {
-  uintptr_t addr = gc_conservative_ref_value(ref);
-
-  if (possibly_interior) {
-    addr = align_down(addr, NOFL_GRANULE_SIZE);
-  } else {
-    // Addr not an aligned granule?  Not an object.
-    uintptr_t displacement = addr & (NOFL_GRANULE_SIZE - 1);
-    if (!gc_is_valid_conservative_ref_displacement(displacement))
-      return gc_ref_null();
-    addr -= displacement;
+  if (!limit_granules) {
+    GC_ASSERT_EQ(sweep, limit);
+    alloc->alloc = alloc->sweep = limit;
+    return 0;
   }
 
-  // Addr in meta block?  Not an object.
-  if ((addr & (NOFL_SLAB_SIZE - 1)) < NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE)
-    return gc_ref_null();
-
-  // Addr in block that has been paged out?  Not an object.
-  struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr);
-  if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE))
-    return gc_ref_null();
-
-  uint8_t *loc = nofl_metadata_byte_for_addr(addr);
-  uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed);
-
-  // Already marked object?  Nothing to do.
-  if (byte & space->marked_mask)
-    return gc_ref_null();
-
-  // Addr is the not start of an unmarked object?  Search backwards if
-  // we have interior pointers, otherwise not an object.
-  uint8_t object_start_mask = space->live_mask | NOFL_METADATA_BYTE_YOUNG;
-  if (!(byte & object_start_mask)) {
-    if (!possibly_interior)
-      return gc_ref_null();
-
-    uintptr_t block_base = align_down(addr, NOFL_BLOCK_SIZE);
-    uint8_t *loc_base = nofl_metadata_byte_for_addr(block_base);
-    do {
-      // Searched past block?  Not an object.
-      if (loc-- == loc_base)
-        return gc_ref_null();
-
-      byte = atomic_load_explicit(loc, memory_order_relaxed);
-
-      // Ran into the end of some other allocation?  Not an object, then.
-      if (byte & NOFL_METADATA_BYTE_END)
-        return gc_ref_null();
-
-      // Continue until we find object start.
-    } while (!(byte & object_start_mask));
-
-    // Found object start, and object is unmarked; adjust addr.
-    addr = block_base + (loc - loc_base) * NOFL_GRANULE_SIZE;
-  }
+  size_t free_granules = scan_for_byte(metadata, limit_granules, sweep_mask);
+  GC_ASSERT(free_granules);
+  GC_ASSERT(free_granules <= limit_granules);
 
-  uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0
-    | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2;
-  atomic_store_explicit(loc, (byte & ~mask) | space->marked_mask,
-                        memory_order_relaxed);
+  struct nofl_block_summary *summary = nofl_block_summary_for_addr(sweep);
+  summary->hole_count++;
+  GC_ASSERT(free_granules <= NOFL_GRANULES_PER_BLOCK - summary->free_granules);
+  summary->free_granules += free_granules;
 
-  return gc_ref(addr);
+  size_t free_bytes = free_granules * NOFL_GRANULE_SIZE;
+  alloc->alloc = sweep;
+  alloc->sweep = sweep + free_bytes;
+  return free_granules;
 }
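To make the sweep above concrete, here is one hypothetical run of metadata bytes and what a single call does with it (illustration only; M stands for any byte with a sweep_mask bit set, E for the END bit on the last granule of a live object, and 0 for a dead or already-cleared byte):

    //   metadata:  M  E  0  0  0  M ...
    //
    // The leading M starts a live object; nofl_space_live_object_granules
    // scans to the END bit, so the object is two granules long and the
    // skip loop steps past it.  scan_for_byte then reports that the next
    // byte matching sweep_mask is three bytes away, giving free_granules
    // of 3: alloc is left pointing at the three-granule hole, sweep just
    // past it, and the block summary's hole counters are updated.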
 
-static inline size_t
-nofl_space_object_size(struct nofl_space *space, struct gc_ref ref) {
-  uint8_t *loc = nofl_metadata_byte_for_object(ref);
-  size_t granules = nofl_space_live_object_granules(loc);
-  return granules * NOFL_GRANULE_SIZE;
+static void
+nofl_allocator_finish_sweeping_in_block(struct nofl_allocator *alloc,
+                                        uintptr_t sweep_mask) {
+  do {
+    nofl_allocator_finish_hole(alloc);
+  } while (nofl_allocator_next_hole_in_block(alloc, sweep_mask));
 }
 
 static void
-nofl_push_unavailable_block(struct nofl_space *space, uintptr_t block) {
-  struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
-  GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP));
-  GC_ASSERT(!nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE));
-  nofl_block_summary_set_flag(summary, NOFL_BLOCK_UNAVAILABLE);
-  madvise((void*)block, NOFL_BLOCK_SIZE, MADV_DONTNEED);
-  nofl_push_block(&space->unavailable, block);
+nofl_allocator_release_block(struct nofl_allocator *alloc,
+                             struct nofl_space *space) {
+  GC_ASSERT(alloc->block);
+  struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block);
+  if (alloc->alloc < alloc->sweep &&
+      alloc->sweep == alloc->block + NOFL_BLOCK_SIZE &&
+      summary->holes_with_fragmentation == 0) {
+    nofl_allocator_release_partly_full_block(alloc, space, summary);
+  } else {
+    nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask);
+    nofl_allocator_release_full_block(alloc, space, summary);
+  }
 }
 
-static uintptr_t
-nofl_pop_unavailable_block(struct nofl_space *space) {
-  uintptr_t block = nofl_pop_block(&space->unavailable);
-  if (!block)
-    return 0;
-  struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
-  GC_ASSERT(nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE));
-  nofl_block_summary_clear_flag(summary, NOFL_BLOCK_UNAVAILABLE);
-  return block;
+static void
+nofl_allocator_finish(struct nofl_allocator *alloc, struct nofl_space *space) {
+  if (alloc->block)
+    nofl_allocator_release_block(alloc, space);
 }
 
 static uintptr_t
-nofl_pop_empty_block(struct nofl_space *space) {
-  return nofl_pop_block(&space->empty);
-}
+nofl_space_next_block_to_sweep(struct nofl_space *space) {
+  uintptr_t block = atomic_load_explicit(&space->next_block,
+                                         memory_order_acquire);
+  uintptr_t next_block;
+  do {
+    if (block == 0)
+      return 0;
 
-static int
-nofl_maybe_push_evacuation_target(struct nofl_space *space,
-                                  uintptr_t block, double reserve) {
-  GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block),
-                                         NOFL_BLOCK_NEEDS_SWEEP));
-  size_t targets = atomic_load_explicit(&space->evacuation_targets.count,
-                                        memory_order_acquire);
-  size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB;
-  size_t unavailable = atomic_load_explicit(&space->unavailable.count,
-                                            memory_order_acquire);
-  if (targets >= (total - unavailable) * reserve)
+    next_block = block + NOFL_BLOCK_SIZE;
+    if (next_block % NOFL_SLAB_SIZE == 0) {
+      uintptr_t hi_addr = space->low_addr + space->extent;
+      if (next_block == hi_addr)
+        next_block = 0;
+      else
+        next_block += NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE;
+    }
+  } while (!atomic_compare_exchange_weak(&space->next_block, &block,
+                                         next_block));
+  return block;
+}
+
+static int
+nofl_maybe_release_swept_empty_block(struct nofl_allocator *alloc,
+                                     struct nofl_space *space) {
+  GC_ASSERT(alloc->block);
+  uintptr_t block = alloc->block;
+  if (atomic_load_explicit(&space->pending_unavailable_bytes,
+                           memory_order_acquire) <= 0)
     return 0;
 
-  nofl_push_block(&space->evacuation_targets, block);
+  nofl_push_unavailable_block(space, block);
+  atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE);
+  nofl_allocator_reset(alloc);
   return 1;
 }
 
-static int
-nofl_push_evacuation_target_if_needed(struct nofl_space *space,
-                                      uintptr_t block) {
-  return nofl_maybe_push_evacuation_target(space, block,
-                                           space->evacuation_minimum_reserve);
-}
+static size_t
+nofl_allocator_next_hole(struct nofl_allocator *alloc,
+                         struct nofl_space *space) {
+  nofl_allocator_finish_hole(alloc);
+  // As we sweep, if we find that a block is empty, we return it to the
+  // empties list.  Empties are precious.  But if we return 10 blocks in
+  // a row, and still find an 11th empty, go ahead and use it.
+  size_t empties_countdown = 10;
+  while (1) {
+    // Sweep current block for a hole.
+    if (alloc->block) {
+      struct nofl_block_summary *summary =
+        nofl_block_summary_for_addr(alloc->block);
+      size_t granules =
+        nofl_allocator_next_hole_in_block(alloc, space->sweep_mask);
+      if (granules) {
+        // If the hole spans only part of a block, let the allocator try
+        // to use it.
+        if (granules < NOFL_GRANULES_PER_BLOCK)
+          return granules;
+        // Otherwise we have an empty block.
+        nofl_clear_remaining_metadata_bytes_in_block(alloc->block, 0);
+        nofl_block_summary_clear_flag(summary, NOFL_BLOCK_NEEDS_SWEEP);
+        // If we need an evacuation reserve block, take it.
+        if (nofl_push_evacuation_target_if_needed(space, alloc->block)) {
+          nofl_allocator_reset(alloc);
+          continue;
+        }
+        // If we have pending pages to release to the OS, we should unmap
+        // this block.
+        if (nofl_maybe_release_swept_empty_block(alloc, space))
+          continue;
+        // Otherwise if we've already returned lots of empty blocks to the
+        // freelist, let the allocator keep this block.
+        if (!empties_countdown) {
+          // After this block is allocated into, it will need to be swept.
+          nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP);
+          return granules;
+        }
+        // Otherwise we push to the empty blocks list.
+        nofl_push_empty_block(space, alloc->block);
+        nofl_allocator_reset(alloc);
+        empties_countdown--;
+      } else {
+        nofl_allocator_release_full_block(alloc, space, summary);
+      }
+    }
 
-static int
-nofl_push_evacuation_target_if_possible(struct nofl_space *space,
-                                        uintptr_t block) {
-  return nofl_maybe_push_evacuation_target(space, block,
-                                           space->evacuation_reserve);
-}
+    GC_ASSERT(alloc->block == 0);
 
-static void
-nofl_push_empty_block(struct nofl_space *space, uintptr_t block) {
-  GC_ASSERT(!nofl_block_summary_has_flag(nofl_block_summary_for_addr(block),
-                                         NOFL_BLOCK_NEEDS_SWEEP));
-  nofl_push_block(&space->empty, block);
-}
+    {
+      size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space);
+      if (granules)
+        return granules;
+    }
 
-static ssize_t
-nofl_space_request_release_memory(struct nofl_space *space, size_t bytes) {
-  return atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes;
+    while (1) {
+      uintptr_t block = nofl_space_next_block_to_sweep(space);
+      if (block) {
+        // Sweeping found a block.  We might take it for allocation, or
+        // we might send it back.
+        struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
+        // If it's marked unavailable, it's already on a list of
+        // unavailable blocks, so skip and get the next block.
+        if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE))
+          continue;
+        if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE)) {
+          // Skip venerable blocks after a minor GC -- we don't need to
+          // sweep as they weren't allocated into last cycle, and the
+          // mark bytes didn't rotate, so we have no cleanup to do; and
+          // we shouldn't try to allocate into them as it's not worth
+          // it.  Any wasted space is measured as fragmentation.
+          if (space->last_collection_was_minor)
+            continue;
+          else
+            nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE);
+        }
+        if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) {
+          // Prepare to sweep the block for holes.
+          alloc->alloc = alloc->sweep = alloc->block = block;
+          if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP)) {
+            // In the last cycle we noted that this block consists of
+            // mostly old data.  Sweep any garbage, commit the mark as
+            // venerable, and avoid allocating into it.
+            nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP);
+            if (space->last_collection_was_minor) {
+              nofl_allocator_finish_sweeping_in_block(alloc, space->sweep_mask);
+              nofl_allocator_release_full_block(alloc, space, summary);
+              nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE);
+              continue;
+            }
+          }
+          // This block was marked in the last GC and needs sweeping.
+          // As we sweep we'll want to record how many bytes were live
+          // at the last collection.  As we allocate we'll record how
+          // many granules were wasted because of fragmentation.
+          summary->hole_count = 0;
+          summary->free_granules = 0;
+          summary->holes_with_fragmentation = 0;
+          summary->fragmentation_granules = 0;
+          break;
+        } else {
+          // Otherwise this block is completely empty and is on the
+          // empties list.  We take from the empties list only after all
+          // the NEEDS_SWEEP blocks are processed.
+          continue;
+        }
+      } else {
+        // We are done sweeping for blocks.  Now take from the empties
+        // list.
+        block = nofl_pop_empty_block(space);
+        // No empty block?  Return 0 to cause collection.
+        if (!block)
+          return 0;
+
+        // Maybe we should use this empty as a target for evacuation.
+        if (nofl_push_evacuation_target_if_possible(space, block))
+          continue;
+
+        // Otherwise give the block to the allocator.
+        struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
+        nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP);
+        summary->hole_count = 1;
+        summary->free_granules = NOFL_GRANULES_PER_BLOCK;
+        summary->holes_with_fragmentation = 0;
+        summary->fragmentation_granules = 0;
+        alloc->block = block;
+        alloc->alloc = block;
+        alloc->sweep = block + NOFL_BLOCK_SIZE;
+        return NOFL_GRANULES_PER_BLOCK;
+      }
+    }
+  }
 }
 
-static void
-nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) {
-  ssize_t pending =
-    atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes;
-  while (pending + NOFL_BLOCK_SIZE <= 0) {
-    uintptr_t block = nofl_pop_unavailable_block(space);
-    GC_ASSERT(block);
-    if (nofl_push_evacuation_target_if_needed(space, block))
-      continue;
-    nofl_push_empty_block(space, block);
-    pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE)
-      + NOFL_BLOCK_SIZE;
+static struct gc_ref
+nofl_allocate(struct nofl_allocator *alloc, struct nofl_space *space,
+              size_t size, void (*gc)(void*), void *gc_data) {
+  GC_ASSERT(size > 0);
+  GC_ASSERT(size <= gc_allocator_large_threshold());
+  size = align_up(size, NOFL_GRANULE_SIZE);
+
+  if (alloc->alloc + size > alloc->sweep) {
+    size_t granules = size >> NOFL_GRANULE_SIZE_LOG_2;
+    while (1) {
+      size_t hole = nofl_allocator_next_hole(alloc, space);
+      if (hole >= granules) {
+        nofl_clear_memory(alloc->alloc, hole * NOFL_GRANULE_SIZE);
+        break;
+      }
+      if (!hole)
+        gc(gc_data);
+    }
   }
+
+  struct gc_ref ret = gc_ref(alloc->alloc);
+  alloc->alloc += size;
+  gc_update_alloc_table(ret, size);
+  return ret;
 }
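For orientation, a sketch of what a caller of nofl_allocate might look like.  The mutator structure, its embedded allocator, and the collect callback are hypothetical stand-ins, not names from this patch; only the nofl_* calls are real:

    // Hypothetical caller, illustration only.
    struct example_mutator {
      struct nofl_allocator allocator;   // thread-local, zero-initialized
      struct nofl_space *space;
    };

    static void
    example_collect(void *data) {
      struct example_mutator *mut = data;
      // ... stop the world, trace, prepare for sweeping, etc. ...
    }

    static struct gc_ref
    example_allocate_small(struct example_mutator *mut, size_t bytes) {
      // nofl_allocate sweeps until it finds a hole of at least "bytes"
      // (rounded up to granules) and invokes example_collect whenever
      // it runs out of holes entirely.
      return nofl_allocate(&mut->allocator, mut->space, bytes,
                           example_collect, mut);
    }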
 
 static size_t
-nofl_allocator_next_hole(struct nofl_allocator *alloc,
-                         struct nofl_space *space);
+nofl_allocator_acquire_evacuation_block(struct nofl_allocator* alloc,
+                                        struct nofl_space *space) {
+  size_t granules = nofl_allocator_acquire_partly_full_block(alloc, space);
+  if (granules)
+    return granules;
+  return nofl_allocator_acquire_empty_block(alloc, space);
+}
 
-static int
-nofl_space_sweep_until_memory_released(struct nofl_space *space,
-                                       struct nofl_allocator *alloc) {
-  ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes,
-                                         memory_order_acquire);
-  // First try to unmap previously-identified empty blocks.  If pending
-  // > 0 and other mutators happen to identify empty blocks, they will
-  // be unmapped directly and moved to the unavailable list.
-  while (pending > 0) {
-    uintptr_t block = nofl_pop_empty_block(space);
-    if (!block)
-      break;
-    // Note that we may have competing uses; if we're evacuating,
-    // perhaps we should push this block to the evacuation target list.
-    // That would enable us to reach a fragmentation low water-mark in
-    // fewer cycles.  But maybe evacuation started in order to obtain
-    // free blocks for large objects; in that case we should just reap
-    // the fruits of our labor.  Probably this second use-case is more
-    // important.
-    nofl_push_unavailable_block(space, block);
-    pending = atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE);
-    pending -= NOFL_BLOCK_SIZE;
-  }
-  // Otherwise, sweep, transitioning any empty blocks to unavailable and
-  // throwing away any non-empty block.  A bit wasteful but hastening
-  // the next collection is a reasonable thing to do here.
-  while (pending > 0) {
-    if (!nofl_allocator_next_hole(alloc, space))
-      return 0;
-    pending = atomic_load_explicit(&space->pending_unavailable_bytes,
-                                   memory_order_acquire);
+static struct gc_ref
+nofl_evacuation_allocate(struct nofl_allocator* alloc, struct nofl_space *space,
+                         size_t granules) {
+  size_t avail = (alloc->sweep - alloc->alloc) >> NOFL_GRANULE_SIZE_LOG_2;
+  while (avail < granules) {
+    if (alloc->block) {
+      nofl_allocator_finish_hole(alloc);
+      nofl_allocator_release_full_block(alloc, space,
+                                        nofl_block_summary_for_addr(alloc->block));
+    }
+    avail = nofl_allocator_acquire_evacuation_block(alloc, space);
+    if (!avail)
+      return gc_ref_null();
   }
-  return pending <= 0;
+
+  struct gc_ref ret = gc_ref(alloc->alloc);
+  alloc->alloc += granules * NOFL_GRANULE_SIZE;
+  gc_update_alloc_table(ret, granules * NOFL_GRANULE_SIZE);
+  return ret;
+}
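This function is the heart of the thread-local scheme named in the commit message: each tracing thread now carries its own struct nofl_allocator for evacuation and pulls partly-full or empty blocks on demand, rather than all threads bumping one shared cursor.  A hedged sketch of how a per-thread tracing loop might drive it via nofl_space_evacuate_or_mark_object; the worker and its queue are hypothetical, only the nofl_* calls are from this patch:

    // Hypothetical per-thread driver, illustration only.
    static void
    example_trace_worker(struct nofl_space *space) {
      struct nofl_allocator evacuate = { 0, 0, 0 };  // thread-local state
      struct gc_edge edge;
      struct gc_ref old_ref;
      // Hypothetical queue: yields an edge plus the object it points to.
      while (example_pop_edge(&edge, &old_ref)) {
        // Copies into this thread's evacuation block if the object's
        // block is being evacuated, otherwise marks in place; returns
        // nonzero if the caller should go on to trace the object's fields.
        if (nofl_space_evacuate_or_mark_object(space, edge, old_ref,
                                               &evacuate))
          example_enqueue_fields(edge);              // hypothetical
      }
      // Hand any partly-used evacuation block back to the space.
      nofl_allocator_finish(&evacuate, space);
    }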
+
+// Another thread is triggering GC.  Before we stop, finish clearing the
+// dead mark bytes for the mutator's block, and release the block.
+static void
+nofl_finish_sweeping(struct nofl_allocator *alloc,
+                     struct nofl_space *space) {
+  while (nofl_allocator_next_hole(alloc, space)) {}
 }
 
 static inline int
@@ -810,11 +806,6 @@ nofl_space_set_ephemeron_flag(struct gc_ref ref) {
   }
 }
 
-static void nofl_finish_sweeping(struct nofl_allocator *alloc,
-                                 struct nofl_space *space);
-static void nofl_finish_sweeping_in_block(struct nofl_allocator *alloc,
-                                          struct nofl_space *space);
-
 // Note that it's quite possible (and even likely) that any given remset
 // byte doesn't hold any roots, if all stores were to nursery objects.
 STATIC_ASSERT_EQ(NOFL_GRANULES_PER_REMSET_BYTE % 8, 0);
@@ -902,9 +893,8 @@ nofl_space_yield(struct nofl_space *space) {
 }
 
 static size_t
-nofl_space_evacuation_reserve(struct nofl_space *space) {
-  return atomic_load_explicit(&space->evacuation_targets.count,
-                              memory_order_acquire) * NOFL_BLOCK_SIZE;
+nofl_space_evacuation_reserve_bytes(struct nofl_space *space) {
+  return nofl_block_count(&space->evacuation_targets) * NOFL_BLOCK_SIZE;
 }
 
 static size_t
@@ -914,40 +904,26 @@ nofl_space_fragmentation(struct nofl_space *space) {
 }
 
 static void
-nofl_space_release_evacuation_target_blocks(struct nofl_space *space) {
-  // Move excess evacuation target blocks back to empties.
-  size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB;
-  size_t unavailable = atomic_load_explicit(&space->unavailable.count,
-                                            memory_order_acquire);
-  size_t reserve = space->evacuation_minimum_reserve * (total - unavailable);
-  nofl_finish_evacuation_allocator(&space->evacuation_allocator,
-                                   &space->evacuation_targets,
-                                   &space->empty,
-                                   reserve);
-}
-
-static void
-nofl_space_prepare_for_evacuation(struct nofl_space *space,
-                                  enum gc_collection_kind gc_kind) {
-  if (gc_kind != GC_COLLECTION_COMPACTING) {
-    space->evacuating = 0;
-    space->evacuation_reserve = space->evacuation_minimum_reserve;
-    return;
+nofl_space_prepare_evacuation(struct nofl_space *space) {
+  GC_ASSERT(!space->evacuating);
+  {
+    uintptr_t block;
+    while ((block = nofl_pop_block(&space->evacuation_targets)))
+      nofl_push_empty_block(space, block);
   }
-
-  // Put the mutator into evacuation mode, collecting up to 50% of free space as
-  // evacuation blocks.
-  space->evacuation_reserve = 0.5;
-
-  size_t target_blocks = space->evacuation_targets.count;
+  size_t target_blocks = nofl_block_count(&space->empty);
   DEBUG("evacuation target block count: %zu\n", target_blocks);
 
   if (target_blocks == 0) {
-    DEBUG("no evacuation target blocks, disabling evacuation for this 
round\n");
-    space->evacuating = 0;
+    DEBUG("no evacuation target blocks, not evacuating this round\n");
     return;
   }
 
+  // Put the mutator into evacuation mode, collecting up to 50% of free
+  // space as evacuation blocks.
+  space->evacuation_reserve = 0.5;
+  space->evacuating = 1;
+
   size_t target_granules = target_blocks * NOFL_GRANULES_PER_BLOCK;
   // Compute histogram where domain is the number of granules in a block
   // that survived the last collection, aggregated into 33 buckets, and
@@ -1011,15 +987,43 @@ nofl_space_prepare_for_evacuation(struct nofl_space *space,
       }
     }
   }
+}
 
-  // We are ready to evacuate!
-  nofl_prepare_evacuation_allocator(&space->evacuation_allocator,
-                                    &space->evacuation_targets);
-  space->evacuating = 1;
+static void
+nofl_space_finish_evacuation(struct nofl_space *space) {
+  // When evacuation began, the evacuation reserve was moved to the
+  // empties list.  Now that evacuation is finished, attempt to
+  // repopulate the reserve.
+  GC_ASSERT(space->evacuating);
+  space->evacuating = 0;
+  space->evacuation_reserve = space->evacuation_minimum_reserve;
+  size_t total = space->nslabs * NOFL_NONMETA_BLOCKS_PER_SLAB;
+  size_t unavailable = nofl_block_count(&space->unavailable);
+  size_t reserve = space->evacuation_minimum_reserve * (total - unavailable);
+  GC_ASSERT(nofl_block_count(&space->evacuation_targets) == 0);
+  while (reserve--) {
+    uintptr_t block = nofl_pop_block(&space->empty);
+    if (!block) break;
+    nofl_push_block(&space->evacuation_targets, block);
+  }
+  {
+    // FIXME: We should avoid sweeping partly full blocks, but it's too annoying
+    // to do at the moment given the way sweeping works.
+    uintptr_t block;
+    do {
+      block = nofl_pop_block(&space->partly_full);
+    } while (block);
+  }
+}
+
+static inline size_t
+nofl_size_to_granules(size_t size) {
+  return (size + NOFL_GRANULE_SIZE - 1) >> NOFL_GRANULE_SIZE_LOG_2;
 }
 
 static void
 nofl_space_verify_before_restart(struct nofl_space *space) {
+  GC_ASSERT_EQ(nofl_block_count(&space->partly_full), 0);
   // Iterate objects in each block, verifying that the END bytes correspond to
   // the measured object size.
   for (size_t slab = 0; slab < space->nslabs; slab++) {
@@ -1056,333 +1060,278 @@ nofl_space_verify_before_restart(struct nofl_space *space) {
 static void
 nofl_space_finish_gc(struct nofl_space *space,
                      enum gc_collection_kind gc_kind) {
-  space->evacuating = 0;
   space->last_collection_was_minor = (gc_kind == GC_COLLECTION_MINOR);
+  if (space->evacuating)
+    nofl_space_finish_evacuation(space);
   nofl_space_reset_sweeper(space);
   nofl_space_update_mark_patterns(space, 0);
   nofl_space_reset_statistics(space);
-  nofl_space_release_evacuation_target_blocks(space);
   if (GC_DEBUG)
     nofl_space_verify_before_restart(space);
 }
 
-static int
-nofl_sweep_byte(uint8_t *loc, uintptr_t sweep_mask) {
-  uint8_t metadata = atomic_load_explicit(loc, memory_order_relaxed);
-  // If the metadata byte is nonzero, that means either a young, dead,
-  // survived, or marked object.  If it's live (survived or marked), we
-  // found the next mark.  Otherwise it's dead and we clear the byte.
-  // If we see an END, that means an end of a dead object; clear it.
-  if (metadata) {
-    if (metadata & sweep_mask)
-      return 1;
-    atomic_store_explicit(loc, 0, memory_order_relaxed);
-  }
-  return 0;
+static ssize_t
+nofl_space_request_release_memory(struct nofl_space *space, size_t bytes) {
+  return atomic_fetch_add(&space->pending_unavailable_bytes, bytes) + bytes;
 }
 
-static int
-nofl_sweep_word(uintptr_t *loc, uintptr_t sweep_mask) {
-  uintptr_t metadata = atomic_load_explicit(loc, memory_order_relaxed);
-  if (metadata) {
-    if (metadata & sweep_mask)
-      return 1;
-    atomic_store_explicit(loc, 0, memory_order_relaxed);
+static void
+nofl_space_reacquire_memory(struct nofl_space *space, size_t bytes) {
+  ssize_t pending =
+    atomic_fetch_sub(&space->pending_unavailable_bytes, bytes) - bytes;
+  while (pending + NOFL_BLOCK_SIZE <= 0) {
+    uintptr_t block = nofl_pop_unavailable_block(space);
+    GC_ASSERT(block);
+    if (!nofl_push_evacuation_target_if_needed(space, block))
+      nofl_push_empty_block(space, block);
+    pending = atomic_fetch_add(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE)
+      + NOFL_BLOCK_SIZE;
   }
-  return 0;
 }
 
-static uintptr_t
-nofl_space_next_block_to_sweep(struct nofl_space *space) {
-  uintptr_t block = atomic_load_explicit(&space->next_block,
+static int
+nofl_space_sweep_until_memory_released(struct nofl_space *space,
+                                       struct nofl_allocator *alloc) {
+  ssize_t pending = atomic_load_explicit(&space->pending_unavailable_bytes,
                                          memory_order_acquire);
-  uintptr_t next_block;
-  do {
-    if (block == 0)
+  // First try to unmap previously-identified empty blocks.  If pending
+  // > 0 and other mutators happen to identify empty blocks, they will
+  // be unmapped directly and moved to the unavailable list.
+  while (pending > 0) {
+    uintptr_t block = nofl_pop_empty_block(space);
+    if (!block)
+      break;
+    // Note that we may have competing uses; if we're evacuating,
+    // perhaps we should push this block to the evacuation target list.
+    // That would enable us to reach a fragmentation low water-mark in
+    // fewer cycles.  But maybe evacuation started in order to obtain
+    // free blocks for large objects; in that case we should just reap
+    // the fruits of our labor.  Probably this second use-case is more
+    // important.
+    nofl_push_unavailable_block(space, block);
+    pending = atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE);
+    pending -= NOFL_BLOCK_SIZE;
+  }
+  // Otherwise, sweep, transitioning any empty blocks to unavailable and
+  // throwing away any non-empty block.  A bit wasteful but hastening
+  // the next collection is a reasonable thing to do here.
+  while (pending > 0) {
+    if (!nofl_allocator_next_hole(alloc, space))
       return 0;
+    pending = atomic_load_explicit(&space->pending_unavailable_bytes,
+                                   memory_order_acquire);
+  }
+  return pending <= 0;
+}
 
-    next_block = block + NOFL_BLOCK_SIZE;
-    if (next_block % NOFL_SLAB_SIZE == 0) {
-      uintptr_t hi_addr = space->low_addr + space->extent;
-      if (next_block == hi_addr)
-        next_block = 0;
-      else
-        next_block += NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE;
+static inline int
+nofl_space_evacuate_or_mark_object(struct nofl_space *space,
+                                   struct gc_edge edge,
+                                   struct gc_ref old_ref,
+                                   struct nofl_allocator *evacuate) {
+  uint8_t *metadata = nofl_metadata_byte_for_object(old_ref);
+  uint8_t byte = *metadata;
+  if (byte & space->marked_mask)
+    return 0;
+  if (space->evacuating &&
+      nofl_block_summary_has_flag(nofl_block_summary_for_addr(gc_ref_value(old_ref)),
+                                  NOFL_BLOCK_EVACUATE)) {
+    // This is an evacuating collection, and we are attempting to
+    // evacuate this block, and we are tracing this particular object
+    // for what appears to be the first time.
+    struct gc_atomic_forward fwd = gc_atomic_forward_begin(old_ref);
+
+    if (fwd.state == GC_FORWARDING_STATE_NOT_FORWARDED)
+      gc_atomic_forward_acquire(&fwd);
+
+    switch (fwd.state) {
+    case GC_FORWARDING_STATE_NOT_FORWARDED:
+    case GC_FORWARDING_STATE_ABORTED:
+      // Impossible.
+      GC_CRASH();
+    case GC_FORWARDING_STATE_ACQUIRED: {
+      // We claimed the object successfully; evacuating is up to us.
+      size_t object_granules = nofl_space_live_object_granules(metadata);
+      struct gc_ref new_ref = nofl_evacuation_allocate(evacuate, space,
+                                                       object_granules);
+      if (gc_ref_is_heap_object(new_ref)) {
+        // Copy object contents before committing, as we don't know what
+        // part of the object (if any) will be overwritten by the
+        // commit.
+        memcpy(gc_ref_heap_object(new_ref), gc_ref_heap_object(old_ref),
+               object_granules * NOFL_GRANULE_SIZE);
+        gc_atomic_forward_commit(&fwd, new_ref);
+        // Now update extent metadata, and indicate to the caller that
+        // the object's fields need to be traced.
+        uint8_t *new_metadata = nofl_metadata_byte_for_object(new_ref);
+        memcpy(new_metadata + 1, metadata + 1, object_granules - 1);
+        gc_edge_update(edge, new_ref);
+        metadata = new_metadata;
+        // Fall through to set mark bits.
+      } else {
+        // Well shucks; allocation failed, marking the end of
+        // opportunistic evacuation.  No future evacuation of this
+        // object will succeed.  Mark in place instead.
+        gc_atomic_forward_abort(&fwd);
+      }
+      break;
     }
-  } while (!atomic_compare_exchange_weak(&space->next_block, &block,
-                                         next_block));
-  return block;
-}
+    case GC_FORWARDING_STATE_BUSY:
+      // Someone else claimed this object first.  Spin until new address
+      // known, or evacuation aborts.
+      for (size_t spin_count = 0;; spin_count++) {
+        if (gc_atomic_forward_retry_busy(&fwd))
+          break;
+        yield_for_spin(spin_count);
+      }
+      if (fwd.state == GC_FORWARDING_STATE_ABORTED)
+        // Remote evacuation aborted; remote will mark and enqueue.
+        return 0;
+      ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED);
+      // Fall through.
+    case GC_FORWARDING_STATE_FORWARDED:
+      // The object has been evacuated already.  Update the edge;
+      // whoever forwarded the object will make sure it's eventually
+      // traced.
+      gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd)));
+      return 0;
+    }
+  }
 
-static void
-nofl_allocator_release_block(struct nofl_allocator *alloc) {
-  alloc->alloc = alloc->sweep = alloc->block = 0;
+  uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0
+    | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2;
+  *metadata = (byte & ~mask) | space->marked_mask;
+  return 1;
 }
 
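The claim/copy/publish protocol in the ACQUIRED and BUSY arms above can be
pictured with one forwarding word per object.  The sketch below is an
illustration under that assumption, not the gc_atomic_forward API, and it
leaves out the aborted state that nofl uses when an evacuation allocation
fails and the object is marked in place instead:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <sched.h>

    // 0 = not forwarded, 1 = busy (a copier holds the claim),
    // anything else = the forwarded address.
    static uintptr_t
    forward(_Atomic uintptr_t *fwd_word,
            uintptr_t (*copy_object)(void *), void *obj) {
      uintptr_t state = 0;
      if (atomic_compare_exchange_strong(fwd_word, &state, 1)) {
        // We won the claim: copy the object, then publish its new address.
        uintptr_t new_addr = copy_object(obj);
        atomic_store(fwd_word, new_addr);
        return new_addr;
      }
      // Someone else won the claim; spin until the new address is published.
      while ((state = atomic_load(fwd_word)) == 1)
        sched_yield();
      return state;
    }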
-static void
-nofl_allocator_finish_block(struct nofl_allocator *alloc,
-                            struct nofl_space *space) {
-  GC_ASSERT(alloc->block);
-  struct nofl_block_summary *block = nofl_block_summary_for_addr(alloc->block);
-  atomic_fetch_add(&space->granules_freed_by_last_collection,
-                   block->free_granules);
-  atomic_fetch_add(&space->fragmentation_granules_since_last_collection,
-                   block->fragmentation_granules);
+static inline int
+nofl_space_contains_address(struct nofl_space *space, uintptr_t addr) {
+  return addr - space->low_addr < space->extent;
+}
 
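nofl_space_contains_address gets away with a single comparison because the
subtraction is unsigned: an address below low_addr wraps around to a huge
value and fails the `< extent` test, just like an address past the end does.
The same trick in isolation:

    #include <stdint.h>
    #include <stddef.h>

    // Returns nonzero iff addr lies in [low, low + extent).
    static int
    in_range(uintptr_t addr, uintptr_t low, size_t extent) {
      return addr - low < extent;   // wraps to a huge value when addr < low
    }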
-  // If this block has mostly survivors, we should avoid sweeping it and
-  // trying to allocate into it for a minor GC.  Sweep it next time to
-  // clear any garbage allocated in this cycle and mark it as
-  // "venerable" (i.e., old).
-  GC_ASSERT(!nofl_block_summary_has_flag(block, NOFL_BLOCK_VENERABLE));
-  if (!nofl_block_summary_has_flag(block, NOFL_BLOCK_VENERABLE_AFTER_SWEEP) &&
-      block->free_granules < NOFL_GRANULES_PER_BLOCK * space->venerable_threshold)
-    nofl_block_summary_set_flag(block, NOFL_BLOCK_VENERABLE_AFTER_SWEEP);
+static inline int
+nofl_space_contains_conservative_ref(struct nofl_space *space,
+                                     struct gc_conservative_ref ref) {
+  return nofl_space_contains_address(space, gc_conservative_ref_value(ref));
+}
 
-  nofl_allocator_release_block(alloc);
+static inline int
+nofl_space_contains(struct nofl_space *space, struct gc_ref ref) {
+  return nofl_space_contains_address(space, gc_ref_value(ref));
 }
 
-// Sweep some heap to reclaim free space, resetting alloc->alloc and
-// alloc->sweep.  Return the size of the hole in granules.
-static size_t
-nofl_allocator_next_hole_in_block(struct nofl_allocator *alloc,
-                                  struct nofl_space *space) {
-  uintptr_t sweep = alloc->sweep;
-  if (sweep == 0)
-    return 0;
-  uintptr_t limit = alloc->block + NOFL_BLOCK_SIZE;
-  uintptr_t sweep_mask = space->sweep_mask;
+static int
+nofl_space_forward_or_mark_if_traced(struct nofl_space *space,
+                                     struct gc_edge edge,
+                                     struct gc_ref ref) {
+  uint8_t *metadata = nofl_metadata_byte_for_object(ref);
+  uint8_t byte = *metadata;
+  if (byte & space->marked_mask)
+    return 1;
 
-  while (sweep != limit) {
-    GC_ASSERT((sweep & (NOFL_GRANULE_SIZE - 1)) == 0);
-    uint8_t* metadata = nofl_metadata_byte_for_addr(sweep);
-    size_t limit_granules = (limit - sweep) >> NOFL_GRANULE_SIZE_LOG_2;
+  if (!space->evacuating)
+    return 0;
+  if (!nofl_block_summary_has_flag(nofl_block_summary_for_addr(gc_ref_value(ref)),
+                                   NOFL_BLOCK_EVACUATE))
+    return 0;
 
-    // Except for when we first get a block, alloc->sweep is positioned
-    // right after a hole, which can point to either the end of the
-    // block or to a live object.  Assume that a live object is more
-    // common.
-    {
-      size_t live_granules = 0;
-      while (limit_granules && (metadata[0] & sweep_mask)) {
-        // Object survived collection; skip over it and continue sweeping.
-        size_t object_granules = nofl_space_live_object_granules(metadata);
-        live_granules += object_granules;
-        limit_granules -= object_granules;
-        metadata += object_granules;
-      }
-      if (!limit_granules)
+  struct gc_atomic_forward fwd = gc_atomic_forward_begin(ref);
+  switch (fwd.state) {
+  case GC_FORWARDING_STATE_NOT_FORWARDED:
+    return 0;
+  case GC_FORWARDING_STATE_BUSY:
+    // Someone else claimed this object first.  Spin until new address
+    // known, or evacuation aborts.
+    for (size_t spin_count = 0;; spin_count++) {
+      if (gc_atomic_forward_retry_busy(&fwd))
         break;
-      sweep += live_granules * NOFL_GRANULE_SIZE;
+      yield_for_spin(spin_count);
     }
-
-    size_t free_granules = scan_for_byte(metadata, limit_granules, sweep_mask);
-    GC_ASSERT(free_granules);
-    GC_ASSERT(free_granules <= limit_granules);
-
-    struct nofl_block_summary *summary = nofl_block_summary_for_addr(sweep);
-    summary->hole_count++;
-    GC_ASSERT(free_granules <= NOFL_GRANULES_PER_BLOCK - summary->free_granules);
-    summary->free_granules += free_granules;
-
-    size_t free_bytes = free_granules * NOFL_GRANULE_SIZE;
-    alloc->alloc = sweep;
-    alloc->sweep = sweep + free_bytes;
-    return free_granules;
+    if (fwd.state == GC_FORWARDING_STATE_ABORTED)
+      // Remote evacuation aborted; remote will mark and enqueue.
+      return 1;
+    ASSERT(fwd.state == GC_FORWARDING_STATE_FORWARDED);
+    // Fall through.
+  case GC_FORWARDING_STATE_FORWARDED:
+    gc_edge_update(edge, gc_ref(gc_atomic_forward_address(&fwd)));
+    return 1;
+  default:
+    GC_CRASH();
   }
-
-  nofl_allocator_finish_block(alloc, space);
-  return 0;
 }
 
-static void
-nofl_allocator_finish_hole(struct nofl_allocator *alloc) {
-  size_t granules = (alloc->sweep - alloc->alloc) / NOFL_GRANULE_SIZE;
-  if (granules) {
-    struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block);
-    summary->holes_with_fragmentation++;
-    summary->fragmentation_granules += granules;
-    uint8_t *metadata = nofl_metadata_byte_for_addr(alloc->alloc);
-    memset(metadata, 0, granules);
-    alloc->alloc = alloc->sweep;
+static inline struct gc_ref
+nofl_space_mark_conservative_ref(struct nofl_space *space,
+                                 struct gc_conservative_ref ref,
+                                 int possibly_interior) {
+  uintptr_t addr = gc_conservative_ref_value(ref);
+
+  if (possibly_interior) {
+    addr = align_down(addr, NOFL_GRANULE_SIZE);
+  } else {
+    // Addr not an aligned granule?  Not an object.
+    uintptr_t displacement = addr & (NOFL_GRANULE_SIZE - 1);
+    if (!gc_is_valid_conservative_ref_displacement(displacement))
+      return gc_ref_null();
+    addr -= displacement;
   }
-  // FIXME: add to fragmentation
-}
 
-static int
-nofl_maybe_release_swept_empty_block(struct nofl_allocator *alloc,
-                                     struct nofl_space *space) {
-  GC_ASSERT(alloc->block);
-  uintptr_t block = alloc->block;
-  if (atomic_load_explicit(&space->pending_unavailable_bytes,
-                           memory_order_acquire) <= 0)
-    return 0;
+  // Addr in meta block?  Not an object.
+  if ((addr & (NOFL_SLAB_SIZE - 1)) < NOFL_META_BLOCKS_PER_SLAB * NOFL_BLOCK_SIZE)
+    return gc_ref_null();
 
-  nofl_push_unavailable_block(space, block);
-  atomic_fetch_sub(&space->pending_unavailable_bytes, NOFL_BLOCK_SIZE);
-  nofl_allocator_release_block(alloc);
-  return 1;
-}
+  // Addr in block that has been paged out?  Not an object.
+  struct nofl_block_summary *summary = nofl_block_summary_for_addr(addr);
+  if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE))
+    return gc_ref_null();
 
-static size_t
-nofl_allocator_next_hole(struct nofl_allocator *alloc,
-                         struct nofl_space *space) {
-  nofl_allocator_finish_hole(alloc);
-  // As we sweep if we find that a block is empty, we return it to the
-  // empties list.  Empties are precious.  But if we return 10 blocks in
-  // a row, and still find an 11th empty, go ahead and use it.
-  size_t empties_countdown = 10;
-  while (1) {
-    // Sweep current block for a hole.
-    size_t granules = nofl_allocator_next_hole_in_block(alloc, space);
-    if (granules) {
-      // If the hole spans only part of a block, let the allocator try
-      // to use it.
-      if (granules < NOFL_GRANULES_PER_BLOCK)
-        return granules;
-      struct nofl_block_summary *summary = nofl_block_summary_for_addr(alloc->block);
-      memset(nofl_metadata_byte_for_addr(alloc->block), 0, NOFL_GRANULES_PER_BLOCK);
-      nofl_block_summary_clear_flag(summary, NOFL_BLOCK_NEEDS_SWEEP);
-      // Sweeping found a completely empty block.  If we are below the
-      // minimum evacuation reserve, take the block.
-      if (nofl_push_evacuation_target_if_needed(space, alloc->block)) {
-        nofl_allocator_release_block(alloc);
-        continue;
-      }
-      // If we have pending pages to release to the OS, we should unmap
-      // this block.
-      if (nofl_maybe_release_swept_empty_block(alloc, space))
-        continue;
-      // Otherwise if we've already returned lots of empty blocks to the
-      // freelist, let the allocator keep this block.
-      if (!empties_countdown) {
-        // After this block is allocated into, it will need to be swept.
-        nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP);
-        return granules;
-      }
-      // Otherwise we push to the empty blocks list.
-      nofl_push_empty_block(space, alloc->block);
-      nofl_allocator_release_block(alloc);
-      empties_countdown--;
-    }
-    GC_ASSERT(alloc->block == 0);
-    while (1) {
-      uintptr_t block = nofl_space_next_block_to_sweep(space);
-      if (block) {
-        // Sweeping found a block.  We might take it for allocation, or
-        // we might send it back.
-        struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
-        // If it's marked unavailable, it's already on a list of
-        // unavailable blocks, so skip and get the next block.
-        if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_UNAVAILABLE))
-          continue;
-        if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE)) {
-          // Skip venerable blocks after a minor GC -- we don't need to
-          // sweep as they weren't allocated into last cycle, and the
-          // mark bytes didn't rotate, so we have no cleanup to do; and
-          // we shouldn't try to allocate into them as it's not worth
-          // it.  Any wasted space is measured as fragmentation.
-          if (space->last_collection_was_minor)
-            continue;
-          else
-            nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE);
-        }
-        if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_NEEDS_SWEEP)) {
-          // Prepare to sweep the block for holes.
-          alloc->alloc = alloc->sweep = alloc->block = block;
-          if (nofl_block_summary_has_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP)) {
-            // In the last cycle we noted that this block consists of
-            // mostly old data.  Sweep any garbage, commit the mark as
-            // venerable, and avoid allocating into it.
-            nofl_block_summary_clear_flag(summary, NOFL_BLOCK_VENERABLE_AFTER_SWEEP);
-            if (space->last_collection_was_minor) {
-              nofl_finish_sweeping_in_block(alloc, space);
-              nofl_block_summary_set_flag(summary, NOFL_BLOCK_VENERABLE);
-              continue;
-            }
-          }
-          // This block was marked in the last GC and needs sweeping.
-          // As we sweep we'll want to record how many bytes were live
-          // at the last collection.  As we allocate we'll record how
-          // many granules were wasted because of fragmentation.
-          summary->hole_count = 0;
-          summary->free_granules = 0;
-          summary->holes_with_fragmentation = 0;
-          summary->fragmentation_granules = 0;
-          break;
-        } else {
-          // Otherwise this block is completely empty and is on the
-          // empties list.  We take from the empties list only after all
-          // the NEEDS_SWEEP blocks are processed.
-          continue;
-        }
-      } else {
-        // We are done sweeping for blocks.  Now take from the empties
-        // list.
-        block = nofl_pop_empty_block(space);
-        // No empty block?  Return 0 to cause collection.
-        if (!block)
-          return 0;
+  uint8_t *loc = nofl_metadata_byte_for_addr(addr);
+  uint8_t byte = atomic_load_explicit(loc, memory_order_relaxed);
 
-        // Maybe we should use this empty as a target for evacuation.
-        if (nofl_push_evacuation_target_if_possible(space, block))
-          continue;
+  // Already marked object?  Nothing to do.
+  if (byte & space->marked_mask)
+    return gc_ref_null();
 
-        // Otherwise give the block to the allocator.
-        struct nofl_block_summary *summary = nofl_block_summary_for_addr(block);
-        nofl_block_summary_set_flag(summary, NOFL_BLOCK_NEEDS_SWEEP);
-        summary->hole_count = 1;
-        summary->free_granules = NOFL_GRANULES_PER_BLOCK;
-        summary->holes_with_fragmentation = 0;
-        summary->fragmentation_granules = 0;
-        alloc->block = block;
-        alloc->alloc = block;
-        alloc->sweep = block + NOFL_BLOCK_SIZE;
-        return NOFL_GRANULES_PER_BLOCK;
-      }
-    }
-  }
-}
+  // Addr is not the start of an unmarked object?  Search backwards if
+  // we have interior pointers, otherwise not an object.
+  uint8_t object_start_mask = space->live_mask | NOFL_METADATA_BYTE_YOUNG;
+  if (!(byte & object_start_mask)) {
+    if (!possibly_interior)
+      return gc_ref_null();
 
-static void
-nofl_finish_sweeping_in_block(struct nofl_allocator *alloc,
-                              struct nofl_space *space) {
-  do {
-    nofl_allocator_finish_hole(alloc);
-  } while (nofl_allocator_next_hole_in_block(alloc, space));
-}
+    uintptr_t block_base = align_down(addr, NOFL_BLOCK_SIZE);
+    uint8_t *loc_base = nofl_metadata_byte_for_addr(block_base);
+    do {
+      // Searched past block?  Not an object.
+      if (loc-- == loc_base)
+        return gc_ref_null();
 
-// Another thread is triggering GC.  Before we stop, finish clearing the
-// dead mark bytes for the mutator's block, and release the block.
-static void
-nofl_finish_sweeping(struct nofl_allocator *alloc,
-                     struct nofl_space *space) {
-  while (nofl_allocator_next_hole(alloc, space)) {}
-}
+      byte = atomic_load_explicit(loc, memory_order_relaxed);
 
-static struct gc_ref
-nofl_allocate(struct nofl_allocator *alloc, struct nofl_space *space,
-              size_t size, void (*gc)(void*), void *gc_data) {
-  GC_ASSERT(size > 0);
-  GC_ASSERT(size <= gc_allocator_large_threshold());
-  size = align_up(size, NOFL_GRANULE_SIZE);
+      // Ran into the end of some other allocation?  Not an object, then.
+      if (byte & NOFL_METADATA_BYTE_END)
+        return gc_ref_null();
 
-  if (alloc->alloc + size > alloc->sweep) {
-    size_t granules = size >> NOFL_GRANULE_SIZE_LOG_2;
-    while (1) {
-      size_t hole = nofl_allocator_next_hole(alloc, space);
-      if (hole >= granules) {
-        nofl_clear_memory(alloc->alloc, hole * NOFL_GRANULE_SIZE);
-        break;
-      }
-      if (!hole)
-        gc(gc_data);
-    }
+      // Continue until we find object start.
+    } while (!(byte & object_start_mask));
+
+    // Found object start, and object is unmarked; adjust addr.
+    addr = block_base + (loc - loc_base) * NOFL_GRANULE_SIZE;
   }
 
-  struct gc_ref ret = gc_ref(alloc->alloc);
-  alloc->alloc += size;
-  gc_update_alloc_table(ret, size);
-  return ret;
+  uint8_t mask = NOFL_METADATA_BYTE_YOUNG | NOFL_METADATA_BYTE_MARK_0
+    | NOFL_METADATA_BYTE_MARK_1 | NOFL_METADATA_BYTE_MARK_2;
+  atomic_store_explicit(loc, (byte & ~mask) | space->marked_mask,
+                        memory_order_relaxed);
+
+  return gc_ref(addr);
+}
+
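For possibly-interior references, the loop above walks the side table
backwards from the referenced granule: it fails if it falls off the block base
or runs into another object's END byte, and succeeds when it reaches a byte
carrying an object-start bit.  A standalone sketch of that search over one
block's metadata (the masks and indices here are illustrative):

    #include <stdint.h>
    #include <stddef.h>

    // Return the index of the first granule of the object covering `idx`,
    // or -1 if `idx` does not point into an object.
    static ptrdiff_t
    find_object_start(const uint8_t *meta, size_t idx,
                      uint8_t start_mask, uint8_t end_mask) {
      if (meta[idx] & start_mask)
        return (ptrdiff_t)idx;          // idx is itself an object start
      for (size_t i = idx; i-- > 0;) {  // scan strictly before idx
        if (meta[i] & end_mask)
          return -1;                    // hit the previous object's last granule
        if (meta[i] & start_mask)
          return (ptrdiff_t)i;          // found the covering object's start
      }
      return -1;                        // searched past the block base
    }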
+static inline size_t
+nofl_space_object_size(struct nofl_space *space, struct gc_ref ref) {
+  uint8_t *loc = nofl_metadata_byte_for_object(ref);
+  size_t granules = nofl_space_live_object_granules(loc);
+  return granules * NOFL_GRANULE_SIZE;
 }
 
 static struct nofl_slab*
diff --git a/src/whippet.c b/src/whippet.c
index 76f8f1ed5..6e942d7da 100644
--- a/src/whippet.c
+++ b/src/whippet.c
@@ -91,6 +91,10 @@ struct gc_mutator {
   struct gc_mutator *next;
 };
 
+struct gc_trace_worker_data {
+  struct nofl_allocator allocator;
+};
+
 static inline struct nofl_space*
 heap_nofl_space(struct gc_heap *heap) {
   return &heap->nofl_space;
@@ -108,16 +112,34 @@ mutator_heap(struct gc_mutator *mutator) {
   return mutator->heap;
 }
 
+static void
+gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer,
+                                         struct gc_heap *heap,
+                                         struct gc_trace_worker *worker,
+                                         struct gc_trace_worker_data *data),
+                               struct gc_tracer *tracer,
+                               struct gc_heap *heap,
+                               struct gc_trace_worker *worker) {
+  struct gc_trace_worker_data data;
+  nofl_allocator_reset(&data.allocator);
+  f(tracer, heap, worker, &data);
+  nofl_allocator_finish(&data.allocator, heap_nofl_space(heap));
+}
+
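This is the whippet.c half of the refactor: each trace worker now keeps its
own nofl_allocator on the stack for the duration of a trace, so evacuation
allocation is thread-local and needs no locking, and the allocator is flushed
back to the space when the worker's trace function returns.  As a standalone
model of that reset/use/finish lifecycle (the names below are illustrative,
not the whippet API):

    // Per-worker scratch for evacuation copies; reset before tracing,
    // flushed back to the shared space afterwards, on the worker's own thread.
    struct scratch_region { char *hot; char *limit; };

    static void
    call_with_scratch(void (*body)(struct scratch_region *scratch, void *arg),
                      void *arg,
                      void (*flush)(struct scratch_region *scratch)) {
      struct scratch_region scratch = { 0, 0 };  // reset: no region held yet
      body(&scratch, arg);                       // tracing bump-allocates from here
      flush(&scratch);                           // return leftover space to the heap
    }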
 static void collect(struct gc_mutator *mut,
                     enum gc_collection_kind requested_kind) GC_NEVER_INLINE;
 
 static inline int
-do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref) {
+do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref,
+         struct gc_trace_worker *worker) {
   if (!gc_ref_is_heap_object(ref))
     return 0;
-  if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref)))
-    return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref);
-  else if (large_object_space_contains(heap_large_object_space(heap), ref))
+  if (GC_LIKELY(nofl_space_contains(heap_nofl_space(heap), ref))) {
+    struct nofl_allocator *alloc =
+      worker ? &gc_trace_worker_data(worker)->allocator : NULL;
+    return nofl_space_evacuate_or_mark_object(heap_nofl_space(heap), edge, ref,
+                                              alloc);
+  } else if (large_object_space_contains(heap_large_object_space(heap), ref))
     return large_object_space_mark_object(heap_large_object_space(heap),
                                           ref);
   else
@@ -125,12 +147,13 @@ do_trace(struct gc_heap *heap, struct gc_edge edge, struct gc_ref ref) {
 }
 
 static inline int trace_edge(struct gc_heap *heap,
-                             struct gc_edge edge) GC_ALWAYS_INLINE;
+                             struct gc_edge edge,
+                             struct gc_trace_worker *worker) GC_ALWAYS_INLINE;
 
 static inline int
-trace_edge(struct gc_heap *heap, struct gc_edge edge) {
+trace_edge(struct gc_heap *heap, struct gc_edge edge, struct gc_trace_worker *worker) {
   struct gc_ref ref = gc_edge_ref(edge);
-  int is_new = do_trace(heap, edge, ref);
+  int is_new = do_trace(heap, edge, ref, worker);
 
   if (is_new &&
       GC_UNLIKELY(atomic_load_explicit(&heap->check_pending_ephemerons,
@@ -340,23 +363,12 @@ gc_heap_set_extern_space(struct gc_heap *heap, struct gc_extern_space *space) {
   heap->extern_space = space;
 }
 
-static void
-gc_trace_worker_call_with_data(void (*f)(struct gc_tracer *tracer,
-                                         struct gc_heap *heap,
-                                         struct gc_trace_worker *worker,
-                                         struct gc_trace_worker_data *data),
-                               struct gc_tracer *tracer,
-                               struct gc_heap *heap,
-                               struct gc_trace_worker *worker) {
-  f(tracer, heap, worker, NULL);
-}
-
 static inline void tracer_visit(struct gc_edge edge, struct gc_heap *heap,
                                 void *trace_data) GC_ALWAYS_INLINE;
 static inline void
 tracer_visit(struct gc_edge edge, struct gc_heap *heap, void *trace_data) {
   struct gc_trace_worker *worker = trace_data;
-  if (trace_edge(heap, edge))
+  if (trace_edge(heap, edge, worker))
     gc_trace_worker_enqueue(worker, gc_edge_ref(edge));
 }
 
@@ -364,7 +376,7 @@ static void
 trace_and_enqueue_locally(struct gc_edge edge, struct gc_heap *heap,
                           void *data) {
   struct gc_mutator *mut = data;
-  if (trace_edge(heap, edge))
+  if (trace_edge(heap, edge, NULL))
     mutator_mark_buf_push(&mut->mark_buf, gc_edge_ref(edge));
 }
 
@@ -396,7 +408,7 @@ trace_conservative_ref_and_enqueue_locally(struct gc_conservative_ref ref,
 static void
 trace_and_enqueue_globally(struct gc_edge edge, struct gc_heap *heap,
                            void *unused) {
-  if (trace_edge(heap, edge))
+  if (trace_edge(heap, edge, NULL))
     gc_tracer_enqueue_root(&heap->tracer, gc_edge_ref(edge));
 }
 
@@ -643,7 +655,7 @@ trace_mutator_roots_after_stop(struct gc_heap *heap) {
   atomic_store(&heap->mutator_trace_list, NULL);
 
   for (struct gc_mutator *mut = heap->inactive_mutators; mut; mut = mut->next) {
-    nofl_finish_sweeping_in_block(&mut->allocator, heap_nofl_space(heap));
+    nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap));
     trace_mutator_roots_with_lock(mut);
   }
 }
@@ -719,7 +731,7 @@ pause_mutator_for_collection_with_lock(struct gc_mutator *mut) {
   struct gc_heap *heap = mutator_heap(mut);
   GC_ASSERT(mutators_are_stopping(heap));
   MUTATOR_EVENT(mut, mutator_stopping);
-  nofl_finish_sweeping_in_block(&mut->allocator, heap_nofl_space(heap));
+  nofl_allocator_finish(&mut->allocator, heap_nofl_space(heap));
   gc_stack_capture_hot(&mut->stack);
   if (mutator_should_mark_while_stopping(mut))
     // No need to collect results in mark buf; we can enqueue roots directly.
@@ -760,7 +772,7 @@ static double
 heap_last_gc_yield(struct gc_heap *heap) {
   struct nofl_space *nofl_space = heap_nofl_space(heap);
   size_t nofl_yield = nofl_space_yield(nofl_space);
-  size_t evacuation_reserve = nofl_space_evacuation_reserve(nofl_space);
+  size_t evacuation_reserve = nofl_space_evacuation_reserve_bytes(nofl_space);
   // FIXME: Size nofl evacuation reserve based on size of nofl space,
   // not heap size.
   size_t minimum_evacuation_reserve =
@@ -1030,7 +1042,8 @@ collect(struct gc_mutator *mut, enum gc_collection_kind requested_kind) {
   DEBUG("last gc yield: %f; fragmentation: %f\n", yield, fragmentation);
   detect_out_of_memory(heap);
   trace_pinned_roots_after_stop(heap);
-  nofl_space_prepare_for_evacuation(nofl_space, gc_kind);
+  if (gc_kind == GC_COLLECTION_COMPACTING)
+    nofl_space_prepare_evacuation(nofl_space);
   trace_roots_after_stop(heap);
   HEAP_EVENT(heap, roots_traced);
   gc_tracer_trace(&heap->tracer);
