Author: Armin Rigo <ar...@tunes.org>
Branch: c7
Changeset: r593:a7e3185f3ead
Date: 2014-01-02 10:26 +0100
http://bitbucket.org/pypy/stmgc/changeset/a7e3185f3ead/
Log:    Initial checkin of the code from
        https://bitbucket.org/arigo/arigo/raw/default/hack/stm/c7

diff --git a/c7/core.c b/c7/core.c
new file mode 100644
--- /dev/null
+++ b/c7/core.c
@@ -0,0 +1,648 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+
+#include "core.h"
+#include "list.h"
+#include "pagecopy.h"
+
+
+#define NB_PAGES            (256*256)   // 256MB
+#define NB_THREADS          2
+#define MAP_PAGES_FLAGS     (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE)
+#define LARGE_OBJECT_WORDS  36
+
+
+typedef TLPREFIX char localchar_t;
+typedef TLPREFIX struct alloc_for_size_s alloc_for_size_t;
+typedef TLPREFIX struct _thread_local2_s _thread_local2_t;
+
+
+struct alloc_for_size_s {
+    localchar_t *next;
+    uint16_t start, stop;
+    bool flag_partial_page;
+};
+
+struct _thread_local2_s {
+    struct _thread_local1_s _tl1;
+    int thread_num;
+    char *thread_base;
+    struct stm_list_s *modified_objects;
+    struct stm_list_s *new_object_ranges;
+    struct alloc_for_size_s alloc[LARGE_OBJECT_WORDS];
+};
+#define _STM_TL2   ((_thread_local2_t *)_STM_TL1)
+
+enum { SHARED_PAGE=0, REMAPPING_PAGE, PRIVATE_PAGE };   /* flag_page_private */
+
+
+static char *object_pages;
+static char *undo_log_pages;
+static char *undo_log_current;
+static int num_threads_started, leader_thread_num;
+static uintptr_t index_page_never_used;
+static int next_write_version;
+static int undo_lock;
+static struct stm_list_s *global_history;
+static uint16_t gh_write_version_first;
+static uint16_t gh_write_version_last;
+static uint8_t flag_page_private[NB_PAGES];   /* xxx_PAGE constants above */
+
+
+/************************************************************/
+
+static void spin_loop(void)
+{
+    asm("pause" : : : "memory");
+}
+
+static void acquire_lock(int *lock)
+{
+    while (__sync_lock_test_and_set(lock, 1) != 0) {
+        while (*lock != 0)
+            spin_loop();
+    }
+}
+
+#define ACQUIRE_LOCK_IF(lock, condition)                \
+({                                                      \
+    bool _acquired = false;                             \
+    while (condition) {                                 \
+        if (__sync_lock_test_and_set(lock, 1) == 0) {   \
+            if (condition)                              \
+                _acquired = true;                       \
+            else                                        \
+                __sync_lock_release(lock);              \
+            break;                                      \
+        }                                               \
+        spin_loop();                                    \
+    }                                                   \
+    _acquired;                                          \
+})
+
+static void release_lock(int *lock)
+{
+    __sync_lock_release(lock);
+}
+
+static void write_fence(void)
+{
+#if defined(__amd64__) || defined(__i386__)
+    asm("" : : : "memory");
+#else
+# error "Define write_fence() for your architecture"
+#endif
+}
+
+static bool _stm_was_read(object_t *obj)
+{
+    read_marker_t *marker = (read_marker_t *)(((uintptr_t)obj) >> 4);
+    return (marker->rm == _STM_TL1->transaction_read_version);
+}
+
+
+static void _stm_privatize(uintptr_t pagenum)
+{
+    if (flag_page_private[pagenum] == PRIVATE_PAGE)
+        return;
+
+    if (!__sync_bool_compare_and_swap(&flag_page_private[pagenum],
+                                      SHARED_PAGE, REMAPPING_PAGE)) {
+        while (flag_page_private[pagenum] == REMAPPING_PAGE)
+            spin_loop();
+        assert(flag_page_private[pagenum] == PRIVATE_PAGE);
+        return;
+    }
+
+    ssize_t pgoff1 = pagenum;
+    ssize_t pgoff2 = pagenum + NB_PAGES;
+    ssize_t localpgoff = pgoff1 + NB_PAGES * _STM_TL2->thread_num;
+    ssize_t otherpgoff = pgoff1 + NB_PAGES * (1 - _STM_TL2->thread_num);
+
+    void *localpg = object_pages + localpgoff * 4096UL;
+    void *otherpg = object_pages + otherpgoff * 4096UL;
+
+    int res = remap_file_pages(localpg, 4096, 0, pgoff2, 0);
+    if (res < 0) {
+        perror("remap_file_pages");
+        abort();
+    }
+    pagecopy(localpg, otherpg);
+    write_fence();
+    assert(flag_page_private[pagenum] == REMAPPING_PAGE);
+    flag_page_private[pagenum] = PRIVATE_PAGE;
+}
+
+
+#define REAL_ADDRESS(object_pages, src)   ((object_pages) + (uintptr_t)(src))
+
+static char *real_address(uintptr_t src)
+{
+    return REAL_ADDRESS(_STM_TL2->thread_base, src);
+}
+
+static char *get_thread_base(long thread_num)
+{
+    return object_pages + thread_num * (NB_PAGES * 4096UL);
+}
+
+void stm_abort_transaction(void);
+
+enum detect_conflicts_e { CANNOT_CONFLICT, CAN_CONFLICT };
+
+/* XXX this can be done by acquiring the undo_lock for much less time,
+   but it needs to be carefully synchronized with _stm_write_slowpath().
+   For now it must be called with the undo_lock acquired. */
+static void update_to_current_version(enum detect_conflicts_e check_conflict)
+{
+    /* Loop over objects in 'global_history': if they have been
+       read by the current transaction, the current transaction must
+       abort; then copy them out of the leader's object space ---
+       which may have been modified by the leader's uncommitted
+       transaction; this case will be fixed afterwards.
+    */
+    bool conflict_found_or_dont_check = (check_conflict == CANNOT_CONFLICT);
+    char *local_base = _STM_TL2->thread_base;
+    char *remote_base = get_thread_base(1 - _STM_TL2->thread_num);
+    struct stm_list_s *gh, *gh_next;
+
+    assert(leader_thread_num != _STM_TL2->thread_num);
+
+    for (gh = global_history; gh != NULL; gh = gh_next) {
+
+        STM_LIST_FOREACH(gh, ({
+
+            if (!conflict_found_or_dont_check)
+                conflict_found_or_dont_check = _stm_was_read(item);
+
+            char *dst = REAL_ADDRESS(local_base, item);
+            char *src = REAL_ADDRESS(remote_base, item);
+            char *src_rebased = src - (uintptr_t)local_base;
+            size_t size = stm_object_size_rounded_up((object_t *)src_rebased);
+
+            memcpy(dst + sizeof(char *),
+                   src + sizeof(char *),
+                   size - sizeof(char *));
+        }));
+
+        gh_next = gh->nextlist;
+        stm_list_free(gh);
+    }
+    global_history = NULL;
+    gh_write_version_first = 0xffff;
+    gh_write_version_last = 0;
+
+    /* Finally, loop over objects modified by the leader,
+       and copy them out of the undo log.
+    */
+    char *undo = undo_log_pages;
+    char *undo_end = undo_log_current;
+
+    while (undo < undo_end) {
+
+        char *src = undo;
+        char *dst = REAL_ADDRESS(local_base, *(object_t **)src);
+        char *src_rebased = src - (uintptr_t)local_base;
+
+        *(char **)src = *(char **)dst;   /* fix the first word of the
+                                            object in the undo log, for
+                                            stm_object_size_rounded_up() */
+        size_t size = stm_object_size_rounded_up((object_t *)src_rebased);
+
+        memcpy(dst + sizeof(char *),
+               src + sizeof(char *),
+               size - sizeof(char *));
+
+        undo += size;
+    }
+    undo_log_current = undo_log_pages;   /* make empty again */
+
+    if (conflict_found_or_dont_check && check_conflict == CAN_CONFLICT) {
+        release_lock(&undo_lock);
+        stm_abort_transaction();
+    }
+}
+
+static void maybe_update(enum detect_conflicts_e check_conflict)
+{
+    if (leader_thread_num != _STM_TL2->thread_num && global_history != NULL) {
+        acquire_lock(&undo_lock);
+        update_to_current_version(check_conflict);
+        release_lock(&undo_lock);
+    }
+}
+
+
+void _stm_write_slowpath(object_t *obj)
+{
+    maybe_update(CAN_CONFLICT);
+
+    _stm_privatize(((uintptr_t)obj) / 4096);
+
+    stm_read(obj);
+
+    _STM_TL2->modified_objects = stm_list_append(_STM_TL2->modified_objects, obj);
+
+    uint16_t wv = obj->write_version;
+    obj->write_version = _STM_TL1->transaction_write_version;
+
+    /* We only need to store a copy of the current version of the object if:
+       - we are the leader;
+       - the object is present in the global_history.
+
+       The second condition is approximated by the following range
+       check.  Storing a few more objects than strictly needed is not
+       really a problem.
+    */
+    /* XXX this can be done without acquiring the undo_lock at all,
+       but we need more care in update_to_current_version(). */
+
+    /* XXX can we avoid writing an unbounded number of copies of the
+       same object in case we run a lot of transactions while the other
+       thread is busy?  An unlikely case, but annoying in theory.
+       Should we bound the undo log's size to much less than NB_PAGES
+       anyway, and sleep here if it is full?  Should that bound also
+       count the size taken by the global_history lists? */
+    if (ACQUIRE_LOCK_IF(&undo_lock,
+                        wv <= gh_write_version_last
+                        && wv >= gh_write_version_first
+                        && leader_thread_num == _STM_TL2->thread_num)) {
+        /* record in the undo log a copy of the content of the object */
+        size_t size = stm_object_size_rounded_up(obj);
+        char *source = real_address((uintptr_t)obj);
+        char *undo = undo_log_current;
+        *((object_t **)undo) = obj;
+        memcpy(undo + sizeof(object_t *),
+               source + sizeof(object_t *),
+               size - sizeof(object_t *));
+        /*write_fence();*/
+        undo_log_current = undo + size;
+        release_lock(&undo_lock);
+    }
+}
+
+
+uintptr_t _stm_reserve_page(void)
+{
+    /* Grab a free page, initially shared between the threads. */
+
+    // XXX look in some free list first
+
+    /* Return the index'th object page, which is so far never used. */
+    uintptr_t index = __sync_fetch_and_add(&index_page_never_used, 1);
+    if (index >= NB_PAGES) {
+        fprintf(stderr, "Out of mmap'ed memory!\n");
+        abort();
+    }
+    return index;
+}
+
+#define TO_RANGE(range, start, stop)                                    \
+    ((range) = (object_t *)((start) | (((uintptr_t)(stop)) << 16)))
+
+#define FROM_RANGE(start, stop, range)          \
+    ((start) = (uint16_t)(uintptr_t)(range),    \
+     (stop) = ((uintptr_t)(range)) >> 16)
+
+localchar_t *_stm_alloc_next_page(size_t i)
+{
+    /* 'alloc->next' points to where the next allocation should go.  The
+       present function is called instead when this next allocation is
+       equal to 'alloc->stop'.  As we know that 'start', 'next' and
+       'stop' are always nearby pointers, we play tricks and only store
+       the lower 16 bits of 'start' and 'stop', so that the three
+       variables plus some flags fit in 16 bytes.
+
+       'flag_partial_page' is *cleared* to mean that the 'alloc'
+       describes a complete page, so that it need not be listed inside
+       'new_object_ranges'.  In all other cases it is *set*.
+    */
+    uintptr_t page;
+    localchar_t *result;
+    alloc_for_size_t *alloc = &_STM_TL2->alloc[i];
+    size_t size = i * 8;
+
+    if (alloc->flag_partial_page) {
+        /* record this range in 'new_object_ranges' */
+        localchar_t *ptr1 = alloc->next - size - 1;
+        object_t *range;
+        TO_RANGE(range, alloc->start, alloc->stop);
+        page = ((uintptr_t)ptr1) / 4096;
+        _STM_TL2->new_object_ranges = stm_list_append(
+            _STM_TL2->new_object_ranges, (object_t *)page);
+        _STM_TL2->new_object_ranges = stm_list_append(
+            _STM_TL2->new_object_ranges, range);
+    }
+
+    /* reserve a fresh new page */
+    page = _stm_reserve_page();
+
+    result = (localchar_t *)(page * 4096UL);
+    alloc->start = (uintptr_t)result;
+    alloc->stop = alloc->start + (4096 / size) * size;
+    alloc->next = result + size;
+    alloc->flag_partial_page = false;
+    return result;
+}
+
+object_t *stm_allocate(size_t size)
+{
+    assert(size % 8 == 0);
+    size_t i = size / 8;
+    assert(2 <= i && i < LARGE_OBJECT_WORDS);   //XXX
+    alloc_for_size_t *alloc = &_STM_TL2->alloc[i];
+
+    localchar_t *p = alloc->next;
+    alloc->next = p + size;
+    if ((uint16_t)(uintptr_t)p == alloc->stop)
+        p = _stm_alloc_next_page(i);
+
+    object_t *result = (object_t *)p;
+    result->write_version = _STM_TL1->transaction_write_version;
+    return result;
+}
+
+
+#define TOTAL_MEMORY          (NB_PAGES * 4096UL * (NB_THREADS + 1))
+#define READMARKER_END        ((NB_PAGES * 4096UL) >> 4)
+#define FIRST_OBJECT_PAGE     ((READMARKER_END + 4095) / 4096UL)
+#define READMARKER_START      ((FIRST_OBJECT_PAGE * 4096UL) >> 4)
+#define FIRST_READMARKER_PAGE (READMARKER_START / 4096UL)
+
+void stm_setup(void)
+{
+    /* Check that some values are acceptable */
+    assert(4096 <= ((uintptr_t)_STM_TL1));
+    assert(((uintptr_t)_STM_TL1) == ((uintptr_t)_STM_TL2));
+    assert(((uintptr_t)_STM_TL2) + sizeof(*_STM_TL2) <= 8192);
+    assert(2 <= FIRST_READMARKER_PAGE);
+    assert(FIRST_READMARKER_PAGE * 4096UL <= READMARKER_START);
+    assert(READMARKER_START < READMARKER_END);
+    assert(READMARKER_END <= 4096UL * FIRST_OBJECT_PAGE);
+    assert(FIRST_OBJECT_PAGE < NB_PAGES);
+
+    object_pages = mmap(NULL, TOTAL_MEMORY,
+                        PROT_READ | PROT_WRITE,
+                        MAP_PAGES_FLAGS, -1, 0);
+    if (object_pages == MAP_FAILED) {
+        perror("object_pages mmap");
+        abort();
+    }
+
+    long i;
+    for (i = 0; i < NB_THREADS; i++) {
+        char *thread_base = get_thread_base(i);
+
+        /* In each thread's section, the first page is where TLPREFIX'ed
+           NULL accesses land.  We mprotect it so that accesses fail.
+        */
+        mprotect(thread_base, 4096, PROT_NONE);
+
+        /* Fill the TLS page (page 1) with 0xDD */
+        memset(REAL_ADDRESS(thread_base, 4096), 0xDD, 4096);
+        /* Make a "hole" at _STM_TL1 / _STM_TL2 */
+        memset(REAL_ADDRESS(thread_base, _STM_TL2), 0, sizeof(*_STM_TL2));
+
+        _STM_TL2->thread_num = i;
+        _STM_TL2->thread_base = thread_base;
+
+        if (i > 0) {
+            int res;
+            res = remap_file_pages(thread_base + FIRST_OBJECT_PAGE * 4096UL,
+                                   (NB_PAGES - FIRST_OBJECT_PAGE) * 4096UL,
+                                   0, FIRST_OBJECT_PAGE, 0);
+            if (res != 0) {
+                perror("remap_file_pages");
+                abort();
+            }
+        }
+    }
+
+    undo_log_pages = get_thread_base(NB_THREADS);
+    mprotect(undo_log_pages, 4096, PROT_NONE);
+    mprotect(undo_log_pages + (NB_PAGES - 1) * 4096UL, 4096, PROT_NONE);
+    undo_log_pages += 4096;
+    undo_log_current = undo_log_pages;
+
+    num_threads_started = 0;
+    index_page_never_used = FIRST_OBJECT_PAGE;
+    next_write_version = 1;
+    leader_thread_num = 0;
+    global_history = NULL;
+    gh_write_version_first = 0xffff;
+    gh_write_version_last = 0;
+}
+
+#define INVALID_GS_VALUE  0xDDDDDDDDDDDDDDDDUL
+
+static void set_gs_register(uint64_t value)
+{
+    int result = syscall(SYS_arch_prctl, ARCH_SET_GS, value);
+    assert(result == 0);
+}
+
+void stm_setup_thread(void)
+{
+    int thread_num = __sync_fetch_and_add(&num_threads_started, 1);
+    assert(thread_num < 2);   /* only 2 threads for now */
+
+    char *thread_base = get_thread_base(thread_num);
+    set_gs_register((uintptr_t)thread_base);
+
+    assert(_STM_TL2->thread_num == thread_num);
+    assert(_STM_TL2->thread_base == thread_base);
+
+    _STM_TL2->modified_objects = stm_list_create();
+}
+
+void _stm_teardown_thread(void)
+{
+    stm_list_free(_STM_TL2->modified_objects);
+    _STM_TL2->modified_objects = NULL;
+
+    set_gs_register(INVALID_GS_VALUE);
+}
+
+void _stm_teardown(void)
+{
+    munmap(object_pages, TOTAL_MEMORY);
+    object_pages = NULL;
+    undo_log_pages = NULL;
+    undo_log_current = NULL;
+}
+
+
+static void reset_transaction_read_version(void)
+{
+    /* force-reset all read markers to 0 */
+    int res = madvise(real_address(FIRST_READMARKER_PAGE * 4096UL),
+                      (FIRST_OBJECT_PAGE - FIRST_READMARKER_PAGE) * 4096UL,
+                      MADV_DONTNEED);
+    if (res < 0) {
+        perror("madvise");
+        abort();
+    }
+    _STM_TL1->transaction_read_version = 0;
+}
+
+void stm_major_collection(void)
+{
+    abort();
+}
+
+void stm_start_transaction(jmp_buf *jmpbufptr)
+{
+    if (_STM_TL1->transaction_read_version == 0xff)
+        reset_transaction_read_version();
+    _STM_TL1->transaction_read_version++;
+    _STM_TL1->jmpbufptr = NULL;
+
+    while (1) {
+        int wv = __sync_fetch_and_add(&next_write_version, 1);
+        if (LIKELY(wv <= 0xffff)) {
+            _STM_TL1->transaction_write_version = wv;
+            break;
+        }
+        /* We ran out of 16-bit numbers before the next major
+           collection, which resets them.  XXX This case seems unlikely
+           for now, but check if it could become a bottleneck at some
+           point.
+        */
+        stm_major_collection();
+    }
+    assert(stm_list_is_empty(_STM_TL2->modified_objects));
+    assert(stm_list_is_empty(_STM_TL2->new_object_ranges));
+
+    maybe_update(CANNOT_CONFLICT);   /* no object read yet: cannot conflict */
+
+    _STM_TL1->jmpbufptr = jmpbufptr;
+}
+
+static void update_new_objects_in_other_threads(uintptr_t pagenum,
+                                                uint16_t start, uint16_t stop)
+{
+    size_t size = (uint16_t)(stop - start);
+    assert(size <= 4096 - (start & 4095));
+    assert((start & ~4095) == (uint16_t)(pagenum * 4096));
+
+    int thread_num = _STM_TL2->thread_num;
+    uintptr_t local_src = (pagenum * 4096UL) + (start & 4095);
+    char *dst = REAL_ADDRESS(get_thread_base(1 - thread_num), local_src);
+    char *src = REAL_ADDRESS(_STM_TL2->thread_base, local_src);
+
+    memcpy(dst, src, size);
+}
+
+void stm_stop_transaction(void)
+{
+    write_fence();   /* see later in this function for why */
+
+    acquire_lock(&undo_lock);
+
+    if (leader_thread_num != _STM_TL2->thread_num) {
+        /* non-leader thread */
+        if (global_history != NULL) {
+            update_to_current_version(CAN_CONFLICT);
+            assert(global_history == NULL);
+        }
+
+        /* steal leadership now */
+        leader_thread_num = _STM_TL2->thread_num;
+    }
+
+    /* now we are the leader thread.  the leader can always commit */
+    _STM_TL1->jmpbufptr = NULL;          /* cannot abort any more */
+    undo_log_current = undo_log_pages;   /* throw away the content */
+
+    /* add these objects to the global_history */
+    _STM_TL2->modified_objects->nextlist = global_history;
+    global_history = _STM_TL2->modified_objects;
+    _STM_TL2->modified_objects = stm_list_create();
+
+    uint16_t wv = _STM_TL1->transaction_write_version;
+    if (wv < gh_write_version_first) gh_write_version_first = wv;
+    if (wv > gh_write_version_last)  gh_write_version_last = wv;
+
+    /* walk the new_object_ranges and manually copy the new objects
+       to the other thread's pages in the (hopefully rare) case that
+       the page they belong to is already unshared */
+    long i;
+    struct stm_list_s *lst = _STM_TL2->new_object_ranges;
+    for (i = stm_list_count(lst); i > 0; ) {
+        i -= 2;
+        uintptr_t pagenum = (uintptr_t)stm_list_item(lst, i);
+
+        /* NB. the read in the next line should work even against a
+           parallel thread, thanks to the lock acquisition we do
+           earlier (see the beginning of this function).  Indeed, if
+           this read returns SHARED_PAGE, then we know that the real
+           value in memory was actually SHARED_PAGE at least at the
+           time of the acquire_lock().  It may have been modified
+           afterwards by a compare_and_swap() in the other thread, but
+           then we know for sure that the other thread is seeing the
+           last, up-to-date version of our data --- this is the reason
+           for the write_fence() just before the acquire_lock().
+        */
+        if (flag_page_private[pagenum] != SHARED_PAGE) {
+            object_t *range = stm_list_item(lst, i + 1);
+            uint16_t start, stop;
+            FROM_RANGE(start, stop, range);
+            update_new_objects_in_other_threads(pagenum, start, stop);
+        }
+    }
+
+    /* do the same for the partially-allocated pages */
+    long j;
+    for (j = 2; j < LARGE_OBJECT_WORDS; j++) {
+        alloc_for_size_t *alloc = &_STM_TL2->alloc[j];
+        uint16_t start = alloc->start;
+        uint16_t cur = (uintptr_t)alloc->next;
+
+        if (start == cur) {
+            /* nothing to do: this assigned page was left empty by the
+               previous transaction, and also starts empty in the new
+               transaction.  'flag_partial_page' is unchanged.
+            */
+        }
+        else {
+            uintptr_t pagenum = ((uintptr_t)(alloc->next - 1)) / 4096UL;
+            /* for the new transaction, it will start here: */
+            alloc->start = cur;
+
+            if (alloc->flag_partial_page) {
+                if (flag_page_private[pagenum] != SHARED_PAGE) {
+                    update_new_objects_in_other_threads(pagenum, start, cur);
+                }
+            }
+            else {
+                /* we can skip checking flag_page_private[] here because
+                   the whole page can only contain objects made by the
+                   just-finished transaction. */
+                assert(flag_page_private[pagenum] == SHARED_PAGE);
+
+                /* the next transaction will start with this page
+                   containing objects that are now committed, so
+                   we need to set this flag now */
+                alloc->flag_partial_page = true;
+            }
+        }
+    }
+
+    release_lock(&undo_lock);
+}
+
+void stm_abort_transaction(void)
+{
+    long j;
+    for (j = 2; j < LARGE_OBJECT_WORDS; j++) {
+        alloc_for_size_t *alloc = &_STM_TL2->alloc[j];
+        uint16_t num_allocated = ((uintptr_t)alloc->next) - alloc->start;
+        alloc->next -= num_allocated;
+    }
+    stm_list_clear(_STM_TL2->new_object_ranges);
+    stm_list_clear(_STM_TL2->modified_objects);
+    assert(_STM_TL1->jmpbufptr != NULL);
+    assert(_STM_TL1->jmpbufptr != (jmp_buf *)-1);   /* for tests only */
+    longjmp(*_STM_TL1->jmpbufptr, 1);
+}
diff --git a/c7/core.h b/c7/core.h
new file mode 100644
--- /dev/null
+++ b/c7/core.h
@@ -0,0 +1,58 @@
+#ifndef _STM_CORE_H
+#define _STM_CORE_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <setjmp.h>
+
+
+#define TLPREFIX __attribute__((address_space(256)))
+
+typedef TLPREFIX struct _thread_local1_s _thread_local1_t;
+typedef TLPREFIX struct object_s object_t;
+typedef TLPREFIX struct read_marker_s read_marker_t;
+
+
+struct object_s {
+    uint16_t write_version;
+    /*uint8_t stm_flags;*/
+};
+
+struct read_marker_s {
+    uint8_t rm;
+};
+
+struct _thread_local1_s {
+    jmp_buf *jmpbufptr;
+    uint8_t transaction_read_version;
+    uint16_t transaction_write_version;
+};
+#define _STM_TL1   ((_thread_local1_t *)4352)
+
+
+/* this should use llvm's coldcc calling convention,
+   but it's not exposed to C code so far */
+void _stm_write_slowpath(object_t *);
+
+#define LIKELY(x)   __builtin_expect(x, true)
+#define UNLIKELY(x) __builtin_expect(x, false)
+
+
+static inline void stm_read(object_t *obj)
+{
+    ((read_marker_t *)(((uintptr_t)obj) >> 4))->rm =
+        _STM_TL1->transaction_read_version;
+}
+
+static inline void stm_write(object_t *obj)
+{
+    if (UNLIKELY(obj->write_version != _STM_TL1->transaction_write_version))
+        _stm_write_slowpath(obj);
+}
+
+
+/* must be provided by the user of this library */
+extern size_t stm_object_size_rounded_up(object_t *);
+
+
+#endif
diff --git a/c7/list.c b/c7/list.c
new file mode 100644
--- /dev/null
+++ b/c7/list.c
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "list.h"
+
+
+#define SETSIZE(n)    (sizeof(struct stm_list_s) + ITEMSSIZE(n))
+#define ITEMSSIZE(n)  ((n) * sizeof(object_t*))
+#define OVERCNT(n)    (33 + ((((n) / 2) * 3) | 1))
+
+struct stm_list_s *stm_list_create(void)
+{
+    uintptr_t initial_allocation = 32;
+    struct stm_list_s *lst = malloc(SETSIZE(initial_allocation));
+    if (lst == NULL) {
+        perror("out of memory in stm_list_create");
+        abort();
+    }
+    lst->count = 0;
+    lst->last_allocated = initial_allocation - 1;
+    assert(lst->last_allocated & 1);
+    return lst;
+}
+
+struct stm_list_s *_stm_list_grow(struct stm_list_s *lst, uintptr_t nalloc)
+{
+    assert(lst->last_allocated & 1);
+    nalloc = OVERCNT(nalloc);
+    lst = realloc(lst, SETSIZE(nalloc));
+    if (lst == NULL) {
+        perror("out of memory in _stm_list_grow");
+        abort();
+    }
+    lst->last_allocated = nalloc - 1;
+    assert(lst->last_allocated & 1);
+    return lst;
+}
diff --git a/c7/list.h b/c7/list.h
new file mode 100644
--- /dev/null
+++ b/c7/list.h
@@ -0,0 +1,67 @@
+#ifndef _STM_LIST_H
+#define _STM_LIST_H
+
+#include "core.h"
+
+
+struct stm_list_s {
+    uintptr_t count;
+    union {
+        uintptr_t last_allocated;      /* always odd */
+        struct stm_list_s *nextlist;   /* always even */
+    };
+    object_t *items[];
+};
+
+struct stm_list_s *stm_list_create(void);
+
+static inline void stm_list_free(struct stm_list_s *lst)
+{
+    free(lst);
+}
+
+
+struct stm_list_s *_stm_list_grow(struct stm_list_s *, uintptr_t);
+
+static inline struct stm_list_s *
+stm_list_append(struct stm_list_s *lst, object_t *item)
+{
+    uintptr_t index = lst->count++;
+    if (UNLIKELY(index > lst->last_allocated))
+        lst = _stm_list_grow(lst, index);
+    lst->items[index] = item;
+    return lst;
+}
+
+static inline void stm_list_clear(struct stm_list_s *lst)
+{
+    lst->count = 0;
+}
+
+static inline bool stm_list_is_empty(struct stm_list_s *lst)
+{
+    return (lst->count == 0);
+}
+
+static inline uintptr_t stm_list_count(struct stm_list_s *lst)
+{
+    return lst->count;
+}
+
+static inline object_t *stm_list_item(struct stm_list_s *lst, uintptr_t index)
+{
+    return lst->items[index];
+}
+
+#define STM_LIST_FOREACH(lst, CODE)             \
+    do {                                        \
+        struct stm_list_s *_lst = (lst);        \
+        uintptr_t _i;                           \
+        for (_i = _lst->count; _i--; ) {        \
+            object_t *item = _lst->items[_i];   \
+            CODE;                               \
+        }                                       \
+    } while (0)
+
+
+#endif
diff --git a/c7/pagecopy.c b/c7/pagecopy.c
new file mode 100644
--- /dev/null
+++ b/c7/pagecopy.c
@@ -0,0 +1,57 @@
+
+void pagecopy(void *dest, const void *src)
+{
+    unsigned long i;
+    for (i = 0; i < 4096 / 128; i++) {
+        asm volatile("movdqa (%0), %%xmm0\n"
+                     "movdqa 16(%0), %%xmm1\n"
+                     "movdqa 32(%0), %%xmm2\n"
+                     "movdqa 48(%0), %%xmm3\n"
+                     "movdqa %%xmm0, (%1)\n"
+                     "movdqa %%xmm1, 16(%1)\n"
+                     "movdqa %%xmm2, 32(%1)\n"
+                     "movdqa %%xmm3, 48(%1)\n"
+                     "movdqa 64(%0), %%xmm0\n"
+                     "movdqa 80(%0), %%xmm1\n"
+                     "movdqa 96(%0), %%xmm2\n"
+                     "movdqa 112(%0), %%xmm3\n"
+                     "movdqa %%xmm0, 64(%1)\n"
+                     "movdqa %%xmm1, 80(%1)\n"
+                     "movdqa %%xmm2, 96(%1)\n"
+                     "movdqa %%xmm3, 112(%1)\n"
+                     :
+                     : "r"(src + 128*i), "r"(dest + 128*i)
+                     : "xmm0", "xmm1", "xmm2", "xmm3", "memory");
+    }
+}
+
+#if 0   /* XXX enable if detected on the cpu */
+void pagecopy_ymm8(void *dest, const void *src)
+{
+    asm volatile("0:\n"
+                 "vmovdqa (%0), %%ymm0\n"
+                 "vmovdqa 32(%0), %%ymm1\n"
+                 "vmovdqa 64(%0), %%ymm2\n"
+                 "vmovdqa 96(%0), %%ymm3\n"
+                 "vmovdqa 128(%0), %%ymm4\n"
+                 "vmovdqa 160(%0), %%ymm5\n"
+                 "vmovdqa 192(%0), %%ymm6\n"
+                 "vmovdqa 224(%0), %%ymm7\n"
+                 "addq $256, %0\n"
+                 "vmovdqa %%ymm0, (%1)\n"
+                 "vmovdqa %%ymm1, 32(%1)\n"
+                 "vmovdqa %%ymm2, 64(%1)\n"
+                 "vmovdqa %%ymm3, 96(%1)\n"
+                 "vmovdqa %%ymm4, 128(%1)\n"
+                 "vmovdqa %%ymm5, 160(%1)\n"
+                 "vmovdqa %%ymm6, 192(%1)\n"
+                 "vmovdqa %%ymm7, 224(%1)\n"
+                 "addq $256, %1\n"
+                 "cmpq %2, %0\n"
+                 "jne 0b"
+                 : "=r"(src), "=r"(dest)
+                 : "r"((char *)src + 4096), "0"(src), "1"(dest)
+                 : "xmm0", "xmm1", "xmm2", "xmm3",
+                   "xmm4", "xmm5", "xmm6", "xmm7");
+}
+#endif
diff --git a/c7/pagecopy.h b/c7/pagecopy.h
new file mode 100644
--- /dev/null
+++ b/c7/pagecopy.h
@@ -0,0 +1,2 @@
+
+void pagecopy(void *dest, const void *src);
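
To make the intended call pattern concrete, here is a minimal usage sketch; it is not part of the changeset. The transaction entry points are defined in core.c but not yet declared in core.h, so the sketch declares them itself; my_thread() and the fixed 16-byte object size are hypothetical stand-ins, while stm_object_size_rounded_up() is the callback that core.h requires the user of the library to provide. It assumes stm_setup() has already been called once from the main thread, and a compiler (e.g. clang) that supports __attribute__((address_space(256))):

    #include <stddef.h>
    #include <setjmp.h>
    #include "core.h"

    /* entry points defined in core.c; core.h does not export them yet */
    void stm_setup(void);
    void stm_setup_thread(void);
    void stm_start_transaction(jmp_buf *jmpbufptr);
    void stm_stop_transaction(void);
    object_t *stm_allocate(size_t size);

    /* the callback required by core.h; every object is 16 bytes in this
       toy example (sizes must be multiples of 8, as stm_allocate()
       asserts) */
    size_t stm_object_size_rounded_up(object_t *obj)
    {
        return 16;
    }

    /* body of one of the (at most two) threads */
    void my_thread(void)
    {
        stm_setup_thread();            /* points %gs at this thread's segment */

        jmp_buf here;
        setjmp(here);                  /* an abort longjmp()s back here... */
        stm_start_transaction(&here);  /* ...and the transaction restarts */

        object_t *obj = stm_allocate(16);
        stm_write(obj);                /* mark as written before mutating */
        /* ... mutate obj's fields through its TLPREFIX pointer ... */

        stm_stop_transaction();        /* commit, stealing leadership if
                                          needed; may longjmp() to 'here'
                                          instead on a read-write conflict */
    }

Note that the retry loop is implicit: stm_abort_transaction() longjmp()s to the jmp_buf registered with stm_start_transaction(), so control falls through the setjmp() again and the transaction body re-executes from the top.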