Hi,

I didn't want to bring it up on the teleconference, but I would like to commit Gleb's NUMA awareness patch before you branch the trunk. Since I got no objection or response about it, I assume it is OK.
Best Regards,
Lenny

---------- Forwarded message ----------
From: Lenny Verkhovsky <lenny.verkhov...@gmail.com>
Date: Tue, Jun 3, 2008 at 2:38 PM
Subject: [OMPI devel] SM BTL NUMA awareness patches
To: Open MPI Developers <de...@open-mpi.org>

Hi all,

If there are no comments on this patch, I will commit it.

Lenny

---------- Forwarded message ----------
From: Gleb Natapov <gl...@voltaire.com>
Date: 2008/5/28
Subject: [OMPI devel] SM BTL NUMA awareness patches
To: de...@open-mpi.org

Hi,

The two attached patches implement NUMA awareness in the SM BTL.

The first patch adds two new functions to the maffinity framework that the second patch requires:

  opal_maffinity_base_node_name_to_id() - takes a string that names a
      memory node and translates it to a memory node id.

  opal_maffinity_base_bind() - binds an address range to a specific
      memory node.

Not every maffinity component can implement the bind() function (there is no way the first_use component can provide such functionality, for instance); in that case the function pointer can be set to NULL.

The second patch adds NUMA awareness to the SM BTL and the SM MPOOL. Each process determines which CPU it is running on and exchanges this information with the other local processes. Each process then creates a separate MPOOL for every available memory node and uses them to allocate memory on a specific memory node when needed. For instance, circular buffer memory is always allocated on the memory node local to the receiving process.

To use this on a Linux machine, three things are needed: a carto file with a HW topology description must be provided, each process must be bound to a specific CPU (by specifying a rank file, for instance), and the session directory must be created on a tmpfs file system (otherwise Linux ignores memory binding commands) by setting the orte_tmpdir_base parameter to point to a tmpfs mount point.

Questions and suggestions are always welcome.

--
			Gleb.
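[Editor's note: to make the new maffinity interface concrete, here is a minimal sketch of a caller inside the OMPI tree. It is not part of the patches; the "mem0" node name and the bind_to_mem0() helper are made up for illustration, and the real consumers are in the SM BTL patch below.]

  #include "opal/constants.h"
  #include "opal/mca/maffinity/maffinity_types.h"
  #include "opal/mca/maffinity/base/base.h"

  static int bind_to_mem0(void *addr, size_t len)
  {
      opal_maffinity_base_segment_t seg;
      int node_id, rc;

      /* translate the carto memory node name to a numeric node id */
      rc = opal_maffinity_base_node_name_to_id("mem0", &node_id);
      if (OPAL_SUCCESS != rc) {
          /* e.g. OPAL_ERR_NOT_IMPLEMENTED if the selected component
           * (such as first_use) left the function pointer NULL */
          return rc;
      }

      /* bind a single address range to that memory node */
      seg.mbs_start_addr = addr;
      seg.mbs_len = len;
      return opal_maffinity_base_bind(&seg, 1, node_id);
  }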
commit 883db5e1ce8c3b49cc1376e6acf9c2d5d0d77983
Author: Gleb Natapov <g...@voltaire.com>
Date:   Tue May 27 14:55:11 2008 +0300

    Add functions to maffinity.

diff --git a/opal/mca/maffinity/base/base.h b/opal/mca/maffinity/base/base.h
index c44efed..339e6a1 100644
--- a/opal/mca/maffinity/base/base.h
+++ b/opal/mca/maffinity/base/base.h
@@ -105,6 +105,9 @@ OPAL_DECLSPEC int opal_maffinity_base_select(void);
  */
 OPAL_DECLSPEC int opal_maffinity_base_set(opal_maffinity_base_segment_t *segments,
                                           size_t num_segments);
+OPAL_DECLSPEC int opal_maffinity_base_node_name_to_id(char *, int *);
+OPAL_DECLSPEC int opal_maffinity_base_bind(opal_maffinity_base_segment_t *, size_t, int);
+
 /**
  * Shut down the maffinity MCA framework.
  *
diff --git a/opal/mca/maffinity/base/maffinity_base_wrappers.c b/opal/mca/maffinity/base/maffinity_base_wrappers.c
index ec843eb..eef5c7d 100644
--- a/opal/mca/maffinity/base/maffinity_base_wrappers.c
+++ b/opal/mca/maffinity/base/maffinity_base_wrappers.c
@@ -31,3 +31,33 @@ int opal_maffinity_base_set(opal_maffinity_base_segment_t *segments,
     }
     return opal_maffinity_base_module->maff_module_set(segments, num_segments);
 }
+
+int opal_maffinity_base_node_name_to_id(char *node_name, int *node_id)
+{
+    if (!opal_maffinity_base_selected) {
+        return OPAL_ERR_NOT_FOUND;
+    }
+
+    if (!opal_maffinity_base_module->maff_module_name_to_id) {
+        *node_id = 0;
+        return OPAL_ERR_NOT_IMPLEMENTED;
+    }
+
+    return opal_maffinity_base_module->maff_module_name_to_id(node_name,
+                                                              node_id);
+}
+
+int opal_maffinity_base_bind(opal_maffinity_base_segment_t *segments,
+                             size_t num_segments, int node_id)
+{
+    if (!opal_maffinity_base_selected) {
+        return OPAL_ERR_NOT_FOUND;
+    }
+
+    if (!opal_maffinity_base_module->maff_module_bind) {
+        return OPAL_ERR_NOT_IMPLEMENTED;
+    }
+
+    return opal_maffinity_base_module->maff_module_bind(segments, num_segments,
+                                                        node_id);
+}
diff --git a/opal/mca/maffinity/first_use/maffinity_first_use_module.c b/opal/mca/maffinity/first_use/maffinity_first_use_module.c
index a68c2a9..0ae33e1 100644
--- a/opal/mca/maffinity/first_use/maffinity_first_use_module.c
+++ b/opal/mca/maffinity/first_use/maffinity_first_use_module.c
@@ -41,7 +41,9 @@ static const opal_maffinity_base_module_1_0_0_t loc_module = {
     first_use_module_init,

     /* Module function pointers */
-    first_use_module_set
+    first_use_module_set,
+    NULL,
+    NULL
 };

 int opal_maffinity_first_use_component_query(mca_base_module_t **module, int *priority)
diff --git a/opal/mca/maffinity/libnuma/maffinity_libnuma_module.c b/opal/mca/maffinity/libnuma/maffinity_libnuma_module.c
index 1fc2231..b2b109c 100644
--- a/opal/mca/maffinity/libnuma/maffinity_libnuma_module.c
+++ b/opal/mca/maffinity/libnuma/maffinity_libnuma_module.c
@@ -20,6 +20,7 @@
 #include <string.h>
 #include <numa.h>
+#include <numaif.h>

 #include "opal/constants.h"
 #include "opal/mca/maffinity/maffinity.h"
@@ -33,6 +34,8 @@ static int libnuma_module_init(void);
 static int libnuma_module_set(opal_maffinity_base_segment_t *segments,
                               size_t num_segments);
+static int libnuma_module_node_name_to_id(char *, int *);
+static int libnuma_modules_bind(opal_maffinity_base_segment_t *, size_t, int);

 /*
  * Libnuma maffinity module
@@ -42,7 +45,9 @@ static const opal_maffinity_base_module_1_0_0_t loc_module = {
     libnuma_module_init,

     /* Module function pointers */
-    libnuma_module_set
+    libnuma_module_set,
+    libnuma_module_node_name_to_id,
+    libnuma_modules_bind
 };

 int opal_maffinity_libnuma_component_query(mca_base_module_t **module, int *priority)
@@ -92,3 +97,28 @@ static int libnuma_module_set(opal_maffinity_base_segment_t *segments,
     return OPAL_SUCCESS;
 }
+
+static int libnuma_module_node_name_to_id(char *node_name, int *id)
+{
+    /* GLB: fix me */
+    *id = atoi(node_name + 3);
+
+    return OPAL_SUCCESS;
+}
+
+static int libnuma_modules_bind(opal_maffinity_base_segment_t *segs,
+                                size_t count, int node_id)
+{
+    size_t i;
+    int rc;
+    unsigned long node_mask = (1 << node_id);
+
+    for(i = 0; i < count; i++) {
+        rc = mbind(segs[i].mbs_start_addr, segs[i].mbs_len, MPOL_PREFERRED,
+                   &node_mask, sizeof(node_mask) * 8, MPOL_MF_MOVE);
+        if(rc != 0)
+            return OPAL_ERROR;
+    }
+
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/maffinity/maffinity.h b/opal/mca/maffinity/maffinity.h
index 0fdcee3..5a1a5b2 100644
--- a/opal/mca/maffinity/maffinity.h
+++ b/opal/mca/maffinity/maffinity.h
@@ -82,6 +82,17 @@ typedef int (*opal_maffinity_base_module_init_1_0_0_fn_t)(void);
 typedef int (*opal_maffinity_base_module_set_fn_t)
     (opal_maffinity_base_segment_t *segments, size_t num_segments);

+/**
+ * translate memory node name (such as "mem0") to memory node id
+ */
+typedef int (*opal_maffinity_base_module_node_name_to_id_fn_t)
+    (char *node_name, int *node_id);
+
+/**
+ * bind memory to node
+ */
+typedef int (*opal_maffinity_base_module_bind_fn_t)
+    (opal_maffinity_base_segment_t *segments, size_t num_segments, int node_id);

 /**
  * Structure for maffinity v1.0.0 components.
@@ -108,6 +119,8 @@ struct opal_maffinity_base_module_1_0_0_t {

     /** Set memory affinity */
     opal_maffinity_base_module_set_fn_t maff_module_set;
+    opal_maffinity_base_module_node_name_to_id_fn_t maff_module_name_to_id;
+    opal_maffinity_base_module_bind_fn_t maff_module_bind;
 };

 /**
  * Convenience typedef
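[Editor's note: the bind() path above reduces to one mbind() system call per segment. Here is a standalone sketch of that same call outside the OMPI tree; it is an illustration, not Gleb's code. It assumes Linux with the numactl headers installed and linking with -lnuma, and it uses posix_memalign() because mbind() requires a page-aligned address.]

  #include <numaif.h>   /* mbind(), MPOL_PREFERRED, MPOL_MF_MOVE */
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>   /* getpagesize() */

  int main(void)
  {
      size_t len = 1 << 20;                /* bind a 1MB range */
      unsigned long node_mask = 1UL << 0;  /* memory node 0 */
      void *addr = NULL;

      if (posix_memalign(&addr, (size_t)getpagesize(), len) != 0) {
          return 1;
      }

      /* MPOL_PREFERRED: allocate on the given node but fall back to other
       * nodes instead of failing; MPOL_MF_MOVE: migrate pages that were
       * already touched elsewhere. */
      if (mbind(addr, len, MPOL_PREFERRED, &node_mask,
                sizeof(node_mask) * 8, MPOL_MF_MOVE) != 0) {
          perror("mbind");
          free(addr);
          return 1;
      }

      free(addr);
      return 0;
  }

One small observation while writing this: the patch's "unsigned long node_mask = (1 << node_id)" caps the usable memory node ids at the bit width of one long, which seems fine for current machines.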
commit 9d2f87a8264dd5167b3a3955b8fba2f2b16822e2
Author: Gleb Natapov <g...@voltaire.com>
Date:   Tue May 27 17:37:32 2008 +0300

    Add NUMA support to SM BTL.

diff --git a/ompi/class/ompi_circular_buffer_fifo.h b/ompi/class/ompi_circular_buffer_fifo.h
index 21a10f6..c360ff7 100644
--- a/ompi/class/ompi_circular_buffer_fifo.h
+++ b/ompi/class/ompi_circular_buffer_fifo.h
@@ -18,6 +18,7 @@
 #ifndef _OMPI_CIRCULAR_BUFFER_FIFO
 #define _OMPI_CIRCULAR_BUFFER_FIFO

+#include <unistd.h> /* for getpagesize() */
 #include "ompi/constants.h"
 #include "opal/sys/cache.h"
@@ -125,10 +126,10 @@ static inline int ompi_cb_fifo_size(ompi_cb_fifo_t *fifo) {
  *
  */
 static inline int ompi_cb_fifo_init(int size_of_fifo,
-        int lazy_free_freq, int fifo_memory_locality_index,
-        int head_memory_locality_index, int tail_memory_locality_index,
-        ompi_cb_fifo_t *fifo, ptrdiff_t offset,
-        mca_mpool_base_module_t *memory_allocator)
+        int lazy_free_freq,
+        mca_mpool_base_module_t *head_mpool,
+        mca_mpool_base_module_t *tail_mpool,
+        ompi_cb_fifo_t *fifo, ptrdiff_t offset)
 {
     int i, size;
     char *buf;
@@ -154,37 +155,39 @@ static inline int ompi_cb_fifo_init(int size_of_fifo,
     fifo->mask = (size - 1);

     /* allocate fifo array */
-    buf = (char *) memory_allocator->mpool_alloc(memory_allocator,
-            sizeof(void *) * size + 2*CACHE_LINE_SIZE, CACHE_LINE_SIZE, 0,
-            NULL);
+    buf = (char *) tail_mpool->mpool_alloc(tail_mpool,
+            sizeof(void *) * size + CACHE_LINE_SIZE, getpagesize(), 0, NULL);
     if (NULL == buf) {
         return OMPI_ERR_OUT_OF_RESOURCE;
     }

-    fifo->queue = (volatile void**)(buf + 2*CACHE_LINE_SIZE);
+    fifo->queue = (volatile void**)(buf + CACHE_LINE_SIZE);
     /* buffer address in a receiver address space */
     fifo->recv_queue = (volatile void**)((char*)fifo->queue - offset);

     /* initialize the queue entries */
     for (i = 0; i < size; i++) {
         fifo->queue[i] = OMPI_CB_FREE;
     }
+
+    fifo->tail = (ompi_cb_fifo_ctl_t*)buf;
+
+    /* initialize the tail structure */
+    opal_atomic_unlock(&(fifo->tail->lock));
+    fifo->tail->fifo_index=0;
+    fifo->tail->num_to_clear=0;
+
+    /* recalculate tail address in a receiver address space */
+    fifo->tail = (ompi_cb_fifo_ctl_t*)((char*)fifo->tail - offset);
+
+    fifo->head = (ompi_cb_fifo_ctl_t*)head_mpool->mpool_alloc(head_mpool,
+            sizeof(ompi_cb_fifo_ctl_t), getpagesize(), 0, NULL);

-    fifo->head = (ompi_cb_fifo_ctl_t*)buf;
     /* head address in a receiver address space */
     fifo->recv_head = (ompi_cb_fifo_ctl_t*)((char*)fifo->head - offset);
-    fifo->tail = (ompi_cb_fifo_ctl_t*)(buf + CACHE_LINE_SIZE);

     /* initialize the head structure */
     opal_atomic_unlock(&(fifo->head->lock));
     fifo->head->fifo_index=0;
     fifo->head->num_to_clear=0;

-    /* initialize the head structure */
-    opal_atomic_unlock(&(fifo->tail->lock));
-    fifo->tail->fifo_index=0;
-    fifo->tail->num_to_clear=0;
-
-    /* recalculate tail address in a receiver address space */
-    fifo->tail = (ompi_cb_fifo_ctl_t*)((char*)fifo->tail - offset);

     /* return */
     return OMPI_SUCCESS;
diff --git a/ompi/class/ompi_fifo.h b/ompi/class/ompi_fifo.h
index f5feec2..0f1c4ee 100644
--- a/ompi/class/ompi_fifo.h
+++ b/ompi/class/ompi_fifo.h
@@ -211,13 +211,13 @@ struct ompi_fifo_t {
     int cb_count;

     /* fifo memory locality index */
-    int fifo_memory_locality_index;
+    mca_mpool_base_module_t *fifo_mpool;

     /* head memory locality index */
-    int head_memory_locality_index;
+    mca_mpool_base_module_t *head_mpool;

     /* tail memory locality index */
-    int tail_memory_locality_index;
+    mca_mpool_base_module_t *tail_mpool;

     /* offset between sender and receiver shared mapping */
     ptrdiff_t offset;
@@ -259,10 +259,11 @@ typedef struct ompi_fifo_t ompi_fifo_t;
  *
  */
 static inline int ompi_fifo_init(int size_of_cb_fifo,
-        int lazy_free_freq, int cb_num_limit, int fifo_memory_locality_index,
-        int head_memory_locality_index, int tail_memory_locality_index,
-        ompi_fifo_t *fifo, ptrdiff_t offset,
-        mca_mpool_base_module_t *memory_allocator)
+        int lazy_free_freq, int cb_num_limit,
+        mca_mpool_base_module_t *fifo_mpool,
+        mca_mpool_base_module_t *head_mpool,
+        mca_mpool_base_module_t *tail_mpool,
+        ompi_fifo_t *fifo, ptrdiff_t offset)
 {
     int error_code;

@@ -270,24 +271,23 @@ static inline int ompi_fifo_init(int size_of_cb_fifo,
     fifo->size = size_of_cb_fifo;
     /*we allocate one cb below so subtract one here */
     fifo->cb_count = cb_num_limit - 1;
-    fifo->fifo_memory_locality_index = fifo_memory_locality_index;
-    fifo->head_memory_locality_index = head_memory_locality_index;
-    fifo->tail_memory_locality_index = tail_memory_locality_index;
+    fifo->fifo_mpool = fifo_mpool;
+    fifo->head_mpool = head_mpool;
+    fifo->tail_mpool = tail_mpool;

     /* allocate head ompi_cb_fifo_t structure and place for head and tail locks
      * on different cache lines */
-    fifo->head = (ompi_cb_fifo_wrapper_t*)memory_allocator->mpool_alloc(
-            memory_allocator, sizeof(ompi_cb_fifo_wrapper_t), CACHE_LINE_SIZE,
+    fifo->head = (ompi_cb_fifo_wrapper_t*)fifo_mpool->mpool_alloc(
+            fifo_mpool, sizeof(ompi_cb_fifo_wrapper_t), getpagesize(),
             0, NULL);
     if(NULL == fifo->head) {
         return OMPI_ERR_OUT_OF_RESOURCE;
     }

     /* initialize the circular buffer fifo head structure */
-    error_code=ompi_cb_fifo_init(size_of_cb_fifo,
-            lazy_free_freq, fifo_memory_locality_index,
-            head_memory_locality_index, tail_memory_locality_index,
-            &(fifo->head->cb_fifo), offset, memory_allocator);
+    error_code = ompi_cb_fifo_init(size_of_cb_fifo,
+            lazy_free_freq, head_mpool, tail_mpool, &(fifo->head->cb_fifo),
+            offset);
     if ( OMPI_SUCCESS != error_code ) {
         return error_code;
     }
@@ -314,8 +314,7 @@ static inline int ompi_fifo_init(int size_of_cb_fifo,
  * @returncode Slot index to which data is written
  *
  */
-static inline int ompi_fifo_write_to_head(void *data,
-        ompi_fifo_t *fifo, mca_mpool_base_module_t *fifo_allocator)
+static inline int ompi_fifo_write_to_head(void *data, ompi_fifo_t *fifo)
 {
     int error_code;
     ompi_cb_fifo_wrapper_t *next_ff;
@@ -343,7 +342,7 @@ static inline int ompi_fifo_write_to_head(void *data,
     /* We retry to write to the old head before creating new one just in
      * case consumer read all entries after first attempt failed, but
      * before we set cb_overflow to true */
-    error_code=ompi_cb_fifo_write_to_head(data, &fifo->head->cb_fifo);
+    error_code = ompi_cb_fifo_write_to_head(data, &fifo->head->cb_fifo);

     if(error_code != OMPI_CB_ERROR) {
         fifo->head->cb_overflow = false;
@@ -361,9 +360,10 @@ static inline int ompi_fifo_write_to_head(void *data,
     if(0 == fifo->cb_count)
         next_ff = NULL;
     else
-        next_ff = (ompi_cb_fifo_wrapper_t*)fifo_allocator->mpool_alloc(
-                fifo_allocator, sizeof(ompi_cb_fifo_wrapper_t),
-                CACHE_LINE_SIZE, 0, NULL);
+        next_ff = (ompi_cb_fifo_wrapper_t*)
+            fifo->fifo_mpool->mpool_alloc(fifo->fifo_mpool,
+                    sizeof(ompi_cb_fifo_wrapper_t), getpagesize(), 0,
+                    NULL);
     if (NULL == next_ff) {
         opal_atomic_unlock(&fifo->fifo_lock);
         return OMPI_ERR_OUT_OF_RESOURCE;
@@ -372,12 +372,10 @@ static inline int ompi_fifo_write_to_head(void *data,
     /* initialize the circular buffer fifo head structure */
     error_code = ompi_cb_fifo_init(fifo->size,
             fifo->head->cb_fifo.lazy_free_frequency,
-            fifo->fifo_memory_locality_index,
-            fifo->head_memory_locality_index,
-            fifo->tail_memory_locality_index,
-            &(next_ff->cb_fifo), fifo->offset, fifo_allocator);
+            fifo->head_mpool, fifo->tail_mpool,
+            &(next_ff->cb_fifo), fifo->offset);
     if (OMPI_SUCCESS != error_code) {
-        fifo_allocator->mpool_free(fifo_allocator, next_ff, NULL);
+        fifo->fifo_mpool->mpool_free(fifo->fifo_mpool, next_ff, NULL);
         opal_atomic_unlock(&fifo->fifo_lock);
         return error_code;
     }
diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c
index d158595..6fc19e6 100644
--- a/ompi/mca/btl/sm/btl_sm.c
+++ b/ompi/mca/btl/sm/btl_sm.c
@@ -31,6 +31,10 @@
 #include "opal/sys/atomic.h"
 #include "orte/util/output.h"
 #include "opal/util/if.h"
+#include "opal/mca/carto/carto.h"
+#include "opal/mca/carto/base/base.h"
+#include "opal/mca/paffinity/base/base.h"
+#include "opal/mca/maffinity/base/base.h"
 #include "orte/util/proc_info.h"
 #include "opal/util/printf.h"
 #include "ompi/class/ompi_fifo.h"
@@ -39,6 +43,7 @@
 #include "ompi/mca/btl/btl.h"
 #include "ompi/mca/mpool/base/base.h"
 #include "ompi/mca/common/sm/common_sm_mmap.h"
+#include "ompi/mca/mpool/sm/mpool_sm.h"
 #include "btl_sm.h"
 #include "btl_sm_endpoint.h"
 #include "btl_sm_frag.h"
@@ -92,9 +97,10 @@ static void *mpool_calloc(size_t nmemb, size_t size)
 {
     void *buf;
     size_t bsize = nmemb * size;
+    mca_mpool_base_module_t *mpool = mca_btl_sm_component.sm_mpool;
+
+    buf = mpool->mpool_alloc(mpool, bsize, CACHE_LINE_SIZE, 0, NULL);

-    buf = mca_btl_sm_component.sm_mpool->mpool_alloc(
-            mca_btl_sm_component.sm_mpool, bsize, CACHE_LINE_SIZE, 0, NULL);
     if (NULL == buf)
         return NULL;

@@ -127,27 +133,93 @@ static int init_fifos(ompi_fifo_t *f, int n)
     return OMPI_SUCCESS;
 }

+static void init_maffinity(int *my_mem_node, int *max_mem_node)
+{
+    static opal_carto_graph_t *topo;
+    opal_value_array_t dists;
+    int i, num_core, max_core, socket;
+    opal_paffinity_base_cpu_set_t cpus;
+    char *myslot = NULL;
+    opal_carto_node_distance_t *dist;
+    opal_carto_base_node_t *slot_node;
+
+    *my_mem_node = 0;
+    *max_mem_node = 1;
+
+    if(opal_carto_base_get_host_graph(&topo, "Memory") != OMPI_SUCCESS)
+        return;
+
+    OBJ_CONSTRUCT(&dists, opal_value_array_t);
+    opal_value_array_init(&dists, sizeof(opal_carto_node_distance_t));
+
+    if(opal_paffinity_base_get_processor_info(&num_core, &max_core) !=
+            OMPI_SUCCESS)
+        max_core = 100;
+
+    OPAL_PAFFINITY_CPU_ZERO(cpus);
+    opal_paffinity_base_get(&cpus);
+
+    /* find core we are running on */
+    for(i = 0; i < max_core; i++)
+        if(OPAL_PAFFINITY_CPU_ISSET(i, cpus))
+            break;
+
+    opal_paffinity_base_map_to_socket_core(i, &socket, &i);
+    asprintf(&myslot, "slot%d", socket);
+
+    slot_node = opal_carto_base_find_node(topo, myslot);
+
+    if(NULL == slot_node)
+        goto out;
+
+    opal_carto_base_get_nodes_distance(topo, slot_node, "Memory", &dists);
+    if((*max_mem_node = opal_value_array_get_size(&dists)) < 2)
+        goto out;
+
+    dist = opal_value_array_get_item(&dists, 0);
+    opal_maffinity_base_node_name_to_id(dist->node->node_name, my_mem_node);
+
+out:
+    if(myslot) free(myslot);
+    OBJ_DESTRUCT(&dists);
+    opal_carto_base_free_graph(topo);
+}
+
 static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
 {
     size_t size, length, length_payload;
     char *sm_ctl_file;
     ompi_fifo_t *my_fifos;
+    int my_mem_node, num_mem_nodes, i;
+
+    init_maffinity(&my_mem_node, &num_mem_nodes);
+
+    mca_btl_sm_component.mem_node = my_mem_node;
+    mca_btl_sm_component.num_mem_nodes = num_mem_nodes;

     /* lookup shared memory pool */
-    mca_btl_sm_component.sm_mpool =
-        mca_mpool_base_module_lookup(mca_btl_sm_component.sm_mpool_name);
-    if(NULL == mca_btl_sm_component.sm_mpool) {
-        mca_btl_sm_component.sm_mpool =
+    mca_btl_sm_component.sm_mpools = calloc(num_mem_nodes,
+            sizeof(mca_mpool_base_module_t*));
+
+    /* create mpool for each memory node */
+    for(i = 0; i < num_mem_nodes; i++) {
+        mca_mpool_base_resources_t res;
+        /* disable memory binding if there is only one memory node */
+        res.mem_node = (num_mem_nodes == 1) ? -1 : i;
+        mca_btl_sm_component.sm_mpools[i] =
             mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name,
-                    sm_btl, NULL);
+                    sm_btl, &res);
+        /* Sanity check to ensure that we found it */
+        if(NULL == mca_btl_sm_component.sm_mpools[i])
+            return OMPI_ERR_OUT_OF_RESOURCE;
+
+        if(i == my_mem_node)
+            mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[i];
     }
-    /* Sanity check to ensure that we found it */
-    if(NULL == mca_btl_sm_component.sm_mpool)
-        return OMPI_ERR_OUT_OF_RESOURCE;

     mca_btl_sm_component.sm_mpool_base =
-        mca_btl_sm_component.sm_mpool->mpool_base(mca_btl_sm_component.sm_mpool);
+        mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]);

     /* set the shared memory offset */
     mca_btl_sm_component.sm_offset = (ptrdiff_t*)calloc(n, sizeof(ptrdiff_t));
@@ -172,7 +244,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
     /* Pass in a data segment alignment of 0 to get no data segment (only
        the shared control structure) */
     size = sizeof(mca_common_sm_file_header_t) +
-        n * (sizeof(ompi_fifo_t*) + sizeof(char *)) + CACHE_LINE_SIZE;
+        n * (sizeof(ompi_fifo_t*) + sizeof(char *) + sizeof(uint16_t)) + CACHE_LINE_SIZE;
     if(!(mca_btl_sm_component.mmap_file = mca_common_sm_mmap_init(size,
                     sm_ctl_file, sizeof(mca_common_sm_file_header_t),
@@ -201,6 +273,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
     mca_btl_sm_component.shm_fifo = (ompi_fifo_t **)mca_btl_sm_component.mmap_file->data_addr;
     mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n);
+    mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n);

     /* Sync with other local procs. (Do we have to?) */
     if(0 == mca_btl_sm_component.my_smp_rank) {
@@ -219,6 +292,8 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
     /* set the base of the shared memory segment */
     mca_btl_sm_component.shm_bases[mca_btl_sm_component.my_smp_rank] =
         (char*)mca_btl_sm_component.sm_mpool_base;
+    mca_btl_sm_component.shm_mem_nodes[mca_btl_sm_component.my_smp_rank] =
+        (uint16_t)my_mem_node;

     /*
      * initialize the array of fifo's "owned" by this process
@@ -246,6 +321,10 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n)
     mca_btl_sm_component.fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

+    mca_btl_sm_component.mem_nodes = malloc(sizeof(uint16_t) * n);
+    if(NULL == mca_btl_sm_component.mem_nodes)
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
     /* initialize fragment descriptor free lists */

     /* allocation will be for the fragment descriptor and payload buffer */
@@ -425,6 +504,7 @@ int mca_btl_sm_add_procs(
     for(j = mca_btl_sm_component.num_smp_procs;
             j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) {
         ptrdiff_t diff;
+        int peer_mem_node;

         if(j == my_smp_rank)
             continue;
@@ -450,14 +530,21 @@ int mca_btl_sm_add_procs(
                 (opal_atomic_lock_t*)OFFSET2ADDR(diff,
                     mca_btl_sm_component.fifo[j][my_smp_rank].head_lock);
         }
+        /* cache local copy of peer memory node number */
+        peer_mem_node = mca_btl_sm_component.mem_nodes[j] = mca_btl_sm_component.shm_mem_nodes[j];

         /* Initialize fifo for use. Note that sender does initialization */
         return_code = ompi_fifo_init(mca_btl_sm_component.size_of_cb_queue,
                 mca_btl_sm_component.cb_lazy_free_freq,
                 mca_btl_sm_component.cb_max_num,
-                0,0,0,
+                /* fifo mpool */
+                mca_btl_sm_component.sm_mpools[peer_mem_node],
+                /* head mpool */
+                mca_btl_sm_component.sm_mpool,
+                /* tail mpool */
+                mca_btl_sm_component.sm_mpools[peer_mem_node],
                 &mca_btl_sm_component.fifo[j][my_smp_rank],
-                mca_btl_sm_component.sm_offset[j],
-                mca_btl_sm_component.sm_mpool);
+                mca_btl_sm_component.sm_offset[j]);

         if(return_code != OMPI_SUCCESS)
             goto CLEANUP;
diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h
index a338ff6..c418124 100644
--- a/ompi/mca/btl/sm/btl_sm.h
+++ b/ompi/mca/btl/sm/btl_sm.h
@@ -61,6 +61,10 @@ extern "C" {
 #define DONE    (char)1
 #endif

+typedef struct mca_btl_sm_mem_node_t {
+    mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */
+} mca_btl_sm_mem_node_t;
+
 /**
  * Shared Memory (SM) BTL module.
  */
@@ -72,7 +76,8 @@ struct mca_btl_sm_component_t {
     int32_t sm_max_procs; /**< upper limit on the number of processes using the shared memory pool */
     int sm_extra_procs;   /**< number of extra procs to allow */
     char* sm_mpool_name;  /**< name of shared memory pool module */
-    mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */
+    mca_mpool_base_module_t **sm_mpools; /**< shared memory pools (one for each memory node */
+    mca_mpool_base_module_t *sm_mpool;   /**< mpool on local node */
     void* sm_mpool_base;  /**< base address of shared memory pool */
     size_t eager_limit;   /**< first fragment size */
     size_t max_frag_size; /**< maximum (second and beyone) fragment size */
@@ -82,11 +87,13 @@
                            shared memory */
     ompi_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */
     char **shm_bases;     /**< pointer to base pointers in shared memory */
+    uint16_t *shm_mem_nodes; /**< pointer to mem noded in shared memory */
     ompi_fifo_t **fifo;   /**< cached copy of the pointer to the 2D
                             fifo array.  The address in the shared
                             memory segment sm_ctl_header is a relative,
                             but this one, in process private memory, is
                             a real virtual address */
+    uint16_t *mem_nodes;  /**< cached copy of mem nodes of each local rank */
     size_t size_of_cb_queue; /**< size of each circular buffer queue array */
     size_t cb_lazy_free_freq; /**< frequency of lazy free */
     int cb_max_num;       /**< max number of circular buffers for each peer */
@@ -94,16 +101,15 @@
                            addresses, per local process value */
     int32_t num_smp_procs; /**< current number of smp procs on this host */
     int32_t my_smp_rank;  /**< My SMP process rank.  Used for accessing
-                           * SMP specfic data structures. */
+                           * SMP specfic data structures. */
     ompi_free_list_t sm_frags1; /**< free list of sm first */
     ompi_free_list_t sm_frags2; /**< free list of sm second */
     ompi_free_list_t sm_frags;  /**< free list of frags without data */
-    ompi_free_list_t sm_first_frags_to_progress; /**< list of first
-                                                    fragments that are
-                                                    awaiting resources */
     struct mca_btl_base_endpoint_t **sm_peers;

     opal_free_list_t pending_send_fl;
+    int mem_node;
+    int num_mem_nodes;
 #if OMPI_ENABLE_PROGRESS_THREADS == 1
     char sm_fifo_path[PATH_MAX]; /**< path to fifo used to signal this process */
diff --git a/ompi/mca/btl/sm/btl_sm_fifo.h b/ompi/mca/btl/sm/btl_sm_fifo.h
index 4d5a29f..3208f11 100644
--- a/ompi/mca/btl/sm/btl_sm_fifo.h
+++ b/ompi/mca/btl/sm/btl_sm_fifo.h
@@ -14,7 +14,7 @@ do { \
     if(opal_using_threads()) \
         opal_atomic_lock(fifo->head_lock); \
     /* post fragment */ \
-    if(ompi_fifo_write_to_head(hdr, fifo, mca_btl_sm_component.sm_mpool) \
+    if(ompi_fifo_write_to_head(hdr, fifo) \
             != OMPI_SUCCESS) { \
         btl_sm_add_pending(endpoint_peer, hdr, resend); \
         rc = OMPI_ERR_RESOURCE_BUSY; \
diff --git a/ompi/mca/mpool/sm/mpool_sm.h b/ompi/mca/mpool/sm/mpool_sm.h
index 71eacb5..3bb30f1 100644
--- a/ompi/mca/mpool/sm/mpool_sm.h
+++ b/ompi/mca/mpool/sm/mpool_sm.h
@@ -5,15 +5,15 @@
  * Copyright (c) 2004-2006 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
  * $COPYRIGHT$
- *
+ *
  * Additional copyrights may follow
- *
+ *
  * $HEADER$
  */
 /**
@@ -42,18 +42,23 @@ struct mca_mpool_sm_component_t {
 };
 typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t;

+typedef struct mca_mpool_base_resources_t {
+    int32_t mem_node;
+} mca_mpool_base_resources_t;
+
 OMPI_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component;

-struct mca_mpool_sm_module_t {
-    mca_mpool_base_module_t super;
-    mca_allocator_base_module_t * sm_allocator;
-    struct mca_mpool_sm_mmap_t *sm_mmap;
-};
-typedef struct mca_mpool_sm_module_t mca_mpool_sm_module_t;
+typedef struct mca_mpool_sm_module_t {
+    mca_mpool_base_module_t super;
+    mca_allocator_base_module_t * sm_allocator;
+    struct mca_mpool_sm_mmap_t *sm_mmap;
+    int32_t mem_node;
+} mca_mpool_sm_module_t;

-/*
- * Initializes the mpool module.
- */
-void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool);
+/*
+ * Initializes the mpool module.
+ */
+void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool);

 /*
@@ -64,10 +69,10 @@ void* mca_mpool_sm_base(mca_mpool_base_module_t*);

 /**
  * Allocate block of shared memory.
  */
-void* mca_mpool_sm_alloc(
-    mca_mpool_base_module_t* mpool,
-    size_t size,
-    size_t align,
+void* mca_mpool_sm_alloc(
+    mca_mpool_base_module_t* mpool,
+    size_t size,
+    size_t align,
     uint32_t flags,
     mca_mpool_base_registration_t** registration);

@@ -75,17 +80,17 @@ void* mca_mpool_sm_alloc(
  * realloc function typedef
  */
 void* mca_mpool_sm_realloc(
-    mca_mpool_base_module_t* mpool,
-    void* addr,
-    size_t size,
+    mca_mpool_base_module_t* mpool,
+    void* addr,
+    size_t size,
     mca_mpool_base_registration_t** registration);

 /**
  * free function typedef
  */
 void mca_mpool_sm_free(
-    mca_mpool_base_module_t* mpool,
-    void * addr,
+    mca_mpool_base_module_t* mpool,
+    void * addr,
     mca_mpool_base_registration_t* registration);

 /**
diff --git a/ompi/mca/mpool/sm/mpool_sm_component.c b/ompi/mca/mpool/sm/mpool_sm_component.c
index 632a69e..60ce052 100644
--- a/ompi/mca/mpool/sm/mpool_sm_component.c
+++ b/ompi/mca/mpool/sm/mpool_sm_component.c
@@ -5,15 +5,15 @@
  * Copyright (c) 2004-2006 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2007      Sun Microsystems, Inc.  All rights reserved.
  * $COPYRIGHT$
- *
+ *
  * Additional copyrights may follow
- *
+ *
  * $HEADER$
  */
@@ -65,7 +65,7 @@ mca_mpool_sm_component_t mca_mpool_sm_component = {
     },

     /* Next the MCA v1.0.0 component meta data */
-
+
     {
         /* The component is not checkpoint ready */
         MCA_BASE_METADATA_PARAM_NONE
@@ -89,7 +89,7 @@ static int mca_mpool_sm_open(void)
     default_max = 512*1024*1024;
     default_min = 128*1024*1024;
     default_peer = 32*1024*1024;
-
+
     /* register SM component parameters */
     mca_base_param_reg_string(&mca_mpool_sm_component.super.mpool_version,
                               "allocator",
@@ -151,12 +151,12 @@ static mca_mpool_base_module_t* mca_mpool_sm_init(
 {
     char *file_name;
     int len;
-    mca_mpool_sm_module_t* mpool_module;
+    mca_mpool_sm_module_t* mpool_module;
     mca_allocator_base_component_t* allocator_component;
     long max_size, min_size, peer_size;
     ompi_proc_t **procs;
     size_t num_all_procs, i, num_local_procs = 0;
-
+
     /* README: this needs to change if procs in different jobs (even
        spawned ones) are to talk using shared memory */
     procs = ompi_proc_world(&num_all_procs);
         }
     }
     free(procs);
-
+
     /* parse the max, min and peer sizes, and validate them */
-    /* absolutely necessary to reset errno each time */
+    /* absolutely necessary to reset errno each time */
     errno = 0;
     max_size = strtol(max_size_param, (char **)NULL, 10);
     if (errno == ERANGE) {
         orte_output(0, "mca_mpool_sm_init: invalid max_size entered. set it to (%ld)", default_max);
         max_size = default_max;
     }
-
+
     errno = 0;
     min_size = strtol(min_size_param, (char **)NULL, 10);
     if (errno == ERANGE) {
@@ -212,7 +212,7 @@
      * (peer_size * num_local_procs) is going to overflow SIZE_MAX, then we'll
      * set sm_size to max_size. */
     if ((double)LONG_MAX / num_local_procs < peer_size) {
-        /* enable verbose would show if sm_size overflows */
+        /* enable verbose would show if sm_size overflows */
         orte_output(mca_mpool_sm_component.verbose,
                 "mca_mpool_sm_init: sm_size overflows, set sm_size to max_size (%ld)",
                 LONG_MAX);
@@ -246,8 +246,8 @@
         }
     }

-    mpool_module = (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t));
-    mca_mpool_sm_module_init(mpool_module);
+    mpool_module = (mca_mpool_sm_module_t*)malloc(sizeof(mca_mpool_sm_module_t));
+    mca_mpool_sm_module_init(mpool_module);

     /* create initial shared memory mapping */
     len = asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
     if ( 0 > len ) {
         return NULL;
     }
-
+
     orte_output(mca_mpool_sm_component.verbose,
             "mca_mpool_sm_init: shared memory size used: (%ld)",
             mca_mpool_sm_component.sm_size);
-    if(NULL ==
-       (mca_common_sm_mmap =
+    if(NULL ==
+       (mca_common_sm_mmap =
         mca_common_sm_mmap_init(mca_mpool_sm_component.sm_size,
                                 file_name,sizeof(mca_common_sm_mmap_t), 8 )
-       ))
+       ))
     {
         orte_output(0, "mca_mpool_sm_init: unable to create shared memory mapping (%s)", file_name);
         free(file_name);
@@ -274,14 +274,16 @@
     free(file_name);

     /* setup allocator */
-    mpool_module->sm_allocator =
+    mpool_module->sm_allocator =
       allocator_component->allocator_init(true,
                                           mca_common_sm_mmap_seg_alloc,
                                           NULL, NULL);
     if(NULL == mpool_module->sm_allocator) {
         orte_output(0, "mca_mpool_sm_init: unable to initialize allocator");
         return NULL;
     }
-
+
+    mpool_module->mem_node = resources ? resources->mem_node : -1;
+
     return &mpool_module->super;
 }
diff --git a/ompi/mca/mpool/sm/mpool_sm_module.c b/ompi/mca/mpool/sm/mpool_sm_module.c
index 1efab2e..9610a14 100644
--- a/ompi/mca/mpool/sm/mpool_sm_module.c
+++ b/ompi/mca/mpool/sm/mpool_sm_module.c
@@ -5,39 +5,44 @@
  * Copyright (c) 2004-2005 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * $COPYRIGHT$
- *
+ *
  * Additional copyrights may follow
- *
+ *
  * $HEADER$
  */

 #include "ompi_config.h"
 #include <string.h>
+#include <unistd.h>
 #include "orte/util/output.h"
+#include "opal/include/opal/align.h"
+#include "opal/mca/maffinity/maffinity.h"
+#include "opal/mca/maffinity/maffinity_types.h"
+#include "opal/mca/maffinity/base/base.h"
 #include "ompi/mca/mpool/sm/mpool_sm.h"
 #include "ompi/mca/common/sm/common_sm_mmap.h"

-/*
+/*
  * Initializes the mpool module.
- */
+ */
 void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool)
 {
-    mpool->super.mpool_component = &mca_mpool_sm_component.super;
-    mpool->super.mpool_base = mca_mpool_sm_base;
-    mpool->super.mpool_alloc = mca_mpool_sm_alloc;
-    mpool->super.mpool_realloc = mca_mpool_sm_realloc;
-    mpool->super.mpool_free = mca_mpool_sm_free;
-    mpool->super.mpool_find = NULL;
-    mpool->super.mpool_register = NULL;
-    mpool->super.mpool_deregister = NULL;
+    mpool->super.mpool_component = &mca_mpool_sm_component.super;
+    mpool->super.mpool_base = mca_mpool_sm_base;
+    mpool->super.mpool_alloc = mca_mpool_sm_alloc;
+    mpool->super.mpool_realloc = mca_mpool_sm_realloc;
+    mpool->super.mpool_free = mca_mpool_sm_free;
+    mpool->super.mpool_find = NULL;
+    mpool->super.mpool_register = NULL;
+    mpool->super.mpool_deregister = NULL;
     mpool->super.mpool_release_memory = NULL;
-    mpool->super.mpool_finalize = NULL;
+    mpool->super.mpool_finalize = NULL;
     mpool->super.mpool_ft_event = mca_mpool_sm_ft_event;
     mpool->super.flags = 0;
 }
@@ -51,39 +56,60 @@ void* mca_mpool_sm_base(mca_mpool_base_module_t* mpool)
 }

 /**
- * allocate function
+ * allocate function
  */
 void* mca_mpool_sm_alloc(
-    mca_mpool_base_module_t* mpool,
-    size_t size,
-    size_t align,
+    mca_mpool_base_module_t* mpool,
+    size_t size,
+    size_t align,
     uint32_t flags,
     mca_mpool_base_registration_t** registration)
 {
-    mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool;
-    return mpool_sm->sm_allocator->alc_alloc(mpool_sm->sm_allocator, size, align, registration);
+    mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool;
+    opal_maffinity_base_segment_t mseg;
+
+    mseg.mbs_start_addr =
+        mpool_sm->sm_allocator->alc_alloc(mpool_sm->sm_allocator, size,
+                OPAL_ALIGN(align, getpagesize(), size_t), registration);
+
+    if(mpool_sm->mem_node >= 0) {
+        mseg.mbs_len = size;
+        opal_maffinity_base_bind(&mseg, 1, mpool_sm->mem_node);
+    }
+
+    return mseg.mbs_start_addr;
 }

 /**
- * realloc function
+ * realloc function
  */
 void* mca_mpool_sm_realloc(
-    mca_mpool_base_module_t* mpool,
-    void* addr,
-    size_t size,
+    mca_mpool_base_module_t* mpool,
+    void* addr,
+    size_t size,
     mca_mpool_base_registration_t** registration)
 {
-    mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool;
-    return mpool_sm->sm_allocator->alc_realloc(mpool_sm->sm_allocator, addr, size, registration);
+    mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool;
+    opal_maffinity_base_segment_t mseg;
+
+    mseg.mbs_start_addr =
+        mpool_sm->sm_allocator->alc_realloc(mpool_sm->sm_allocator, addr, size,
+                registration);
+    if(mpool_sm->mem_node >= 0) {
+        mseg.mbs_len = size;
+        opal_maffinity_base_bind(&mseg, 1, mpool_sm->mem_node);
+    }
+
+    return mseg.mbs_start_addr;
 }

 /**
- * free function
+ * free function
  */
-void mca_mpool_sm_free(mca_mpool_base_module_t* mpool, void * addr,
+void mca_mpool_sm_free(mca_mpool_base_module_t* mpool, void * addr,
     mca_mpool_base_registration_t* registration)
 {
-    mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool;
+    mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool;
     mpool_sm->sm_allocator->alc_free(mpool_sm->sm_allocator, addr);
 }
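[Editor's note: for completeness, here is the sort of command line that should exercise the new path, per Gleb's notes above. This is an untested sketch: orte_tmpdir_base is taken from the mail, while the rankfile flag and the carto parameter names are assumptions that should be double-checked against your build; /dev/shm is assumed to be a tmpfs mount point.]

  mpirun -np 4 -rf my_rankfile \
      -mca carto file -mca carto_file_path my_carto_file \
      -mca orte_tmpdir_base /dev/shm ./my_app

With each rank pinned by the rankfile and the session directory on tmpfs, a rank's circular buffers are allocated on the memory node of their receiver, which is the point of the second patch.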