Apologies, I didn't explain it very well. I do make sure that if the tile size 
(256*8 bytes) is smaller than 4096 (the page size), then I double the number of 
tiles per page — I just wanted to keep the explanation simple. 

here are some code snippets to give you the flavour of it 

Initializing the helper struct

    // Constructs the NUMA binding helper for an Ncols x Nrows matrix split
    // into tiles of Ntile elements, with Ntiles_per_domain tiles assigned to
    // each NUMA domain.  Ncolprocs/Nrowprocs describe the process grid
    // (default 1x1); pool_name selects the HPX pool recorded in the base
    // binding_helper<T>.
    // NOTE(review): this snippet is quoted from a mailing-list post and is
    // truncated — the closing ')' of the parameter list, the '{' of the body,
    // and the tail of the leading_dim_ expression are missing from the excerpt.
    matrix_numa_binder(std::size_t Ncols, std::size_t Nrows,
                       std::size_t Ntile, std::size_t Ntiles_per_domain,
                       std::size_t Ncolprocs=1, std::size_t Nrowprocs=1,
                       std::string pool_name="default"
        : cols_(Ncols), rows_(Nrows),
          tile_size_(Ntile), tiles_per_domain_(Ntiles_per_domain),
          colprocs_(Ncolprocs), rowprocs_(Nrowprocs)
        using namespace hpx::compute::host;
        binding_helper<T>::pool_name_ = pool_name;
        // Alignment is the larger of the OS page size and the L1 data-cache
        // line size, converted to a count of T elements (ELEMS_ALIGN).
        const int CACHE_LINE_SIZE = sysconf (_SC_LEVEL1_DCACHE_LINESIZE);
        const int PAGE_SIZE       = sysconf(_SC_PAGE_SIZE);
        const int ALIGNMENT       = std::max(PAGE_SIZE,CACHE_LINE_SIZE);
        const int ELEMS_ALIGN     = (ALIGNMENT/sizeof(T));
        rows_page_        = ELEMS_ALIGN;
        leading_dim_      = ELEMS_ALIGN*((rows_*sizeof(T) + // NOTE(review): expression truncated in the excerpt
        // Ensure at least one aligned block's worth of tiles per domain
        // (presumably so a domain's chunk never spans less than one page —
        // TODO confirm against the full source).
        tiles_per_domain_ = std::max(tiles_per_domain_, ELEMS_ALIGN/tile_size_);

Operator called by the allocator; it returns the index of the NUMA domain that a given page should be bound to

    // Maps one page of the allocation to a NUMA domain index.
    //   base_ptr : start of the whole allocation
    //   page_ptr : start of the page being queried
    //   pagesize : page size in bytes
    //   domains  : number of NUMA domains to round-robin over
    // The element offset is decomposed into (col, row) using leading_dim_,
    // the column is converted to a tile-group index, and the result is
    // wrapped modulo the domain count.
    // NOTE(review): truncated excerpt — the '{' after 'const', the body of
    // the 'else' branch, and the closing braces are missing.
    virtual std::size_t operator ()(
            const T * const base_ptr, const T * const page_ptr,
            const std::size_t pagesize, const std::size_t domains) const 
        std::size_t offset  = (page_ptr - base_ptr); // offset in T elements
        std::size_t col     = (offset / leading_dim_);
        std::size_t row     = (offset % leading_dim_);
        std::size_t index   = (col / (tile_size_ * tiles_per_domain_));

        // When a domain's chunk of tiles is at least one page, the row
        // position also contributes to the domain index.
        if ((tile_size_*tiles_per_domain_*sizeof(T))>=pagesize) {
            index += (row / (tile_size_ * tiles_per_domain_));
        else {
        return index % domains;

this function is called by each thread (one per numa domain) and if the domain 
returned by the page query matches the domain ID of the thread/task then the 
first memory location on the page is written to

            // First-touch binding loop: one task runs per NUMA domain and
            // walks every page of the allocation; each page is written only
            // by the task whose domain matches the helper's verdict, so the
            // OS first-touch policy places the page on that domain.
            // NOTE(review): truncated excerpt — the tail of the operator()
            // argument list and the loop's closing braces are cut off.
            // NOTE(review): '*vaddr = T(0)' zero-fills rather than doing a
            // value-preserving read-rewrite (the commented-out '*vaddr' hints
            // at the latter intent) — confirm against the full source.
            for (size_type i=0; i<num_pages; ++i) {
                // we pass the base pointer and current page pointer
                size_type dom = helper->operator()(p, page_ptr, pagesize, 
                if (dom==numa_domain) {
                    // trigger a memory read and rewrite without changing 
                    volatile char* vaddr = (volatile char*) page_ptr;
                    *vaddr = T(0); // *vaddr;
                page_ptr += pageN;

All of this has been debugged quite extensively and I can write numbers to 
memory and read them back and the patterns always match the domains expected.

This function is called after all data is written to attempt to verify (and 
display the patterns above)

    // Queries hwloc for the NUMA node on which the memory at 'addr' is
    // currently resident; returns the first node found, or -1 on failure.
    // NOTE(review): truncated excerpt — the '{' bodies, the initializer of
    // 'ns', the flags argument of hwloc_get_area_memlocation, the exception
    // call wrapping the error message, and the '#else'/'#endif' around the
    // final 'return 0' fallback are all cut off in this quote.
    int topology::get_numa_domain(const void *addr) const
// 0x00010b06 encodes hwloc 1.11.6; hwloc_get_area_memlocation is not
// available on older hwloc, so those builds take the 'return 0' stub below.
#if HWLOC_API_VERSION >= 0x00010b06
        // lazily allocated, cached bitmap (stored in bitmap_storage_)
        hpx_hwloc_bitmap_wrapper *nodeset = topology::bitmap_storage_.get();
        if (nullptr == nodeset)
            hwloc_bitmap_t nodeset_ = hwloc_bitmap_alloc();
            nodeset = topology::bitmap_storage_.get();
        hwloc_nodeset_t ns = 

        // ask hwloc which NUMA node(s) hold the single byte at 'addr'
        int ret = hwloc_get_area_memlocation(topo, addr, 1,  ns,
        if (ret<0) {
            std::string msg(strerror(errno));
              , "hpx::threads::topology::get_numa_domain"
              , "hwloc_get_area_memlocation failed " + msg);
            return -1;
// this uses hwloc directly
//        int bit = hwloc_bitmap_first(ns);
//        return bit
// this uses an alternative method, both give the same result AFAICT
        threads::mask_type mask = bitmap_to_mask(ns, HWLOC_OBJ_NUMANODE);
        return static_cast<int>(threads::find_first(mask));
        // fallback for pre-1.11.6 hwloc: report domain 0 unconditionally
        // (presumably inside an #else branch removed from this excerpt)
        return 0;

Thanks for taking the time to look it over

hwloc-users mailing list

Reply via email to