diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c
index c45f100..a8ccc69 100644
--- a/ompi/mca/coll/base/coll_base_util.c
+++ b/ompi/mca/coll/base/coll_base_util.c
@@ -211,27 +211,23 @@ int ompi_coll_base_retain_datatypes( ompi_request_t *req, ompi_datatype_t *stype
 
 static void release_vecs_callback(ompi_coll_base_nbc_request_t *request) {
     ompi_communicator_t *comm = request->super.req_mpi_object.comm;
-    int scount, rcount;
-    if (OMPI_COMM_IS_TOPO(comm)) {
-        (void)mca_topo_base_neighbor_count (comm, &rcount, &scount);
-    } else {
-        scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
-    }
-    if (NULL != request->data.vecs.stypes) {
-        for (int i=0; i<scount; i++) {
+    if (NULL != request->data.vecs.stypes && 0 < request->data.vecs.scount) { /* release the send-type vector using the count stored on the request */
+        for (int i=0; i<request->data.vecs.scount; i++) { /* bound is the stored scount; the communicator is no longer queried for it */
             if (NULL != request->data.vecs.stypes[i]) {
                 OMPI_DATATYPE_RELEASE_NO_NULLIFY(request->data.vecs.stypes[i]);
             }
         }
         request->data.vecs.stypes = NULL;
+        request->data.vecs.scount = 0; /* zeroed together with stypes so the guard above is false on any later call */
     }
-    if (NULL != request->data.vecs.rtypes) {
-        for (int i=0; i<rcount; i++) {
+    if (NULL != request->data.vecs.rtypes && 0 < request->data.vecs.rcount) { /* same handling for the receive-type vector */
+        for (int i=0; i<request->data.vecs.rcount; i++) {
             if (NULL != request->data.vecs.rtypes[i]) {
                 OMPI_DATATYPE_RELEASE_NO_NULLIFY(request->data.vecs.rtypes[i]);
             }
         }
         request->data.vecs.rtypes = NULL;
+        request->data.vecs.rcount = 0; /* zeroed together with rtypes so the guard above is false on any later call */
     }
 }
 
@@ -257,19 +253,14 @@ static int free_vecs_callback(struct ompi_request_t **rptr) {
 }
 
 int ompi_coll_base_retain_datatypes_w( ompi_request_t *req,
-                                       ompi_datatype_t *const stypes[], ompi_datatype_t *const rtypes[]) {
+                                       ompi_datatype_t *const stypes[], int scount,
+                                       ompi_datatype_t *const rtypes[], int rcount) {
     ompi_coll_base_nbc_request_t *request = (ompi_coll_base_nbc_request_t *)req;
     bool retain = false;
     ompi_communicator_t *comm = request->super.req_mpi_object.comm;
-    int scount, rcount;
     if (REQUEST_COMPLETE(req)) {
         return OMPI_SUCCESS;
     }
-    if (OMPI_COMM_IS_TOPO(comm)) {
-        (void)mca_topo_base_neighbor_count (comm, &rcount, &scount);
-    } else {
-        scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
-    }
 
     for (int i=0; i<scount; i++) {
         if (NULL != stypes && NULL != stypes[i] && !ompi_datatype_is_predefined(stypes[i])) {
@@ -285,7 +276,9 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req,
     }
     if (OPAL_UNLIKELY(retain)) {
         request->data.vecs.stypes = stypes;
+        request->data.vecs.scount = scount;
         request->data.vecs.rtypes = rtypes;
+        request->data.vecs.rcount = rcount;
         if (req->req_persistent) {
             request->cb.req_free = req->req_free;
             req->req_free = free_vecs_callback;
@@ -305,6 +298,8 @@ static void nbc_req_cons(ompi_coll_base_nbc_request_t *req)
     req->req_complete_cb_data = NULL;
     req->data.objs.objs[0] = NULL;
     req->data.objs.objs[1] = NULL;
+    req->data.vecs.scount = 0;
+    req->data.vecs.rcount = 0;
 }
 
 OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, NULL);
diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h
index 46c9515..c6d466e 100644
--- a/ompi/mca/coll/base/coll_base_util.h
+++ b/ompi/mca/coll/base/coll_base_util.h
@@ -60,6 +60,8 @@ struct ompi_coll_base_nbc_request_t {
         struct {
             ompi_datatype_t * const *stypes;
             ompi_datatype_t * const *rtypes;
+            int scount;
+            int rcount;
         } vecs;
     } data;
 };
@@ -174,8 +176,8 @@ int ompi_coll_base_retain_datatypes( ompi_request_t *request,
  * (will be cast internally).
  */
 int ompi_coll_base_retain_datatypes_w( ompi_request_t *request,
-                                       ompi_datatype_t * const stypes[],
-                                       ompi_datatype_t * const rtypes[]);
+                                       ompi_datatype_t * const stypes[], int scount,
+                                       ompi_datatype_t * const rtypes[], int rcount);
 
 /* File reading function */
 int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val);
diff --git a/ompi/mca/pml/ucx/pml_ucx_datatype.h b/ompi/mca/pml/ucx/pml_ucx_datatype.h
index 8e1fbba..97653d1 100644
--- a/ompi/mca/pml/ucx/pml_ucx_datatype.h
+++ b/ompi/mca/pml/ucx/pml_ucx_datatype.h
@@ -14,6 +14,7 @@
 
 
 #define PML_UCX_DATATYPE_INVALID   0
+#define PML_UCX_DATATYPE_PENDING   1
 
 #ifdef HAVE_UCP_REQUEST_PARAM_T
 typedef struct {
@@ -49,9 +50,17 @@ static inline ucp_datatype_t mca_pml_ucx_get_datatype(ompi_datatype_t *datatype)
 #ifdef HAVE_UCP_REQUEST_PARAM_T
     pml_ucx_datatype_t *ucp_type = (pml_ucx_datatype_t*)datatype->pml_data;
 
-    if (OPAL_LIKELY(ucp_type != PML_UCX_DATATYPE_INVALID)) {
+    if (OPAL_LIKELY(ucp_type != PML_UCX_DATATYPE_INVALID && (int64_t)ucp_type != PML_UCX_DATATYPE_PENDING)) {
         return ucp_type->datatype;
     }
+    /* Slow path: the thread that wins the CAS runs the init; the others wait until pml_data leaves PENDING. */
+    int64_t oldval = PML_UCX_DATATYPE_INVALID;
+    if (opal_atomic_compare_exchange_strong_64((int64_t *)&datatype->pml_data, &oldval, PML_UCX_DATATYPE_PENDING)) {
+        return mca_pml_ucx_init_datatype(datatype);
+    }
+    /* volatile read forces a reload per iteration; afterwards pml_data is a pml_ucx_datatype_t pointer, not the handle */
+    while (PML_UCX_DATATYPE_PENDING == *(volatile int64_t *)&datatype->pml_data) { }
+    return ((pml_ucx_datatype_t *)datatype->pml_data)->datatype;
 #else
     ucp_datatype_t ucp_type = datatype->pml_data;
 
@@ -70,11 +79,16 @@ mca_pml_ucx_get_op_data(ompi_datatype_t *datatype)
 {
     pml_ucx_datatype_t *ucp_type = (pml_ucx_datatype_t*)datatype->pml_data;
 
-    if (OPAL_LIKELY(ucp_type != PML_UCX_DATATYPE_INVALID)) {
+    if (OPAL_LIKELY(ucp_type != PML_UCX_DATATYPE_INVALID && (int64_t)ucp_type != PML_UCX_DATATYPE_PENDING)) {
         return ucp_type;
     }
+    int64_t oldval = PML_UCX_DATATYPE_INVALID;
+    if (opal_atomic_compare_exchange_strong_64((int64_t *)&datatype->pml_data, &oldval, PML_UCX_DATATYPE_PENDING)) {
+        mca_pml_ucx_init_datatype(datatype);
+    } else { /* lost the CAS: spin until pml_data leaves PENDING; the volatile read forces a reload per iteration */
+        while (PML_UCX_DATATYPE_PENDING == *(volatile int64_t *)&datatype->pml_data) { }
+    }
 
-    mca_pml_ucx_init_datatype(datatype);
     return (pml_ucx_datatype_t*)datatype->pml_data;
 }
 
diff --git a/ompi/mpi/c/ialltoallw.c b/ompi/mpi/c/ialltoallw.c
index 6dc4af8..591e76d 100644
--- a/ompi/mpi/c/ialltoallw.c
+++ b/ompi/mpi/c/ialltoallw.c
@@ -78,6 +78,8 @@ int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispl
         }
     );
 
+    size = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
+
     if (MPI_PARAM_CHECK) {
 
         /* Unrooted operation -- same checks for all ranks */
@@ -102,7 +104,6 @@ int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispl
             return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME);
         }
 
-        size = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
         for (i = 0; i < size; ++i) {
             OMPI_CHECK_DATATYPE_FOR_SEND(err, sendtypes[i], sendcounts[i]);
             OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);
@@ -129,7 +130,7 @@ int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispl
                                        rdispls, recvtypes, comm, request,
                                        comm->c_coll->coll_ialltoallw_module);
     if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
-        ompi_coll_base_retain_datatypes_w(*request, (MPI_IN_PLACE==sendbuf)?NULL:sendtypes, recvtypes);
+        ompi_coll_base_retain_datatypes_w(*request, (MPI_IN_PLACE==sendbuf)?NULL:sendtypes, size, recvtypes, size);
     }
     OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
 }
diff --git a/ompi/mpi/c/ineighbor_alltoallw.c b/ompi/mpi/c/ineighbor_alltoallw.c
index 1821a33..3dc6531 100644
--- a/ompi/mpi/c/ineighbor_alltoallw.c
+++ b/ompi/mpi/c/ineighbor_alltoallw.c
@@ -88,7 +88,10 @@ int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const M
         }
     );
 
+    err = mca_topo_base_neighbor_count (comm, &indegree, &outdegree);
+
     if (MPI_PARAM_CHECK) {
+        OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);
 
         /* Unrooted operation -- same checks for all ranks */
 
@@ -102,8 +105,6 @@ int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const M
                                           FUNC_NAME);
         }
 
-        err = mca_topo_base_neighbor_count (comm, &indegree, &outdegree);
-        OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);
         if (((0 < outdegree) && ((NULL == sendcounts) || (NULL == sdispls) || (NULL == sendtypes))) ||
             ((0 < indegree) && ((NULL == recvcounts) || (NULL == rdispls) || (NULL == recvtypes))) ||
             MPI_IN_PLACE == sendbuf || MPI_IN_PLACE == recvbuf) {
@@ -148,7 +149,7 @@ int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const M
                                                 recvbuf, recvcounts, rdispls, recvtypes, comm, request,
                                                 comm->c_coll->coll_ineighbor_alltoallw_module);
     if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
-        ompi_coll_base_retain_datatypes_w(*request, sendtypes, recvtypes);
+        ompi_coll_base_retain_datatypes_w(*request, sendtypes, outdegree, recvtypes, indegree);
     }
     OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
 }
diff --git a/ompi/mpiext/pcollreq/c/alltoallw_init.c b/ompi/mpiext/pcollreq/c/alltoallw_init.c
index 50902f1..5de0df4 100644
--- a/ompi/mpiext/pcollreq/c/alltoallw_init.c
+++ b/ompi/mpiext/pcollreq/c/alltoallw_init.c
@@ -79,6 +79,8 @@ int MPIX_Alltoallw_init(const void *sendbuf, const int sendcounts[], const int s
         }
     );
 
+    size = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
+
     if (MPI_PARAM_CHECK) {
 
         /* Unrooted operation -- same checks for all ranks */
@@ -103,7 +105,6 @@ int MPIX_Alltoallw_init(const void *sendbuf, const int sendcounts[], const int s
             return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME);
         }
 
-        size = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
         for (i = 0; i < size; ++i) {
             OMPI_CHECK_DATATYPE_FOR_SEND(err, sendtypes[i], sendcounts[i]);
             OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);
@@ -130,7 +131,7 @@ int MPIX_Alltoallw_init(const void *sendbuf, const int sendcounts[], const int s
                                             rdispls, recvtypes, comm, info, request,
                                             comm->c_coll->coll_alltoallw_init_module);
     if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
-        ompi_coll_base_retain_datatypes_w(*request, (MPI_IN_PLACE==sendbuf)?NULL:sendtypes, recvtypes);
+        ompi_coll_base_retain_datatypes_w(*request, (MPI_IN_PLACE==sendbuf)?NULL:sendtypes, size, recvtypes, size);
     }
     OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
 }
diff --git a/ompi/mpiext/pcollreq/c/neighbor_alltoallw_init.c b/ompi/mpiext/pcollreq/c/neighbor_alltoallw_init.c
index 1143ccb..faad1b0 100644
--- a/ompi/mpiext/pcollreq/c/neighbor_alltoallw_init.c
+++ b/ompi/mpiext/pcollreq/c/neighbor_alltoallw_init.c
@@ -89,11 +89,12 @@ int MPIX_Neighbor_alltoallw_init(const void *sendbuf, const int sendcounts[], co
         }
     );
 
+    err = mca_topo_base_neighbor_count (comm, &indegree, &outdegree);
     if (MPI_PARAM_CHECK) {
+        OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);
 
         /* Unrooted operation -- same checks for all ranks */
 
-        err = MPI_SUCCESS;
         OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
         if (ompi_comm_invalid(comm) || OMPI_COMM_IS_INTER(comm)) {
             return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM,
@@ -109,8 +110,6 @@ int MPIX_Neighbor_alltoallw_init(const void *sendbuf, const int sendcounts[], co
             return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME);
         }
 
-        err = mca_topo_base_neighbor_count (comm, &indegree, &outdegree);
-        OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);
         for (i = 0; i < outdegree; ++i) {
             OMPI_CHECK_DATATYPE_FOR_SEND(err, sendtypes[i], sendcounts[i]);
             OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME);
@@ -151,7 +150,7 @@ int MPIX_Neighbor_alltoallw_init(const void *sendbuf, const int sendcounts[], co
                                                      info, request,
                                                      comm->c_coll->coll_neighbor_alltoallw_init_module);
     if (OPAL_LIKELY(OMPI_SUCCESS == err)) {
-        ompi_coll_base_retain_datatypes_w(*request, sendtypes, recvtypes);
+        ompi_coll_base_retain_datatypes_w(*request, sendtypes, outdegree, recvtypes, indegree);
     }
     OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME);
 }
