Re: [PATCH v2 16/53] migration/rdma: Fix or document problematic uses of errno

2023-10-06 Thread Zhijian Li (Fujitsu)


On 28/09/2023 21:19, Markus Armbruster wrote:
> We use errno after calling Libibverbs functions that are not
> documented to set errno (manual page does not mention errno), or where
> the documentation is unclear ("returns [...] the value of errno on
> failure").  While this could be read as "sets errno and returns it",
> a glance at the source code[*] kills that hope:
> 
>  static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr 
> *wr,
>  struct ibv_send_wr **bad_wr)
>  {
>  return qp->context->ops.post_send(qp, wr, bad_wr);
>  }
> 
> The callback can be
> 
>  static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
>struct ibv_send_wr **bad)
>  {
>  /* This version of driver supports RAW QP only.
>   * Posting WR is done directly in the application.
>   */
>  return EOPNOTSUPP;
>  }
> 
> Neither of them touches errno.
> 
> One of these errno uses is easy to fix, so do that now.  Several more
> will go away later in the series; add temporary FIXME commments.
> Three will remain; add TODO comments.  TODO, not FIXME, because the
> bug might be in Libibverbs documentation.
> 
> [*] https://github.com/linux-rdma/rdma-core.git
>  commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
> 
> Signed-off-by: Markus Armbruster 


Reviewed-by: Li Zhijian 


> ---
>   migration/rdma.c | 45 +++--
>   1 file changed, 39 insertions(+), 6 deletions(-)
> 
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 28097ce604..bba8c99fa9 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -853,6 +853,12 @@ static int qemu_rdma_broken_ipv6_kernel(struct 
> ibv_context *verbs, Error **errp)
>   
>   for (x = 0; x < num_devices; x++) {
>   verbs = ibv_open_device(dev_list[x]);
> +/*
> + * ibv_open_device() is not documented to set errno.  If
> + * it does, it's somebody else's doc bug.  If it doesn't,
> + * the use of errno below is wrong.
> + * TODO Find out whether ibv_open_device() sets errno.
> + */
>   if (!verbs) {
>   if (errno == EPERM) {
>   continue;
> @@ -1162,11 +1168,7 @@ static void qemu_rdma_advise_prefetch_mr(struct ibv_pd 
> *pd, uint64_t addr,
>   ret = ibv_advise_mr(pd, advice,
>   IBV_ADVISE_MR_FLAG_FLUSH, _list, 1);
>   /* ignore the error */
> -if (ret) {
> -trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
> -} else {
> -trace_qemu_rdma_advise_mr(name, len, addr, "successed");
> -}
> +trace_qemu_rdma_advise_mr(name, len, addr, strerror(ret));
>   #endif
>   }
>   
> @@ -1183,7 +1185,12 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext 
> *rdma)
>   local->block[i].local_host_addr,
>   local->block[i].length, access
>   );
> -
> +/*
> + * ibv_reg_mr() is not documented to set errno.  If it does,
> + * it's somebody else's doc bug.  If it doesn't, the use of
> + * errno below is wrong.
> + * TODO Find out whether ibv_reg_mr() sets errno.
> + */
>   if (!local->block[i].mr &&
>   errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
>   access |= IBV_ACCESS_ON_DEMAND;
> @@ -1291,6 +1298,12 @@ static int qemu_rdma_register_and_get_keys(RDMAContext 
> *rdma,
>   trace_qemu_rdma_register_and_get_keys(len, chunk_start);
>   
>   block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
> +/*
> + * ibv_reg_mr() is not documented to set errno.  If it does,
> + * it's somebody else's doc bug.  If it doesn't, the use of
> + * errno below is wrong.
> + * TODO Find out whether ibv_reg_mr() sets errno.
> + */
>   if (!block->pmr[chunk] &&
>   errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
>   access |= IBV_ACCESS_ON_DEMAND;
> @@ -1408,6 +1421,11 @@ static int qemu_rdma_unregister_waiting(RDMAContext 
> *rdma)
>   block->remote_keys[chunk] = 0;
>   
>   if (ret != 0) {
> +/*
> + * FIXME perror() is problematic, bcause ibv_dereg_mr() is
> + * not documented to set errno.  Will go away later in
> + * this series.
> + */
>   perror("unregistration chunk failed");
>   return -ret;
>   }
> @@ -1658,6 +1676,11 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
>   
>   ret = ibv_get_cq_event(ch, , _ctx);
>   if (ret) {
> +/*
> + * FIXME perror() is problematic, because ibv_reg_mr() is
> + * not documented to set errno.  Will go away later in
> + * this 

Re: [PATCH v2 16/53] migration/rdma: Fix or document problematic uses of errno

2023-10-05 Thread Juan Quintela
Markus Armbruster  wrote:
> We use errno after calling Libibverbs functions that are not
> documented to set errno (manual page does not mention errno), or where
> the documentation is unclear ("returns [...] the value of errno on
> failure").  While this could be read as "sets errno and returns it",
> a glance at the source code[*] kills that hope:
>
> static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
> struct ibv_send_wr **bad_wr)
> {
> return qp->context->ops.post_send(qp, wr, bad_wr);
> }
>
> The callback can be
>
> static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
>   struct ibv_send_wr **bad)
> {
> /* This version of driver supports RAW QP only.
>  * Posting WR is done directly in the application.
>  */
> return EOPNOTSUPP;
> }
>
> Neither of them touches errno.
>
> One of these errno uses is easy to fix, so do that now.  Several more
> will go away later in the series; add temporary FIXME commments.
> Three will remain; add TODO comments.  TODO, not FIXME, because the
> bug might be in Libibverbs documentation.
>
> [*] https://github.com/linux-rdma/rdma-core.git
> commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
>
> Signed-off-by: Markus Armbruster 

Reviewed-by: Juan Quintela 

And here I am, re-merging from this patch O:-)




Re: [PATCH v2 16/53] migration/rdma: Fix or document problematic uses of errno

2023-10-04 Thread Markus Armbruster
Fabiano Rosas  writes:

> Markus Armbruster  writes:
>
>> We use errno after calling Libibverbs functions that are not
>> documented to set errno (manual page does not mention errno), or where
>> the documentation is unclear ("returns [...] the value of errno on
>> failure").  While this could be read as "sets errno and returns it",
>> a glance at the source code[*] kills that hope:
>>
>> static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr 
>> *wr,
>> struct ibv_send_wr **bad_wr)
>> {
>> return qp->context->ops.post_send(qp, wr, bad_wr);
>> }
>>
>> The callback can be
>>
>> static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
>>   struct ibv_send_wr **bad)
>> {
>> /* This version of driver supports RAW QP only.
>>  * Posting WR is done directly in the application.
>>  */
>> return EOPNOTSUPP;
>> }
>>
>> Neither of them touches errno.
>>
>> One of these errno uses is easy to fix, so do that now.  Several more
>> will go away later in the series; add temporary FIXME commments.
>> Three will remain; add TODO comments.  TODO, not FIXME, because the
>> bug might be in Libibverbs documentation.
>>
>> [*] https://github.com/linux-rdma/rdma-core.git
>> commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
>>
>> Signed-off-by: Markus Armbruster 
>> ---
>>  migration/rdma.c | 45 +++--
>>  1 file changed, 39 insertions(+), 6 deletions(-)
>>
>> diff --git a/migration/rdma.c b/migration/rdma.c
>> index 28097ce604..bba8c99fa9 100644
>> --- a/migration/rdma.c
>> +++ b/migration/rdma.c
>> @@ -853,6 +853,12 @@ static int qemu_rdma_broken_ipv6_kernel(struct 
>> ibv_context *verbs, Error **errp)
>>  
>>  for (x = 0; x < num_devices; x++) {
>>  verbs = ibv_open_device(dev_list[x]);
>> +/*
>> + * ibv_open_device() is not documented to set errno.  If
>> + * it does, it's somebody else's doc bug.  If it doesn't,
>> + * the use of errno below is wrong.
>> + * TODO Find out whether ibv_open_device() sets errno.
>> + */
>>  if (!verbs) {
>>  if (errno == EPERM) {
>>  continue;
>
> This function can call into glibc, so it's not unreasonable to expect
> errno to be set at some point. We're not relying on errno to be set,
> just taking an action if it happens to be.

errno may well be set on some failures.  But it needs to be set on *all*
failures to be reliable.  If it's not, then its value on such failures
comes from some unrelated, prior errno-setting failure.

> I don't think someone would just decide to handle EPERM at this point
> for no reason. Specially since the manual makes no mention to
> errno. This was probably introduced after someone got bit by it.
>
> ... indeed the commit 5b61d57521 ("rdma: Fix qemu crash when IPv6
> address is used for migration") introduced this to fix a crash.

I don't doubt the error recovery added there works in the case described
by the commit message.  I suspect it can break on unrelated failures.




Re: [PATCH v2 16/53] migration/rdma: Fix or document problematic uses of errno

2023-09-29 Thread Fabiano Rosas
Markus Armbruster  writes:

> We use errno after calling Libibverbs functions that are not
> documented to set errno (manual page does not mention errno), or where
> the documentation is unclear ("returns [...] the value of errno on
> failure").  While this could be read as "sets errno and returns it",
> a glance at the source code[*] kills that hope:
>
> static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
> struct ibv_send_wr **bad_wr)
> {
> return qp->context->ops.post_send(qp, wr, bad_wr);
> }
>
> The callback can be
>
> static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
>   struct ibv_send_wr **bad)
> {
> /* This version of driver supports RAW QP only.
>  * Posting WR is done directly in the application.
>  */
> return EOPNOTSUPP;
> }
>
> Neither of them touches errno.
>
> One of these errno uses is easy to fix, so do that now.  Several more
> will go away later in the series; add temporary FIXME commments.
> Three will remain; add TODO comments.  TODO, not FIXME, because the
> bug might be in Libibverbs documentation.
>
> [*] https://github.com/linux-rdma/rdma-core.git
> commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
>
> Signed-off-by: Markus Armbruster 
> ---
>  migration/rdma.c | 45 +++--
>  1 file changed, 39 insertions(+), 6 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index 28097ce604..bba8c99fa9 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -853,6 +853,12 @@ static int qemu_rdma_broken_ipv6_kernel(struct 
> ibv_context *verbs, Error **errp)
>  
>  for (x = 0; x < num_devices; x++) {
>  verbs = ibv_open_device(dev_list[x]);
> +/*
> + * ibv_open_device() is not documented to set errno.  If
> + * it does, it's somebody else's doc bug.  If it doesn't,
> + * the use of errno below is wrong.
> + * TODO Find out whether ibv_open_device() sets errno.
> + */
>  if (!verbs) {
>  if (errno == EPERM) {
>  continue;

This function can call into glibc, so it's not unreasonable to expect
errno to be set at some point. We're not relying on errno to be set,
just taking an action if it happens to be.

I don't think someone would just decide to handle EPERM at this point
for no reason. Specially since the manual makes no mention to
errno. This was probably introduced after someone got bit by it.

... indeed the commit 5b61d57521 ("rdma: Fix qemu crash when IPv6
address is used for migration") introduced this to fix a crash.




[PATCH v2 16/53] migration/rdma: Fix or document problematic uses of errno

2023-09-28 Thread Markus Armbruster
We use errno after calling Libibverbs functions that are not
documented to set errno (manual page does not mention errno), or where
the documentation is unclear ("returns [...] the value of errno on
failure").  While this could be read as "sets errno and returns it",
a glance at the source code[*] kills that hope:

static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr)
{
return qp->context->ops.post_send(qp, wr, bad_wr);
}

The callback can be

static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
  struct ibv_send_wr **bad)
{
/* This version of driver supports RAW QP only.
 * Posting WR is done directly in the application.
 */
return EOPNOTSUPP;
}

Neither of them touches errno.

One of these errno uses is easy to fix, so do that now.  Several more
will go away later in the series; add temporary FIXME commments.
Three will remain; add TODO comments.  TODO, not FIXME, because the
bug might be in Libibverbs documentation.

[*] https://github.com/linux-rdma/rdma-core.git
commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf

Signed-off-by: Markus Armbruster 
---
 migration/rdma.c | 45 +++--
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 28097ce604..bba8c99fa9 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -853,6 +853,12 @@ static int qemu_rdma_broken_ipv6_kernel(struct ibv_context 
*verbs, Error **errp)
 
 for (x = 0; x < num_devices; x++) {
 verbs = ibv_open_device(dev_list[x]);
+/*
+ * ibv_open_device() is not documented to set errno.  If
+ * it does, it's somebody else's doc bug.  If it doesn't,
+ * the use of errno below is wrong.
+ * TODO Find out whether ibv_open_device() sets errno.
+ */
 if (!verbs) {
 if (errno == EPERM) {
 continue;
@@ -1162,11 +1168,7 @@ static void qemu_rdma_advise_prefetch_mr(struct ibv_pd 
*pd, uint64_t addr,
 ret = ibv_advise_mr(pd, advice,
 IBV_ADVISE_MR_FLAG_FLUSH, _list, 1);
 /* ignore the error */
-if (ret) {
-trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
-} else {
-trace_qemu_rdma_advise_mr(name, len, addr, "successed");
-}
+trace_qemu_rdma_advise_mr(name, len, addr, strerror(ret));
 #endif
 }
 
@@ -1183,7 +1185,12 @@ static int qemu_rdma_reg_whole_ram_blocks(RDMAContext 
*rdma)
 local->block[i].local_host_addr,
 local->block[i].length, access
 );
-
+/*
+ * ibv_reg_mr() is not documented to set errno.  If it does,
+ * it's somebody else's doc bug.  If it doesn't, the use of
+ * errno below is wrong.
+ * TODO Find out whether ibv_reg_mr() sets errno.
+ */
 if (!local->block[i].mr &&
 errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
 access |= IBV_ACCESS_ON_DEMAND;
@@ -1291,6 +1298,12 @@ static int qemu_rdma_register_and_get_keys(RDMAContext 
*rdma,
 trace_qemu_rdma_register_and_get_keys(len, chunk_start);
 
 block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+/*
+ * ibv_reg_mr() is not documented to set errno.  If it does,
+ * it's somebody else's doc bug.  If it doesn't, the use of
+ * errno below is wrong.
+ * TODO Find out whether ibv_reg_mr() sets errno.
+ */
 if (!block->pmr[chunk] &&
 errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
 access |= IBV_ACCESS_ON_DEMAND;
@@ -1408,6 +1421,11 @@ static int qemu_rdma_unregister_waiting(RDMAContext 
*rdma)
 block->remote_keys[chunk] = 0;
 
 if (ret != 0) {
+/*
+ * FIXME perror() is problematic, bcause ibv_dereg_mr() is
+ * not documented to set errno.  Will go away later in
+ * this series.
+ */
 perror("unregistration chunk failed");
 return -ret;
 }
@@ -1658,6 +1676,11 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
 
 ret = ibv_get_cq_event(ch, , _ctx);
 if (ret) {
+/*
+ * FIXME perror() is problematic, because ibv_reg_mr() is
+ * not documented to set errno.  Will go away later in
+ * this series.
+ */
 perror("ibv_get_cq_event");
 goto err_block_for_wrid;
 }
@@ -2199,6 +,11 @@ retry:
 goto retry;
 
 } else if (ret > 0) {
+/*
+ * FIXME perror() is problematic, because whether
+ * ibv_post_send() sets errno is unclear.  Will go away later
+ * in this series.
+