From: Aya Levin <a...@mellanox.com>

Add support for reporting and recovering from an error completion
(error CQE) on an RQ by setting the queue back to the ready state.
Only errors whose syndrome indicates that the RQ may have entered the
error state, and can therefore be recovered, are handled.

Signed-off-by: Aya Levin <a...@mellanox.com>
Reviewed-by: Tariq Toukan <tar...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
Signed-off-by: Saeed Mahameed <sae...@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  3 +
 .../ethernet/mellanox/mlx5/core/en/health.h   |  9 +++
 .../mellanox/mlx5/core/en/reporter_rx.c       | 66 +++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  9 +++
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 11 ++++
 5 files changed, 98 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 5f2a1d14de68..822f7b620640 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -295,6 +295,7 @@ struct mlx5e_dcbx_dp {
 
 enum {
        MLX5E_RQ_STATE_ENABLED,
+       MLX5E_RQ_STATE_RECOVERING,
        MLX5E_RQ_STATE_AM,
        MLX5E_RQ_STATE_NO_CSUM_COMPLETE,
        MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */
@@ -667,6 +668,8 @@ struct mlx5e_rq {
        struct zero_copy_allocator zca;
        struct xdp_umem       *umem;
 
+       struct work_struct     recover_work;
+
        /* control */
        struct mlx5_wq_ctrl    wq_ctrl;
        __be32                 mkey_be;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
index 52e9ca37cf46..d3693fa547ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h
@@ -8,6 +8,14 @@
 
 #define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)
 
+static inline bool cqe_syndrome_needs_recover(u8 syndrome)
+{
+       return syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR ||
+              syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR ||
+              syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR ||
+              syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+}
+
 int mlx5e_reporter_tx_create(struct mlx5e_priv *priv);
 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv);
 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq);
@@ -21,6 +29,7 @@ int mlx5e_reporter_named_obj_nest_end(struct devlink_fmsg *fmsg);
 int mlx5e_reporter_rx_create(struct mlx5e_priv *priv);
 void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv);
 void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq);
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq);
 void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq);
 
 #define MLX5E_REPORTER_PER_Q_MAX_LEN 256
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index 4f5547ac4bee..b4f7e535dbc7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -111,6 +111,72 @@ void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq)
        mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
 }
 
+static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
+{
+       struct net_device *dev = rq->netdev;
+       int err;
+
+       err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST);
+       if (err) {
+               netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn);
+               return err;
+       }
+       err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+       if (err) {
+               netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn);
+               return err;
+       }
+
+       return 0;
+}
+
+static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx)
+{
+       struct mlx5e_rq *rq = ctx;
+       struct mlx5_core_dev *mdev = rq->mdev;
+       struct net_device *dev = rq->netdev;
+       u8 state;
+       int err;
+
+       err = mlx5e_query_rq_state(mdev, rq->rqn, &state);
+       if (err) {
+               netdev_err(dev, "Failed to query RQ 0x%x state. err = %d\n",
+                          rq->rqn, err);
+               goto out;
+       }
+
+       if (state != MLX5_RQC_STATE_ERR)
+               goto out;
+
+       mlx5e_deactivate_rq(rq);
+       mlx5e_free_rx_descs(rq);
+
+       err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR);
+       if (err)
+               goto out;
+
+       clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
+       mlx5e_activate_rq(rq);
+       rq->stats->recover++;
+       return 0;
+out:
+       clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
+       return err;
+}
+
+void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq)
+{
+       struct mlx5e_priv *priv = rq->channel->priv;
+       char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
+       struct mlx5e_err_ctx err_ctx = {};
+
+       err_ctx.ctx = rq;
+       err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover;
+       sprintf(err_str, "ERR CQE on RQ: 0x%x", rq->rqn);
+
+       mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
+}
+
 static int mlx5e_rx_reporter_timeout_recover(void *ctx)
 {
        struct mlx5e_rq *rq = ctx;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 76845bafd708..77f0c8fad9df 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -362,6 +362,13 @@ static void mlx5e_free_di_list(struct mlx5e_rq *rq)
        kvfree(rq->wqe.di);
 }
 
+static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work)
+{
+       struct mlx5e_rq *rq = container_of(recover_work, struct mlx5e_rq, recover_work);
+
+       mlx5e_reporter_rq_cqe_err(rq);
+}
+
 static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                          struct mlx5e_params *params,
                          struct mlx5e_xsk_param *xsk,
@@ -398,6 +405,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                rq->stats = &c->priv->channel_stats[c->ix].xskrq;
        else
                rq->stats = &c->priv->channel_stats[c->ix].rq;
+       INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work);
 
        rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
        if (IS_ERR(rq->xdp_prog)) {
@@ -907,6 +915,7 @@ void mlx5e_close_rq(struct mlx5e_rq *rq)
 {
        cancel_work_sync(&rq->dim.work);
        cancel_work_sync(&rq->channel->icosq.recover_work);
+       cancel_work_sync(&rq->recover_work);
        mlx5e_destroy_rq(rq);
        mlx5e_free_rx_descs(rq);
        mlx5e_free_rq(rq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 43d790b7d4ec..2fd2760d0bb7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1130,6 +1130,15 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
        return skb;
 }
 
+static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+       struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
+
+       if (cqe_syndrome_needs_recover(err_cqe->syndrome) &&
+           !test_and_set_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state))
+               queue_work(rq->channel->priv->wq, &rq->recover_work);
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
        struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1143,6 +1152,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
        if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+               trigger_report(rq, cqe);
                rq->stats->wqe_err++;
                goto free_wqe;
        }
@@ -1328,6 +1338,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
        wi->consumed_strides += cstrides;
 
        if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+               trigger_report(rq, cqe);
                rq->stats->wqe_err++;
                goto mpwrq_cqe_out;
        }
-- 
2.21.0

Reply via email to