From: Omer Shpigelman <oshpigel...@habana.ai>

Add Queue Pair (QP) error notification to the user, e.g. security violation,
too many retransmissions, invalid QP, etc.

Whenever a QP causes an error, the firmware sends an event to the driver,
which pushes the error as an error entry to the Completion Queue (if one
exists).

Signed-off-by: Omer Shpigelman <oshpigel...@habana.ai>
Reviewed-by: Oded Gabbay <oded.gab...@gmail.com>
Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
---
 drivers/misc/habanalabs/gaudi/gaudi.c     | 13 ++++
 drivers/misc/habanalabs/gaudi/gaudiP.h    |  1 +
 drivers/misc/habanalabs/gaudi/gaudi_nic.c | 95 +++++++++++++++++++++++
 3 files changed, 109 insertions(+)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4602e4780651..71c9e2d18032 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6660,6 +6660,19 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
                hl_fw_unmask_irq(hdev, event_type);
                break;
 
+       case GAUDI_EVENT_NIC0_QP0:
+       case GAUDI_EVENT_NIC0_QP1:
+       case GAUDI_EVENT_NIC1_QP0:
+       case GAUDI_EVENT_NIC1_QP1:
+       case GAUDI_EVENT_NIC2_QP0:
+       case GAUDI_EVENT_NIC2_QP1:
+       case GAUDI_EVENT_NIC3_QP0:
+       case GAUDI_EVENT_NIC3_QP1:
+       case GAUDI_EVENT_NIC4_QP0:
+       case GAUDI_EVENT_NIC4_QP1:
+               gaudi_nic_handle_qp_err(hdev, event_type);
+               break;
+
        case GAUDI_EVENT_PSOC_GPIO_U16_0:
                cause = le64_to_cpu(eq_entry->data[0]) & 0xFF;
                dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 3158d5d68c1d..7d7439da88bc 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -576,5 +576,6 @@ netdev_tx_t gaudi_nic_handle_tx_pkt(struct gaudi_nic_device *gaudi_nic,
                                        struct sk_buff *skb);
 int gaudi_nic_sw_init(struct hl_device *hdev);
 void gaudi_nic_sw_fini(struct hl_device *hdev);
+void gaudi_nic_handle_qp_err(struct hl_device *hdev, u16 event_type);
 
 #endif /* GAUDIP_H_ */
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_nic.c b/drivers/misc/habanalabs/gaudi/gaudi_nic.c
index 37f25247f751..49e94e9c786a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_nic.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_nic.c
@@ -3988,3 +3988,98 @@ int gaudi_nic_cq_mmap(struct hl_device *hdev, struct vm_area_struct *vma)
 
        return rc;
 }
+
+/**
+ * get_syndrome_text() - translate a QP error syndrome code to text
+ * @syndrome: syndrome code to translate
+ *
+ * Used when logging a QP error. The result points to a string literal,
+ * hence the const qualifier: it may be printed but must not be modified
+ * or freed.
+ *
+ * A code without an entry below is not an error for this helper, it simply
+ * maps to a generic fallback string.
+ *
+ * Return: description of the syndrome, or "unknown syndrome".
+ */
+static const char *get_syndrome_text(u32 syndrome)
+{
+       switch (syndrome) {
+       /* 0x0X codes: checks that failed on the Rx side */
+       case 0x05:
+               return "Rx got invalid QP";
+       case 0x06:
+               return "Rx transport service mismatch";
+       case 0x09:
+               return "Rx Rkey check failed";
+       /* 0x4X codes: retry, doorbell, Tx and responder failures */
+       case 0x40:
+               return "timer retry exceeded";
+       case 0x41:
+               return "NACK retry exceeded";
+       case 0x42:
+               return "doorbell on invalid QP";
+       case 0x43:
+               return "doorbell security check failed";
+       case 0x44:
+               return "Tx got invalid QP";
+       case 0x45:
+               return "responder got ACK/NACK on invalid QP";
+       case 0x46:
+               return "responder try to send ACK/NACK on invalid QP";
+       default:
+               return "unknown syndrome";
+       }
+}
+
+/*
+ * Drain the QP error FIFO of the NIC port selected by @event_type (its offset
+ * from GAUDI_EVENT_NIC0_QP0): log each entry and, if the CQ is enabled, push
+ * it as an error CQE. cqe_sw is zeroed so unset fields carry no stack data.
+ */
+void gaudi_nic_handle_qp_err(struct hl_device *hdev, u16 event_type)
+{
+       struct gaudi_device *gaudi = hdev->asic_specific;
+       struct gaudi_nic_device *gaudi_nic =
+               &gaudi->nic_devices[event_type - GAUDI_EVENT_NIC0_QP0];
+       struct qp_err *qp_err_arr = gaudi_nic->qp_err_mem_cpu;
+       struct hl_nic_cqe cqe_sw = { .is_err = true, .port = gaudi_nic->port };
+       u32 pi, ci;
+
+       mutex_lock(&gaudi->nic_qp_err_lock);
+
+       if (!gaudi->nic_cq_enable)
+               dev_err_ratelimited(hdev->dev,
+                       "received NIC %d QP error event %d but no CQ to push it\n",
+                       gaudi_nic->port, event_type);
+
+       pi = NIC_RREG32(mmNIC0_QPC0_ERR_FIFO_PRODUCER_INDEX);
+       ci = gaudi_nic->qp_err_ci;
+
+       /* ci wraps around, so "ci < pi" would stall once pi has wrapped too */
+       while (ci != pi) {
+               cqe_sw.type = QP_ERR_IS_REQ(qp_err_arr[ci]) ?
+                               HL_NIC_CQE_TYPE_REQ : HL_NIC_CQE_TYPE_RES;
+               cqe_sw.qp_number = QP_ERR_QP_NUM(qp_err_arr[ci]);
+               cqe_sw.qp_err.syndrome = QP_ERR_ERR_NUM(qp_err_arr[ci]);
+               ci = (ci + 1) & (QP_ERR_BUF_LEN - 1);
+
+               dev_err_ratelimited(hdev->dev,
+                       "NIC QP error port: %d, type: %d, qpn: %d, syndrome: %s (0x%x)\n",
+                       cqe_sw.port, cqe_sw.type, cqe_sw.qp_number,
+                       get_syndrome_text(cqe_sw.qp_err.syndrome),
+                       cqe_sw.qp_err.syndrome);
+
+               if (gaudi->nic_cq_enable)
+                       copy_cqe_to_main_queue(hdev, &cqe_sw);
+       }
+
+       gaudi_nic->qp_err_ci = ci;
+       NIC_WREG32(mmNIC0_QPC0_ERR_FIFO_CONSUMER_INDEX, ci);
+
+       /* signal the completion queue that there are available CQEs */
+       if (gaudi->nic_cq_enable)
+               complete(&gaudi->nic_cq_comp);
+
+       mutex_unlock(&gaudi->nic_qp_err_lock);
+}
-- 
2.17.1

Reply via email to