Polling by MSI isn't necessarily faster than polling by SEV. Tests on
hi1620 show that hns3 100G NIC network throughput improves from 25G to
27G when MSI polling is disabled while running 16 netperf threads
sending UDP packets of size 32KB. TX throughput improves from 7G to
7.7G for a single thread.
The throughput improves because the latency of polling for CMD_SYNC
completion becomes smaller. After sending a CMD_SYNC in an empty
command queue, we typically need to wait 280ns when using MSI polling,
but only around 190ns after disabling MSI polling.
This patch provides a command line option (the disable_msipolling
module parameter) so that users can decide whether or not to use MSI
polling based on their own tests.

Signed-off-by: Barry Song <song.bao....@hisilicon.com>
---
 -v4: add ARM_SMMU_OPT_MSIPOLL flag in response to Robin's comment

 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 5b40d535a7c8..7332251dd8cd 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -418,6 +418,11 @@ module_param(disable_bypass, bool, 0444);
 MODULE_PARM_DESC(disable_bypass,
        "Disable bypass streams such that incoming transactions from devices 
that are not attached to an iommu domain will report an abort back to the 
device and will not be allowed to pass through the SMMU.");
 
+static bool disable_msipolling;
+module_param(disable_msipolling, bool, 0444);
+MODULE_PARM_DESC(disable_msipolling,
+       "Disable MSI-based polling for CMD_SYNC completion.");
+
 enum pri_resp {
        PRI_RESP_DENY = 0,
        PRI_RESP_FAIL = 1,
@@ -652,6 +657,7 @@ struct arm_smmu_device {
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH     (1 << 0)
 #define ARM_SMMU_OPT_PAGE0_REGS_ONLY   (1 << 1)
+#define ARM_SMMU_OPT_MSIPOLL           (1 << 2)
        u32                             options;
 
        struct arm_smmu_cmdq            cmdq;
@@ -992,8 +998,7 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
         * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
         * payload, so the write will zero the entire command on that platform.
         */
-       if (smmu->features & ARM_SMMU_FEAT_MSI &&
-           smmu->features & ARM_SMMU_FEAT_COHERENCY) {
+       if (smmu->options & ARM_SMMU_OPT_MSIPOLL) {
                ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
                                   q->ent_dwords * 8;
        }
@@ -1332,8 +1337,7 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
 static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
                                         struct arm_smmu_ll_queue *llq)
 {
-       if (smmu->features & ARM_SMMU_FEAT_MSI &&
-           smmu->features & ARM_SMMU_FEAT_COHERENCY)
+       if (smmu->options & ARM_SMMU_OPT_MSIPOLL)
                return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
 
        return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
@@ -3741,8 +3745,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
        if (reg & IDR0_SEV)
                smmu->features |= ARM_SMMU_FEAT_SEV;
 
-       if (reg & IDR0_MSI)
+       if (reg & IDR0_MSI) {
                smmu->features |= ARM_SMMU_FEAT_MSI;
+               if (coherent && !disable_msipolling)
+                       smmu->options |= ARM_SMMU_OPT_MSIPOLL;
+       }
 
        if (reg & IDR0_HYP)
                smmu->features |= ARM_SMMU_FEAT_HYP;
-- 
2.27.0


_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Reply via email to