From: Jiong Wang <jiong.w...@netronome.com>

For NFP, we want to re-group a sequence of load/store pairs lowered from
memcpy/memmove into a single bulk memory operation, which can then be
accelerated using the NFP CPP bus.

This patch extends the existing load/store auxiliary information by adding
two new fields:

        struct bpf_insn *paired_st;
        s16 ldst_gather_len;

Both fields are supposed to be carried by the load instruction at the
head of the sequence. "paired_st" is the corresponding store instruction at
the head and "ldst_gather_len" is the gathered length.

If "ldst_gather_len" is negative, then the sequence is doing memory
load/store in descending order, otherwise it is in ascending order. We need
this information to detect overlapped memory access.

This patch then optimizes bulk memory copy when the copy length is within
32 bytes.

The strategy of read/write used is:

  * Read.
    Use read32 (direct_ref), always.

  * Write.
    - length <= 8-bytes
      write8 (direct_ref).
    - length <= 32-bytes and is 4-byte aligned
      write32 (direct_ref).
    - length <= 32-bytes but is not 4-byte aligned
      write8 (indirect_ref).

NOTE: the optimization should not change program semantics. The destination
register of the last load instruction should contain the same value before
and after this optimization.

Signed-off-by: Jiong Wang <jiong.w...@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicin...@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 113 ++++++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/bpf/main.h |   4 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.c  |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |   4 +
 4 files changed, 122 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index da4e106d3b16..138568c0eee6 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -154,6 +154,13 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
        emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false);
 }
 
+static void
+emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
+              swreg lreg, swreg rreg, u8 size, bool sync)
+{
+       emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, true);
+}
+
 static void
 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
          enum br_ctx_signal_state css, u16 addr, u8 defer)
@@ -515,6 +522,109 @@ static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
        wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
 }
 
+/* wrp_reg_subpart() - take @field_len bytes of @src, starting @offset bytes
+ * above its low end (right shift by @offset * 8), into the low end of @dst.
+ */
+static void
+wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
+               u8 offset)
+{
+       enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
+       u8 mask = (1 << field_len) - 1;
+
+       emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
+}
+
+/* NFP has Command Push Pull bus which supports bulk memory operations. */
+static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta 
*meta)
+{
+       bool descending_seq = meta->ldst_gather_len < 0;
+       s16 len = abs(meta->ldst_gather_len);
+       swreg src_base, off;
+       unsigned int i;
+       u8 xfer_num;
+
+       if (WARN_ON_ONCE(len > 32))
+               return -EOPNOTSUPP;
+
+       off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
+       src_base = reg_a(meta->insn.src_reg * 2);
+       xfer_num = round_up(len, 4) / 4;
+
+       /* Memory read from source addr into transfer-in registers. */
+       emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off,
+                xfer_num - 1, true);
+
+       /* Move from transfer-in to transfer-out. */
+       for (i = 0; i < xfer_num; i++)
+               wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
+
+       off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
+
+       if (len <= 8) {
+               /* Use single direct_ref write8. */
+               emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
+                        reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
+                        true);
+       } else if (IS_ALIGNED(len, 4)) {
+               /* Use single direct_ref write32. */
+               emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
+                        reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
+                        true);
+       } else {
+               /* Use single indirect_ref write8. */
+               wrp_immed(nfp_prog, reg_none(),
+                         CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
+               emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
+                              reg_a(meta->paired_st->dst_reg * 2), off,
+                              len - 1, true);
+       }
+
+       /* TODO: The extra load below keeps the data flow identical before and
+        *  after the memory copy optimization: the destination register of
+        *  the load is not guaranteed to be dead, so it must end up with the
+        *  same value as before this transformation. These loads could be
+        *  removed once we have accurate register usage information.
+        *
+        *  NOTE(review): BPF_W below reads reg_xfer(0) while the other sizes
+        *  read reg_xfer(xfer_num); confirm this is intended for ascending
+        *  sequences.
+        */
+       if (descending_seq)
+               xfer_num = 0;
+       else if (BPF_SIZE(meta->insn.code) != BPF_DW)
+               xfer_num = xfer_num - 1;
+       else
+               xfer_num = xfer_num - 2;
+
+       switch (BPF_SIZE(meta->insn.code)) {
+       case BPF_B:
+               wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
+                               reg_xfer(xfer_num), 1,
+                               IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
+               break;
+       case BPF_H:
+               wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
+                               reg_xfer(xfer_num), 2, (len & 3) ^ 2);
+               break;
+       case BPF_W:
+               wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
+                       reg_xfer(0));
+               break;
+       case BPF_DW:
+               wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
+                       reg_xfer(xfer_num));
+               wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
+                       reg_xfer(xfer_num + 1));
+               break;
+       }
+
+       if (BPF_SIZE(meta->insn.code) != BPF_DW)
+               wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
+
+       return 0;
+}
+
 static int
 data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
 {
@@ -1490,6 +1600,9 @@ static int
 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
        unsigned int size)
 {
+       if (meta->ldst_gather_len)
+               return nfp_cpp_memcpy(nfp_prog, meta);
+
        if (meta->ptr.type == PTR_TO_CTX) {
                if (nfp_prog->type == BPF_PROG_TYPE_XDP)
                        return mem_ldx_xdp(nfp_prog, meta, size);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 20ef0adb2931..5884291ddba5 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -95,6 +95,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
  * struct nfp_insn_meta - BPF instruction wrapper
  * @insn: BPF instruction
  * @ptr: pointer type for memory operations
+ * @ldst_gather_len: memcpy length gathered from load/store sequence
+ * @paired_st: the paired store insn at the head of the sequence
  * @ptr_not_const: pointer is not always constant
  * @jmp_dst: destination info for jump instructions
  * @off: index of first generated machine instruction (in nfp_prog.prog)
@@ -109,6 +111,8 @@ struct nfp_insn_meta {
        union {
                struct {
                        struct bpf_reg_state ptr;
+                       struct bpf_insn *paired_st;
+                       s16 ldst_gather_len;
                        bool ptr_not_const;
                };
                struct nfp_insn_meta *jmp_dst;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c 
b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
index da277386077c..d3610987fb07 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
@@ -41,6 +41,7 @@
 
 const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
        [CMD_TGT_WRITE8_SWAP] =         { 0x02, 0x42 },
+       [CMD_TGT_WRITE32_SWAP] =        { 0x02, 0x5f },
        [CMD_TGT_READ8] =               { 0x01, 0x43 },
        [CMD_TGT_READ32] =              { 0x00, 0x5c },
        [CMD_TGT_READ32_LE] =           { 0x01, 0x5c },
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h 
b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index 6ff842a15e5d..98803f9f40b6 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -220,6 +220,7 @@ struct cmd_tgt_act {
 enum cmd_tgt_map {
        CMD_TGT_READ8,
        CMD_TGT_WRITE8_SWAP,
+       CMD_TGT_WRITE32_SWAP,
        CMD_TGT_READ32,
        CMD_TGT_READ32_LE,
        CMD_TGT_READ32_SWAP,
@@ -241,6 +242,9 @@ enum cmd_ctx_swap {
        CMD_CTX_NO_SWAP = 3,
 };
 
+#define CMD_OVE_LEN    BIT(7)
+#define CMD_OV_LEN     GENMASK(12, 8)
+
 #define OP_LCSR_BASE           0x0fc00000000ULL
 #define OP_LCSR_A_SRC          0x000000003ffULL
 #define OP_LCSR_B_SRC          0x000000ffc00ULL
-- 
2.15.0

Reply via email to