On 13/11/2020 16:33, Christophe Lombard wrote:
OpenCAPI 4.0/5.0 with TLBI/SLBI Snooping, is not used due to performance problems caused by the PAU having to process all incoming TLBI/SLBI commands which will cause them to back up on the PowerBus. When the Address Translation Mode requires TLB and SLB Invalidate operations to be initiated using MMIO registers, a set of registers like the following is used: • XTS MMIO ATSD0 LPARID register • XTS MMIO ATSD0 AVA register • XTS MMIO ATSD0 launch register, write access initiates a shoot down • XTS MMIO ATSD0 status register The MMIO based mechanism also blocks the NPU/PAU from snooping TLBIE commands from the PowerBus. The Shootdown commands (ATSD) will be generated using MMIO registers in the NPU/PAU and sent to the device. Signed-off-by: Christophe Lombard <clomb...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/pnv-ocxl.h | 2 + arch/powerpc/platforms/powernv/ocxl.c | 19 +++ drivers/misc/ocxl/link.c | 180 ++++++++++++++++++++++---- drivers/misc/ocxl/ocxl_internal.h | 46 ++++++- drivers/misc/ocxl/trace.h | 125 ++++++++++++++++++ 5 files changed, 348 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h index d37ededca3ee..4a23abcc347b 100644 --- a/arch/powerpc/include/asm/pnv-ocxl.h +++ b/arch/powerpc/include/asm/pnv-ocxl.h @@ -28,4 +28,6 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **p void pnv_ocxl_spa_release(void *platform_data); int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle); +extern int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid, + uint64_t lpcr);
"extern" is useless
#endif /* _ASM_PNV_OCXL_H */ diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c index ecdad219d704..100546ea635f 100644 --- a/arch/powerpc/platforms/powernv/ocxl.c +++ b/arch/powerpc/platforms/powernv/ocxl.c @@ -483,3 +483,22 @@ int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle) return rc; } EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache); + +int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid, + uint64_t lpcr) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + u32 bdfn; + int rc; + + bdfn = (dev->bus->number << 8) | dev->devfn;
I was told a bit too late that pci_dev_id() exists, so we should probably use it from now on.
+ rc = opal_npu_map_lpar(phb->opal_id, bdfn, lparid, lpcr); + if (rc) { + dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar); diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index fd73d3bc0eb6..9b5b77d40734 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c
Overall, there are many changes in that file, and it would help the review if it could be broken up into a set of smaller patches.
@@ -4,6 +4,8 @@ #include <linux/mutex.h> #include <linux/mm_types.h> #include <linux/mmu_context.h> +#include <linux/mm.h> +#include <linux/mmu_notifier.h> #include <asm/copro.h> #include <asm/pnv-ocxl.h> #include <asm/xive.h> @@ -33,6 +35,31 @@ #define SPA_PE_VALID 0x80000000 +struct spa; + +/* + * A opencapi link can be used be by several PCI functions. We have + * one link per device slot. + * + * A linked list of opencapi links should suffice, as there's a + * limited number of opencapi slots on a system and lookup is only + * done when the device is probed + */ +struct ocxl_link { + struct list_head list; + struct kref ref; + int domain; + int bus; + int dev; + u64 mmio_atsd; /* ATSD physical address */ + void __iomem *base; /* ATSD register virtual address */ + spinlock_t atsd_lock; // to serialize shootdowns + atomic_t irq_available; + struct spa *spa; + void *platform_data; +}; +static struct list_head links_list = LIST_HEAD_INIT(links_list); +static DEFINE_MUTEX(links_list_lock); struct pe_data { struct mm_struct *mm; @@ -41,6 +68,8 @@ struct pe_data { /* opaque pointer to be passed to the above callback */ void *xsl_err_data; struct rcu_head rcu; + struct ocxl_link *link; + struct mmu_notifier mmu_notifier; }; struct spa { @@ -69,27 +98,6 @@ struct spa { } xsl_fault; }; -/* - * A opencapi link can be used be by several PCI functions. We have - * one link per device slot. 
- * - * A linked list of opencapi links should suffice, as there's a - * limited number of opencapi slots on a system and lookup is only - * done when the device is probed - */ -struct ocxl_link { - struct list_head list; - struct kref ref; - int domain; - int bus; - int dev; - atomic_t irq_available; - struct spa *spa; - void *platform_data; -}; -static struct list_head links_list = LIST_HEAD_INIT(links_list); -static DEFINE_MUTEX(links_list_lock); - enum xsl_response { CONTINUE, ADDRESS_ERROR, @@ -126,6 +134,8 @@ static void ack_irq(struct spa *spa, enum xsl_response r) } } +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops; + static void xsl_fault_handler_bh(struct work_struct *fault_work) { vm_fault_t flt = 0; @@ -376,6 +386,7 @@ static void free_spa(struct ocxl_link *link) static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link) { + struct pci_controller *hose = pci_bus_to_host(dev->bus); struct ocxl_link *link; int rc; @@ -403,6 +414,22 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l if (rc) goto err_xsl_irq; + /* Since OpenCAPI 5.0, Address Translation Mode requires TLB + * and SLB Invalidate operations to be initiated using MMIO + * registers + */ + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", + 0, &link->mmio_atsd)) { + dev_info(&dev->dev, "No available ATSD found\n"); + } + if (link->mmio_atsd) { + link->base = ioremap(link->mmio_atsd, 24); + if (!link->base) + dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", link->mmio_atsd); + else + pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0); + }
To stay coherent with how we've done things like interrupts, the device tree parsing and ioremap() for the shootdown page should be done in a platform-specific file (powernv). Also, could we get a more descriptive name than link->base?
+ *out_link = link; return 0; @@ -464,12 +491,101 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle) { struct ocxl_link *link = (struct ocxl_link *) link_handle; + if (link->base) { + iounmap(link->base); + link->base = NULL; + }
What's allocated in alloc_link(), i.e. the shootdown page, should be released in release_xsl() and not in ocxl_link_release() (it's too early in case of several functions).
mutex_lock(&links_list_lock); kref_put(&link->ref, release_xsl); mutex_unlock(&links_list_lock); } EXPORT_SYMBOL_GPL(ocxl_link_release); +static void tlb_invalidate(struct ocxl_link *link, + unsigned long pid, + unsigned long addr) +{ + unsigned long timeout = jiffies + (HZ * OCXL_ATSD_TIMEOUT); + uint64_t val; + int pend; + + if (!link->base) + return; + + spin_lock(&link->atsd_lock);
Looks like atsd_lock is not initialized.
+ if (addr) { + /* load Abbreviated Virtual Address register with + * the necessary value + */ + val = SETFIELD(XTS_ATSD_AVA_AVA, 0ull, addr >> (63-51)); + out_be64(link->base + XTS_ATSD_AVA, val); + eieio();
That eieio() needs a comment.
+ trace_ocxl_mmu_notifier_mmio_atsd_ava(val, pid); + } + + /* Write access initiates a shoot down to initiate the + * TLB Invalidate command + */ + val = XTS_ATSD_LNCH_R;
All that code should be in a platform-specific file.
+ if (addr) { + val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b00); + val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b00); + } else { + val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b10); + val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b01); + val |= XTS_ATSD_LNCH_OCAPI_SINGLETON; + } + val |= XTS_ATSD_LNCH_PRS; + val = SETFIELD(XTS_ATSD_LNCH_AP, val, 0b101); + val = SETFIELD(XTS_ATSD_LNCH_PID, val, pid); + out_be64(link->base + XTS_ATSD_LNCH, val); + trace_ocxl_mmu_notifier_mmio_atsd_lnch(val, addr, pid); + + /* Poll the ATSD status register to determine when the + * TLB Invalidate has been completed. + */ + val = in_be64(link->base + XTS_ATSD_STAT); + pend = val >> 63; + trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid); + + while (pend) { + if (time_after_eq(jiffies, timeout)) { + pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n", + __func__, val, pid); + spin_unlock(&link->atsd_lock); + return; + } + cpu_relax(); + val = in_be64(link->base + XTS_ATSD_STAT); + pend = val >> 63; + } + spin_unlock(&link->atsd_lock); + trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid); +} + +static void invalidate_range_end(struct mmu_notifier *mn, + const struct mmu_notifier_range *range) +{ + struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier); + struct ocxl_link *link = pe_data->link; + struct mm_struct *mm = mn->mm; + unsigned long addr, pid, page_size = PAGE_SIZE; + + pid = mm->context.id; + trace_ocxl_mmu_notifier_range(range->start, range->end, pid); + + for (addr = range->start; addr < range->end; addr += page_size) + tlb_invalidate(link, pid, addr); +} + +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = { + /* invalidate_range_end() is called when all pages in the + * range have been unmapped and the pages have been freed by + * the VM + */ + .invalidate_range_end = invalidate_range_end, +}; +
See comments in mmu_notifier.h. Looks like we shouldn't be using invalidate_range_end(); it's too late.
Did we test abrupt death of the host process? Do we need to also use .release()?
static u64 calculate_cfg_state(bool kernel) { u64 state; @@ -517,7 +633,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, goto unlock; } - pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL); + pe_data = kzalloc(sizeof(*pe_data), GFP_KERNEL); if (!pe_data) { rc = -ENOMEM; goto unlock; @@ -526,9 +642,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, pe_data->mm = mm; pe_data->xsl_err_cb = xsl_err_cb; pe_data->xsl_err_data = xsl_err_data; + pe_data->link = link; + pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops; memset(pe, 0, sizeof(struct ocxl_process_element)); pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0)); + pe->pasid = cpu_to_be32(pasid << (31 - 19)); + pe->bdf = cpu_to_be32(1 << (31 - 15));
Why do we hard-code BDF 1?
pe->lpid = cpu_to_be32(mfspr(SPRN_LPID)); pe->pid = cpu_to_be32(pidr); pe->tid = cpu_to_be32(tidr); @@ -540,8 +660,17 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, * by the nest MMU. If we have a kernel context, TLBIs are * already global. */ - if (mm) + if (mm) { mm_context_add_copro(mm); + if (link->base) { + /* Use MMIO registers for the TLB and SLB + * Invalidate operations. + */ + trace_init_mmu_notifier(pasid, mm->context.id); + mmu_notifier_register(&pe_data->mmu_notifier, mm); + } + } + /* * Barrier is to make sure PE is visible in the SPA before it * is used by the device. It also helps with the global TLBI @@ -672,6 +801,11 @@ int ocxl_link_remove_pe(void *link_handle, int pasid) WARN(1, "Couldn't find pe data when removing PE\n"); } else { if (pe_data->mm) { + if (link->base) { + trace_release_mmu_notifier(pasid, pe_data->mm->context.id); + mmu_notifier_unregister(&pe_data->mmu_notifier, pe_data->mm); + tlb_invalidate(link, pe_data->mm->context.id, 0ull); + } mm_context_remove_copro(pe_data->mm); mmdrop(pe_data->mm); } diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index 0bad0a123af6..35d8be3cd270 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -8,6 +8,48 @@ #include <linux/list.h> #include <misc/ocxl.h> +/* Find left shift from first set bit in mask */ +#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1) + +/* Set field fname of oval to fval + * NOTE: oval isn't modified, the combined result is returned + */ +#define SETFIELD(m, v, val) \ + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m))) + +#define OCXL_ATSD_TIMEOUT 1 + +/* 5.9.3.3 TLB Management Instructions - PowerISA tags workbook */ +#define XTS_ATSD_LNCH 0x00 +#define XTS_ATSD_LNCH_R PPC_BIT(0) /* Radix Invalidate */ +#define XTS_ATSD_LNCH_RIC PPC_BITMASK(1,2) /* Radix Invalidation Control + * 0b00 Just invalidate TLB. + * 0b01 Invalidate just Page Walk Cache. 
+ * 0b10 Invalidate TLB, Page Walk Cache, and any + * caching of Partition and Process Table Entries. + */ +#define XTS_ATSD_LNCH_LP PPC_BITMASK(3, 10) /* Number and Page Size of translations to be invalidated (HPT only ?) */ +#define XTS_ATSD_LNCH_IS PPC_BITMASK(11, 12) /* Invalidation Criteria + * 0b00 Invalidate just the target VA. + * 0b01 Invalidate matching PID. + */ +#define XTS_ATSD_LNCH_PRS PPC_BIT(13) /* 0b1: Process Scope, 0b0: Partition Scope */ +#define XTS_ATSD_LNCH_B PPC_BIT(14) /* Invalidation Flag */ +#define XTS_ATSD_LNCH_AP PPC_BITMASK(15, 17) /* Actual Page Size to be invalidated + * 000 4KB + * 101 64KB + * 001 2MB + * 010 1GB + */ +#define XTS_ATSD_LNCH_L PPC_BIT(18) /* Defines the large page select (L=0b0 for 4KB pages, L=0b1 for large pages) */ +#define XTS_ATSD_LNCH_PID PPC_BITMASK(19, 38) /* Process ID */ +#define XTS_ATSD_LNCH_F PPC_BIT(39) /* NoFlush – Assumed to be 0b0 */ +#define XTS_ATSD_LNCH_OCAPI_SLBI PPC_BIT(40) +#define XTS_ATSD_LNCH_OCAPI_SINGLETON PPC_BIT(41) +#define XTS_ATSD_AVA 0x08 +#define XTS_ATSD_AVA_AVA PPC_BITMASK(0, 51) /* au lieu de 35*/ +#define XTS_ATSD_STAT 0x10 + #define MAX_IRQ_PER_LINK 2000 #define MAX_IRQ_PER_CONTEXT MAX_IRQ_PER_LINK @@ -84,7 +126,9 @@ struct ocxl_context { struct ocxl_process_element { __be64 config_state; - __be32 reserved1[11]; + __be32 pasid; + __be32 bdf; + __be32 reserved1[9]; __be32 lpid; __be32 tid; __be32 pid; diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h index 17e21cb2addd..6171069d071a 100644 --- a/drivers/misc/ocxl/trace.h +++ b/drivers/misc/ocxl/trace.h
I haven't looked at this yet. It should probably be in a separate patch. Fred
@@ -8,6 +8,131 @@ #include <linux/tracepoint.h> + +TRACE_EVENT(ocxl_mmu_notifier_range, + TP_PROTO(unsigned long start, unsigned long end, unsigned long pidr), + TP_ARGS(start, end, pidr), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->pidr = pidr; + ), + + TP_printk("start=0x%lx end=0x%lx pidr=0x%lx", + __entry->start, + __entry->end, + __entry->pidr + ) +); + +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_ava, + TP_PROTO(u64 val, unsigned long pidr), + TP_ARGS(val, pidr), + + TP_STRUCT__entry( + __field(u64, val) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->val = val; + __entry->pidr = pidr; + ), + + TP_printk("ATSD AVA: 0x%llx pidr=0x%lx", + __entry->val, __entry->pidr + ) +); + +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_lnch, + TP_PROTO(u64 val, unsigned long addr, unsigned long pidr), + TP_ARGS(val, addr, pidr), + + TP_STRUCT__entry( + __field(u64, val) + __field(unsigned long, addr) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->val = val; + __entry->addr = addr; + __entry->pidr = pidr; + ), + + TP_printk("ATSD LNCH: 0x%llx addr=0x%lx pidr=0x%lx", + __entry->val, __entry->addr, __entry->pidr + ) +); + +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_stat, + TP_PROTO(u64 val, unsigned long addr, unsigned long pidr), + TP_ARGS(val, addr, pidr), + + TP_STRUCT__entry( + __field(u64, val) + __field(unsigned long, addr) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->val = val; + __entry->addr = addr; + __entry->pidr = pidr; + ), + + TP_printk("ATSD STAT: 0x%llx addr=0x%lx pidr=0x%lx", + __entry->val, __entry->addr, __entry->pidr + ) +); + +TRACE_EVENT(init_mmu_notifier, + TP_PROTO(int pasid, unsigned long pidr), + TP_ARGS(pasid, pidr), + + TP_STRUCT__entry( + __field(int, pasid) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->pasid = pasid; + 
__entry->pidr = pidr; + ), + + TP_printk("pasid=%d, pidr=0x%lx", + __entry->pasid, + __entry->pidr + ) +); + +TRACE_EVENT(release_mmu_notifier, + TP_PROTO(int pasid, unsigned long pidr), + TP_ARGS(pasid, pidr), + + TP_STRUCT__entry( + __field(int, pasid) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->pasid = pasid; + __entry->pidr = pidr; + ), + + TP_printk("pasid=%d, pidr=0x%lx", + __entry->pasid, + __entry->pidr + ) +); + DECLARE_EVENT_CLASS(ocxl_context, TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr), TP_ARGS(pid, spa, pasid, pidr, tidr),