On 03/09/2016 03:45 PM, wenxi...@linux.vnet.ibm.com wrote:
> From: Wen Xiong <wenxi...@linux.vnet.ibm.com>
>
> Enable MSIX vector mapping to CPU affinity. This feature is
> enabled by setting ipr_cpu_map=1 when loading the ipr module.
>
> Signed-off-by: Wen Xiong <wenxi...@linux.vnet.ibm.com>
> ---
>  drivers/scsi/ipr.c | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/scsi/ipr.h |  16 ++++
>  2 files changed, 213 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
> index bb9ec50..468c690 100644
> --- a/drivers/scsi/ipr.c
> +++ b/drivers/scsi/ipr.c
> @@ -100,6 +100,11 @@ static unsigned int ipr_max_devs = IPR_DEFAULT_SIS64_DEVS;
>  static unsigned int ipr_dual_ioa_raid = 1;
>  static unsigned int ipr_number_of_msix = 16;
>  static unsigned int ipr_fast_reboot;
> +static unsigned int ipr_cpu_map = 1;
> +
> +static unsigned int ipr_used_cpu[NR_CPUS];
> +static unsigned int ipr_possible_cpu_cnt;
> +
>  static DEFINE_SPINLOCK(ipr_driver_lock);
>
>  /* This table describes the differences between DMA controller chips */
> @@ -224,6 +229,8 @@ module_param_named(number_of_msix, ipr_number_of_msix, int, 0);
>  MODULE_PARM_DESC(number_of_msix, "Specify the number of MSIX interrupts to use on capable adapters (1 - 16). (default:16)");
>  module_param_named(fast_reboot, ipr_fast_reboot, int, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(fast_reboot, "Skip adapter shutdown during reboot. Set to 1 to enable. (default: 0)");
> +module_param_named(cpu_map, ipr_cpu_map, int, 0);
> +MODULE_PARM_DESC(cpu_map, "Enable CPU affinity per adapter. (default:1)");
>  MODULE_LICENSE("GPL");
>  MODULE_VERSION(IPR_DRIVER_VERSION);
>
> @@ -1053,6 +1060,17 @@ static void ipr_send_blocking_cmd(struct ipr_cmnd *ipr_cmd,
>  static int ipr_get_hrrq_index(struct ipr_ioa_cfg *ioa_cfg)
>  {
>  	unsigned int hrrq;
> +	struct ipr_vector_map_info *map_per_cpu;
> +	u32 cpu;
> +
> +	if (ioa_cfg->cpu_map && ioa_cfg->hrrq_num > 1) {
> +		cpu = smp_processor_id();
> +		map_per_cpu = ioa_cfg->cpu_map_tbl;
> +		if (cpu < ioa_cfg->possible_cpu_cnt) {
> +			map_per_cpu += cpu;
> +			return map_per_cpu->hrrq_id;
> +		}
> +	}
>
>  	if (ioa_cfg->hrrq_num == 1)
>  		hrrq = 0;
> @@ -4089,6 +4107,68 @@ static struct device_attribute ipr_ioa_fw_type_attr = {
>  	.show = ipr_show_fw_type
>  };
>
> +static ssize_t
> +ipr_show_cpu_map_tbl(struct device *dev, struct device_attribute *attr,
> +	char *buf)
> +{
> +	struct Scsi_Host *shost = class_to_shost(dev);
> +	struct ipr_ioa_cfg *ioa_cfg = (struct ipr_ioa_cfg *)shost->hostdata;
> +	struct ipr_vector_map_info *map_per_cpu;
> +	int len = 0, show_current_cpu = 0;
> +
> +	if (ioa_cfg->intr_flag != IPR_USE_MSIX)
> +		return len;
> +
> +	switch (ioa_cfg->cpu_map) {
> +	case 0:
> +		len += snprintf(buf + len, PAGE_SIZE - len,
> +			"ipr driver cpu map: No mapping (%d)\n",
> +			ioa_cfg->cpu_map);
> +		return len;
> +	case 1:
> +		len += snprintf(buf + len, PAGE_SIZE - len,
> +			"ipr driver cpu map: IRQ vector to CPU mapping (%d): "
> +			"%d online CPUs\n",
> +			ioa_cfg->cpu_map,
> +			num_online_cpus());
> +		break;
> +	}
> +
> +	while (show_current_cpu < ioa_cfg->possible_cpu_cnt) {
> +		map_per_cpu = &ioa_cfg->cpu_map_tbl[show_current_cpu];
> +
> +		if (map_per_cpu->irq_id == IPR_VECTOR_MAP_EMPTY)
> +			len += snprintf(buf + len, PAGE_SIZE - len,
> +				"CPU %02d io_hrrq %02d\n",
> +				show_current_cpu,
> +				map_per_cpu->hrrq_id);
> +		else
> +			len += snprintf(buf + len, PAGE_SIZE - len,
> +				"CPU %02d io_hrrq %02d "
> +				" IRQ=%d\n",
> +				show_current_cpu,
> +				map_per_cpu->hrrq_id,
> +				map_per_cpu->irq_id);
> +
> +		show_current_cpu++;
> +
> +		if (show_current_cpu < ioa_cfg->possible_cpu_cnt
> +			&& (len >= (PAGE_SIZE - 64))) {
> +			len += snprintf(buf + len, PAGE_SIZE - len, "more...\n");
> +			break;
> +		}
> +	}
> +
> +	return len;
> +}
> +
> +static struct device_attribute ipr_cpu_map_attr = {
> +	.attr = {
> +		.name = "cpu_map",
> +		.mode = S_IRUGO | S_IWUSR,

You indicate this attribute can be written to by root, but you don't
implement a store function.
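If writes are intended, a minimal store function could look something
like this (untested sketch, illustrative only -- the name and the 0/1
toggle semantics are my assumption; it only records the policy, since
re-mapping live vectors would need more work). Otherwise, just drop
S_IWUSR from the mode.

	static ssize_t
	ipr_store_cpu_map_tbl(struct device *dev, struct device_attribute *attr,
		const char *buf, size_t count)
	{
		struct Scsi_Host *shost = class_to_shost(dev);
		struct ipr_ioa_cfg *ioa_cfg = (struct ipr_ioa_cfg *)shost->hostdata;
		unsigned int val;

		/* Accept only 0 (no mapping) or 1 (map vectors to CPUs) */
		if (kstrtouint(buf, 0, &val) || val > 1)
			return -EINVAL;

		/* Records the policy only; existing affinity hints are
		 * not redone here and would take effect on a remap. */
		ioa_cfg->cpu_map = val;
		return count;
	}

...and wire it up with .store = ipr_store_cpu_map_tbl.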
> +	},
> +	.show = ipr_show_cpu_map_tbl,
> +};
> +
>  static struct device_attribute *ipr_ioa_attrs[] = {
>  	&ipr_fw_version_attr,
>  	&ipr_log_level_attr,
> @@ -4098,6 +4178,7 @@ static struct device_attribute *ipr_ioa_attrs[] = {
>  	&ipr_update_fw_attr,
>  	&ipr_ioa_fw_type_attr,
>  	&ipr_iopoll_weight_attr,
> +	&ipr_cpu_map_attr,
>  	NULL,
>  };
>
> @@ -9335,6 +9416,7 @@ static void ipr_free_mem(struct ipr_ioa_cfg *ioa_cfg)
>
>  	ipr_free_dump(ioa_cfg);
>  	kfree(ioa_cfg->trace);
> +	kfree(ioa_cfg->cpu_map_tbl);
>  }
>
>  /**
> @@ -9354,9 +9436,13 @@ static void ipr_free_irqs(struct ipr_ioa_cfg *ioa_cfg)
>  	if (ioa_cfg->intr_flag == IPR_USE_MSI ||
>  	    ioa_cfg->intr_flag == IPR_USE_MSIX) {
>  		int i;
> -		for (i = 0; i < ioa_cfg->nvectors; i++)
> +		for (i = 0; i < ioa_cfg->nvectors; i++) {
> +			if (ioa_cfg->cpu_map)
> +				irq_set_affinity_hint(
> +					ioa_cfg->vectors_info[i].vec, NULL);
>  			free_irq(ioa_cfg->vectors_info[i].vec,
>  				&ioa_cfg->hrrq[i]);
> +		}
>  	} else
>  		free_irq(pdev->irq, &ioa_cfg->hrrq[0]);
>
> @@ -9587,11 +9673,25 @@ static int ipr_alloc_mem(struct ipr_ioa_cfg *ioa_cfg)
>  	if (!ioa_cfg->trace)
>  		goto out_free_hostrcb_dma;
>
> +	if (ioa_cfg->cpu_map) {
> +		ioa_cfg->cpu_map_tbl =
> +			kzalloc(sizeof(struct ipr_vector_map_info) *
> +			ioa_cfg->possible_cpu_cnt, GFP_KERNEL);
> +
> +		if (!ioa_cfg->cpu_map_tbl)
> +			goto out_free_trace;
> +	}
> +
> +	for (i = 0; i < ipr_possible_cpu_cnt; i++)
> +		ipr_used_cpu[i] = IPR_VECTOR_MAP_EMPTY;

It seems like this shouldn't be a global. You seem to only use it at
probe time, so I can see why you made it a global, but it just seems
wrong. We should either allocate this temporary buffer dynamically at
probe time and free it afterwards, since it is only needed during
probe, or take a different approach. Another option would be to define
a bitmap in ioa_cfg like:

	unsigned long used_cpu[(NR_CPUS / BITS_PER_LONG) + 1];

and then use bitops to access and set the bits. Then we'd likely just
leave the memory around, since it wouldn't be very large.
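For illustration, a rough sketch of that bitmap variant (untested;
field and variable names are placeholders), using the standard
DECLARE_BITMAP macro and bitops:

	/* In struct ipr_ioa_cfg -- expands to an unsigned long array
	 * sized BITS_TO_LONGS(NR_CPUS), same as the open-coded form: */
	DECLARE_BITMAP(used_cpu, NR_CPUS);

	/* At probe time, start with no CPUs claimed: */
	bitmap_zero(ioa_cfg->used_cpu, NR_CPUS);

	/* When binding a vector to a CPU: */
	set_bit(next_cpu_id, ioa_cfg->used_cpu);

	/* When looking for a free CPU: */
	if (!test_bit(i, ioa_cfg->used_cpu))
		/* CPU i has no vector bound to it yet */

That would also get rid of the IPR_VECTOR_MAP_EMPTY sentinel for this
particular check.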
> +
>  	rc = 0;
>  out:
>  	LEAVE;
>  	return rc;
>
> +out_free_trace:
> +	kfree(ioa_cfg->trace);
>  out_free_hostrcb_dma:
>  	while (i-- > 0) {
>  		dma_free_coherent(&pdev->dev, sizeof(struct ipr_hostrcb),
> @@ -9793,6 +9893,92 @@ static void ipr_wait_for_pci_err_recovery(struct ipr_ioa_cfg *ioa_cfg)
>  	}
>  }
>
> +static int
> +ipr_find_next_free_cpu(struct ipr_ioa_cfg *ioa_cfg, u32 cpu_id)
> +{
> +	struct ipr_vector_map_info *map_per_cpu;
> +	int i;
> +
> +	map_per_cpu = ioa_cfg->cpu_map_tbl;
> +	for (i = 0; i < ioa_cfg->possible_cpu_cnt; i++) {
> +		if (cpu_online(i)) {
> +			if ((map_per_cpu->irq_id == IPR_VECTOR_MAP_EMPTY) &&
> +			    (ipr_used_cpu[i] == IPR_VECTOR_MAP_EMPTY) &&
> +			    (map_per_cpu->cpu_id == cpu_id))
> +				return i;
> +		}
> +		map_per_cpu++;
> +	}
> +
> +	return IPR_VECTOR_MAP_EMPTY;
> +}
> +
> +static int ipr_set_cpu_affinity_for_hrrq(struct ipr_ioa_cfg *ioa_cfg,
> +	int vectors)
> +{
> +	struct ipr_vector_map_info *map_per_cpu;
> +	int cpu_idx = 0, hrrq_num, next_cpu_id, i;
> +
> +	memset(ioa_cfg->cpu_map_tbl, 0xff,
> +		(sizeof(struct ipr_vector_map_info) *
> +		ioa_cfg->possible_cpu_cnt));
> +
> +	map_per_cpu = ioa_cfg->cpu_map_tbl;
> +	for (i = 0; i < ioa_cfg->possible_cpu_cnt; i++, map_per_cpu++)
> +		map_per_cpu->cpu_id = i;
> +
> +	for (i = 0; i < ipr_possible_cpu_cnt; i++) {
> +		if (ipr_used_cpu[i] == IPR_VECTOR_MAP_EMPTY) {
> +			cpu_idx = i;
> +			break;
> +		}
> +	}
> +
> +	hrrq_num = 0;
> +	for (i = 0; i < vectors; i++) {
> +		map_per_cpu = ioa_cfg->cpu_map_tbl;
> +		next_cpu_id = ipr_find_next_free_cpu(ioa_cfg, cpu_idx);
> +
> +		if (next_cpu_id == IPR_VECTOR_MAP_EMPTY) {
> +			int j;
> +
> +			for (j = 0; j < i; j++)
> +				irq_set_affinity_hint(
> +					ioa_cfg->vectors_info[j].vec, NULL);
> +			ioa_cfg->cpu_map = 0;
> +			dev_info(&ioa_cfg->pdev->dev, "Cannot set cpu affinity\n");
> +			return 0;
> +		}
> +
> +		map_per_cpu += next_cpu_id;
> +		ipr_used_cpu[next_cpu_id] = next_cpu_id;
> +
> +		map_per_cpu->irq_id = ioa_cfg->vectors_info[i].vec;
> +		map_per_cpu->hrrq_id = hrrq_num;
> +		irq_set_affinity_hint(ioa_cfg->vectors_info[i].vec,
> +			get_cpu_mask(next_cpu_id));
> +		cpu_idx++;
> +		hrrq_num++;
> +	}
> +
> +	hrrq_num = 0;
> +	if (vectors < ioa_cfg->possible_cpu_cnt) {

Seems like there is room for improvement here when the number of CPUs
is much larger than the number of HRRQs; perhaps we do this in a
follow-on patch. For a larger CPU config, we'd want to start taking
NUMA affinity and CPU topology into account and set the irq affinity
hint on multiple threads of the same core, in order to spread things
as evenly as we can.
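For illustration, something in this direction (untested sketch; it
ignores the cross-adapter bookkeeping above and is only meant to show
the idea of a NUMA-aware spread using cpumask_local_spread(), which
has been available since v4.1):

	int node = dev_to_node(&ioa_cfg->pdev->dev);

	for (i = 0; i < vectors; i++) {
		/* Pick the i-th CPU, preferring CPUs on the adapter's
		 * NUMA node before falling back to remote ones. */
		unsigned int cpu = cpumask_local_spread(i, node);

		irq_set_affinity_hint(ioa_cfg->vectors_info[i].vec,
				      get_cpu_mask(cpu));
	}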
> +		map_per_cpu = ioa_cfg->cpu_map_tbl;
> +		for (i = 0; i < ioa_cfg->possible_cpu_cnt; i++) {
> +			if (map_per_cpu->hrrq_id == IPR_VECTOR_MAP_EMPTY) {
> +				map_per_cpu->hrrq_id = hrrq_num;
> +				hrrq_num++;
> +			}
> +
> +			if (hrrq_num == ioa_cfg->hrrq_num)
> +				hrrq_num = 0;
> +			map_per_cpu++;
> +		}
> +	}
> +
> +	return 1;
> +}
> +
>  static int ipr_enable_msix(struct ipr_ioa_cfg *ioa_cfg)
>  {
>  	struct msix_entry entries[IPR_MAX_MSIX_VECTORS];
> @@ -10047,6 +10233,7 @@ static int ipr_probe_ioa(struct pci_dev *pdev,
>  	ioa_cfg->hdw_dma_regs = ipr_regs;
>  	ioa_cfg->hdw_dma_regs_pci = ipr_regs_pci;
>  	ioa_cfg->ioa_mailbox = ioa_cfg->chip_cfg->mailbox + ipr_regs;
> +	ioa_cfg->cpu_map = ipr_cpu_map;
>
>  	ipr_init_regs(ioa_cfg);
>
> @@ -10108,6 +10295,8 @@ static int ipr_probe_ioa(struct pci_dev *pdev,
>  		}
>  	}
>
> +	ioa_cfg->possible_cpu_cnt = ipr_possible_cpu_cnt;
> +
>  	if (ioa_cfg->intr_flag == IPR_USE_MSI ||
>  	    ioa_cfg->intr_flag == IPR_USE_MSIX) {
>  		rc = ipr_test_msi(ioa_cfg, pdev);
> @@ -10202,6 +10391,9 @@ static int ipr_probe_ioa(struct pci_dev *pdev,
>  		goto cleanup_nolog;
>  	}
>
> +	if (ioa_cfg->cpu_map)
> +		ipr_set_cpu_affinity_for_hrrq(ioa_cfg, ioa_cfg->nvectors);
> +
>  	if ((dev_id->driver_data & IPR_USE_PCI_WARM_RESET) ||
>  	    (dev_id->device == PCI_DEVICE_ID_IBM_OBSIDIAN_E && !ioa_cfg->revid)) {
>  		ioa_cfg->needs_warm_reset = 1;
> @@ -10645,9 +10837,13 @@ static struct notifier_block ipr_notifier = {
>  **/
>  static int __init ipr_init(void)
>  {
> +	int cpu;
>  	ipr_info("IBM Power RAID SCSI Device Driver version: %s %s\n",
>  		 IPR_DRIVER_VERSION, IPR_DRIVER_DATE);
>
> +	for_each_present_cpu(cpu)
> +		ipr_possible_cpu_cnt++;

This could be simplified to:

	ipr_possible_cpu_cnt = num_present_cpus();

Also, I don't think this works quite right with CPU hotplug. According
to include/linux/cpumask.h, cpu_present_mask varies dynamically
depending on what is reported as currently plugged in. I think the
following might handle CPU hotplug better:

	ipr_possible_cpu_cnt = num_possible_cpus();

I'd suggest renaming ipr_possible_cpu_cnt to something like
ipr_irq_vector_map_entries to make it clearer what it is intended to
do rather than where the value came from.

Reading the patch further, I'm thinking we should just ditch this
global altogether. It's not needed, as far as I can see. You can
simply initialize ioa_cfg->possible_cpu_cnt to num_possible_cpus()
early at probe time and then use that value everywhere. I'd still
recommend renaming ioa_cfg->possible_cpu_cnt, though, to something
like ioa_cfg->num_irq_vector_map_entries.

> +
>  	register_reboot_notifier(&ipr_notifier);
>  	return pci_register_driver(&ipr_driver);
>  }
> diff --git a/drivers/scsi/ipr.h b/drivers/scsi/ipr.h
> index 56c5706..8f8ab6b 100644
> --- a/drivers/scsi/ipr.h
> +++ b/drivers/scsi/ipr.h
> @@ -1464,6 +1464,16 @@ enum ipr_sdt_state {
>  	DUMP_OBTAINED
>  };
>
> +#define IPR_VECTOR_MAP_EMPTY	0xffff
> +#define IPR_SET_IO_CPU_AFFINITY	0x1
> +
> +/* IRQ vector to CPU mapping when set CPU affinity*/
> +struct ipr_vector_map_info {
> +	u16 cpu_id;

cpu_id should probably be an unsigned int, to be consistent with the
rest of the kernel.

> +	u16 irq_id;
> +	u16 hrrq_id;
> +};
> +
>  /* Per-controller data */
>  struct ipr_ioa_cfg {
>  	char eye_catcher[8];
> @@ -1595,6 +1605,12 @@ struct ipr_ioa_cfg {
>
>  	u32 iopoll_weight;
>
> +	u32 cpu_map;
> +	struct ipr_vector_map_info *cpu_map_tbl;
> +	u16 possible_cpu_cnt;

And if we change cpu_id above, then this should become an unsigned int
as well.
> +
> +	u32 use_blk_mq;
> +
>  }; /* struct ipr_ioa_cfg */
>
>  struct ipr_cmnd {

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center