[PATCH RFC 11/11] PCI: hotplug: movable bus numbers: compact the gaps in numbering

2019-10-24 Thread Sergey Miroshnichenko
If bus numbers are distributed sparsely and there are a lot of devices in the
tree, hotplugging a bridge into the end of the tree may fail even if it has
fewer slots than the total number of unused bus numbers.

Thus, the feature of bus renaming relies on the continuity of bus numbers,
so if a bridge was unplugged, the gap in bus numbers must be compacted.

Let's densify the bus numbering at the beginning of the next PCI rescan.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/probe.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index fe9bf012ef33..0c91b9d453dd 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1319,6 +1319,30 @@ static bool pci_new_bus_needed(struct pci_bus *bus, 
const struct pci_dev *self)
return true;
 }
 
+static void pci_compact_bus_numbers(const int domain, const struct resource 
*valid_range)
+{
+   int busnr_p1 = valid_range->start;
+
+   while (busnr_p1 < valid_range->end) {
+   int busnr_p2 = busnr_p1 + 1;
+   struct pci_bus *bus_p2;
+   int delta;
+
+   while (busnr_p2 <= valid_range->end &&
+  !(bus_p2 = pci_find_bus(domain, busnr_p2)))
+   ++busnr_p2;
+
+   if (!bus_p2 || busnr_p2 > valid_range->end)
+   break;
+
+   delta = busnr_p1 - busnr_p2 + 1;
+   if (delta)
+   pci_move_buses(domain, busnr_p2, delta, valid_range);
+
+   ++busnr_p1;
+   }
+}
+
 static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus,
  unsigned int available_buses);
 /**
@@ -3691,6 +3715,9 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
pci_bus_update_immovable_range(root);
pci_bus_release_root_bridge_resources(root);
 
+   pci_compact_bus_numbers(pci_domain_nr(bus),
+   >busn_res);
+
max = pci_scan_child_bus(root);
 
pci_reassign_root_bus_resources(root);
-- 
2.23.0



[PATCH RFC 10/11] PCI: hotplug: movable bus numbers: rename proc and sysfs entries

2019-10-24 Thread Sergey Miroshnichenko
Changing the number of a bus (therefore changing addresses of this bus, of
its children and all the buses next in the tree) invalidates entries in
/sys/devices/pci*, /proc/bus/pci/* and symlinks in /sys/bus/pci/devices/*
for all the renamed devices and buses.

Remove the affected proc and sysfs entries and symlinks before renaming the
bus, then recreate them.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/probe.c | 105 +++-
 1 file changed, 104 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index be9e5754cac7..fe9bf012ef33 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1096,12 +1096,99 @@ static void pci_enable_crs(struct pci_dev *pdev)
 PCI_EXP_RTCTL_CRSSVE);
 }
 
+static void pci_buses_remove_sysfs(int domain, int busnr, int max_bus_number)
+{
+   struct pci_bus *bus;
+   struct pci_dev *dev = NULL;
+
+   bus = pci_find_bus(domain, busnr);
+   if (!bus)
+   return;
+
+   if (busnr < max_bus_number)
+   pci_buses_remove_sysfs(domain, busnr + 1, max_bus_number);
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   device_remove_class_symlinks(>dev);
+   pci_remove_sysfs_dev_files(dev);
+   pci_proc_detach_device(dev);
+   bus_disconnect_device(>dev);
+   }
+
+   device_remove_class_symlinks(>dev);
+   pci_proc_detach_bus(bus);
+}
+
+static void pci_buses_create_sysfs(int domain, int busnr, int max_bus_number)
+{
+   struct pci_bus *bus;
+   struct pci_dev *dev = NULL;
+
+   bus = pci_find_bus(domain, busnr);
+   if (!bus)
+   return;
+
+   device_add_class_symlinks(>dev);
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   bus_add_device(>dev);
+   if (pci_dev_is_added(dev)) {
+   pci_proc_attach_device(dev);
+   pci_create_sysfs_dev_files(dev);
+   device_add_class_symlinks(>dev);
+   }
+   }
+
+   if (busnr < max_bus_number)
+   pci_buses_create_sysfs(domain, busnr + 1, max_bus_number);
+}
+
+static void pci_rename_bus(struct pci_bus *bus, const char *new_bus_name)
+{
+   struct class *class;
+   int err;
+
+   class = bus->dev.class;
+   bus->dev.class = NULL;
+   err = device_rename(>dev, new_bus_name);
+   bus->dev.class = class;
+}
+
+static void pci_rename_bus_devices(struct pci_bus *bus, const int domain,
+  const int new_busnr)
+{
+   struct pci_dev *dev = NULL;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   char old_name[64];
+   char new_name[64];
+   struct class *class;
+   int err;
+   int i;
+
+   strncpy(old_name, dev_name(>dev), sizeof(old_name));
+   sprintf(new_name, "%04x:%02x:%02x.%d", domain, new_busnr,
+   PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+   class = dev->dev.class;
+   dev->dev.class = NULL;
+   err = device_rename(>dev, new_name);
+   dev->dev.class = class;
+
+   for (i = 0; i < PCI_BRIDGE_RESOURCES; i++)
+   dev->resource[i].name = pci_name(dev);
+   }
+}
+
 static void pci_do_move_buses(const int domain, int busnr, int 
first_moved_busnr,
  int delta, const struct resource *valid_range)
 {
struct pci_bus *bus;
-   int subordinate;
+   int subordinate, old_primary;
u32 old_buses, buses;
+   char old_bus_name[64];
+   char new_bus_name[64];
+   struct resource old_res;
+   int new_busnr = busnr + delta;
 
if (busnr < valid_range->start || busnr > valid_range->end)
return;
@@ -1110,11 +1197,21 @@ static void pci_do_move_buses(const int domain, int 
busnr, int first_moved_busnr
if (!bus)
return;
 
+   old_primary = bus->primary;
+   strncpy(old_bus_name, dev_name(>dev), sizeof(old_bus_name));
+   sprintf(new_bus_name, "%04x:%02x", domain, new_busnr);
+
if (delta > 0) {
pci_do_move_buses(domain, busnr + 1, first_moved_busnr,
  delta, valid_range);
+   pci_rename_bus_devices(bus, domain, new_busnr);
+   pci_rename_bus(bus, new_bus_name);
+   } else {
+   pci_rename_bus(bus, new_bus_name);
+   pci_rename_bus_devices(bus, domain, new_busnr);
}
 
+   memcpy(_res, >busn_res, sizeof(old_res));
bus->number += delta;
bus->busn_res.start += delta;
 
@@ -1132,6 +1229,10 @@ static void pci_do_move_buses(const int domain, int 

[PATCH RFC 09/11] PCI: hotplug: Add initial support for movable bus numbers

2019-10-24 Thread Sergey Miroshnichenko
Currently, hot-adding a bridge requires enough bus numbers to be reserved
on the slot. Choosing a favorable number of reserved buses per slot is
relatively simple for predictable cases, but it gets trickier when bridges
can be hot-plugged into hot-plugged bridges: there may be either not enough
buses in a slot for a new big bridge, or all the 255 possible numbers will
be depleted. So hot-add may fail still having unused buses somewhere in the
PCI topology.

Instead of reserving, the bus numbers can be allocated continuously, and
when hot-adding a bridge in the middle of the PCI tree, the conflicting
buses can increment their numbers, creating a gap for the new bridge.

Before moving, ensure there is enough space to move into and that there will
be no conflicts with other buses, taking into consideration that there may be
more than one root bridge in the domain (e.g. on some Intel Xeons one root
has buses 00-7f, and the second one - 80-ff).

The feature is disabled by default to not break the ABI, and can be enabled
by the "pci=movable_buses" command line argument, if all risks are accepted.

The following set of parameters provides a safe activation of the feature:

  pci=realloc,pcie_bus_peer2peer,movable_buses

On x86, the "pci=assign-busses" is also required:

  pci=realloc,pcie_bus_peer2peer,movable_buses,assign-busses

This series is the second half of the work started by the "Movable BARs"
patches, and relies on fixes made there.

Following patches will resolve the introduced issues:
 - fix desynchronization in /sys/devices/pci*, /sys/bus/pci/devices/* and
   /proc/bus/pci/* after changes in PCI topology;
 - compact gaps in numbering, which may appear after removing a bridge, to
   maintain the number continuity.

Signed-off-by: Sergey Miroshnichenko 
---
 .../admin-guide/kernel-parameters.txt |   3 +
 drivers/pci/pci.c |   3 +
 drivers/pci/pci.h |   2 +
 drivers/pci/probe.c   | 153 +-
 4 files changed, 156 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index c6243aaed0c9..1bf8dea1f08a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3529,6 +3529,9 @@
force_floating  [S390] Force usage of floating interrupts.
nomio   [S390] Do not use MIO instructions.
no_movable_bars Don't allow BARs to be moved during hotplug
+   movable_buses   Prefer bus renaming over the number reserving. 
This
+   inflicts the deleting+recreating of sysfs and 
procfs
+   entries.
 
pcie_aspm=  [PCIE] Forcibly enable or disable PCIe Active State 
Power
Management.
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 6ec1b70e4a96..9b2dcaa268e8 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -79,6 +79,7 @@ int pci_domains_supported = 1;
 #endif
 
 bool pci_can_move_bars = true;
+bool pci_movable_buses;
 
 #define DEFAULT_CARDBUS_IO_SIZE(256)
 #define DEFAULT_CARDBUS_MEM_SIZE   (64*1024*1024)
@@ -6335,6 +6336,8 @@ static int __init pci_setup(char *str)
disable_acs_redir_param = str + 18;
} else if (!strncmp(str, "no_movable_bars", 15)) {
pci_can_move_bars = false;
+   } else if (!strncmp(str, "movable_buses", 13)) {
+   pci_movable_buses = true;
} else {
pr_err("PCI: Unknown option `%s'\n", str);
}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9b5164d10499..804176bb1d1b 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -289,6 +289,8 @@ void pci_bus_put(struct pci_bus *bus);
 
 bool pci_dev_bar_movable(struct pci_dev *dev, struct resource *res);
 
+extern bool pci_movable_buses;
+
 int assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r);
 
 /* PCIe link information */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 3494b5d265d5..be9e5754cac7 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1096,6 +1096,126 @@ static void pci_enable_crs(struct pci_dev *pdev)
 PCI_EXP_RTCTL_CRSSVE);
 }
 
+static void pci_do_move_buses(const int domain, int busnr, int 
first_moved_busnr,
+ int delta, const struct resource *valid_range)
+{
+   struct pci_bus *bus;
+   int subordinate;
+   u32 old_buses, buses;
+
+   if (busnr < valid_range->start || busnr > valid_range->end)
+   return;
+
+   bus = pci_find_bus(domain, busnr);
+   if (!bus)
+   

[PATCH RFC 06/11] powerpc/pci: Enable assigning bus numbers instead of reading them from DT

2019-10-24 Thread Sergey Miroshnichenko
If the firmware indicates support of reassigning bus numbers via the PHB's
"ibm,supported-movable-bdfs" property in DT, PowerNV will not depend on PCI
topology info from DT anymore.

This makes it possible to re-enumerate the fabric, assign the new bus numbers
and switch from the pnv_php module to the standard pciehp driver for PCI
hotplug functionality.

Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/pci_dn.c | 5 +
 arch/powerpc/platforms/powernv/eeh-powernv.c | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index ad0ecf48e943..b9b7518eb2b4 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -559,6 +559,11 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb)
phb->pci_data = pdn;
}
 
+   if (of_get_property(dn, "ibm,supported-movable-bdfs", NULL)) {
+   pci_add_flags(PCI_REASSIGN_ALL_BUS);
+   return;
+   }
+
/* Update dn->phb ptrs for new phb and children devices */
pci_traverse_device_nodes(dn, add_pdn, phb);
 }
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6bc24a47e9ef..6c126aa2a6b7 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -42,7 +42,8 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
 {
struct pci_dn *pdn = pci_get_pdn(pdev);
 
-   if (eeh_has_flag(EEH_FORCE_DISABLED))
+   if (eeh_has_flag(EEH_FORCE_DISABLED) ||
+   !pci_has_flag(PCI_REASSIGN_ALL_BUS))
return;
 
dev_dbg(>dev, "EEH: Setting up device\n");
-- 
2.23.0



[PATCH RFC 08/11] PCI: Allow expanding the bridges

2019-10-24 Thread Sergey Miroshnichenko
When hotplugging a bridge, the parent bus may not have [enough] reserved
bus numbers. So before rescanning the bus, set its subordinate number to
the maximum possible value: it is 255 when there is only one root bridge
in the domain.

During the PCI rescan, the subordinate bus number of every bus will be
contracted to the actual value.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/probe.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 539f5d39bb6d..3494b5d265d5 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3195,20 +3195,22 @@ static unsigned int pci_dev_count_res_mask(struct 
pci_dev *dev)
return res_mask;
 }
 
-static void pci_bus_rescan_prepare(struct pci_bus *bus)
+static void pci_bus_rescan_prepare(struct pci_bus *bus, int last_bus_number)
 {
struct pci_dev *dev;
 
if (bus->self)
pci_config_pm_runtime_get(bus->self);
 
+   bus->busn_res.end = last_bus_number;
+
list_for_each_entry(dev, >devices, bus_list) {
struct pci_bus *child = dev->subordinate;
 
dev->res_mask = pci_dev_count_res_mask(dev);
 
if (child)
-   pci_bus_rescan_prepare(child);
+   pci_bus_rescan_prepare(child, last_bus_number);
 
if (dev->driver &&
dev->driver->rescan_prepare)
@@ -3439,7 +3441,7 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
 
if (pci_can_move_bars) {
pcibios_root_bus_rescan_prepare(root);
-   pci_bus_rescan_prepare(root);
+   pci_bus_rescan_prepare(root, root->busn_res.end);
pci_bus_update_immovable_range(root);
pci_bus_release_root_bridge_resources(root);
 
-- 
2.23.0



[PATCH RFC 05/11] drivers: base: Add bus_disconnect_device()

2019-10-24 Thread Sergey Miroshnichenko
Add bus_disconnect_device(), which is similar to bus_remove_device(), but
it doesn't detach the device from its driver, so it can be reconnected to
the same or another bus later.

This is yet another preparation for hotplugging large PCIe bridges, which
may entail changes in BDF addresses of working devices due to movable bus
numbers. Changed addresses require rebuilding the affected entries in
/sys/bus/pci and /proc/bus/pci.

Using bus_disconnect_device()+bus_add_device() during PCI rescan allows the
drivers to work with their devices uninterruptedly, regardless of changes
in PCI addresses.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/base/bus.c | 36 
 include/linux/device.h |  1 +
 2 files changed, 37 insertions(+)

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 8f3445cc533e..52d77fb90218 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -497,6 +497,42 @@ void bus_probe_device(struct device *dev)
mutex_unlock(>p->mutex);
 }
 
+/**
+ * bus_disconnect_device - disconnect device from bus,
+ * but don't detach it from driver
+ * @dev: device to be disconnected
+ *
+ * - Remove device from all interfaces.
+ * - Remove symlink from bus' directory.
+ * - Delete device from bus's list.
+ */
+void bus_disconnect_device(struct device *dev)
+{
+   struct bus_type *bus = dev->bus;
+   struct subsys_interface *sif;
+
+   if (!bus)
+   return;
+
+   mutex_lock(>p->mutex);
+   list_for_each_entry(sif, >p->interfaces, node)
+   if (sif->remove_dev)
+   sif->remove_dev(dev, sif);
+   mutex_unlock(>p->mutex);
+
+   sysfs_remove_link(>kobj, "subsystem");
+   sysfs_remove_link(>bus->p->devices_kset->kobj,
+ dev_name(dev));
+   device_remove_groups(dev, dev->bus->dev_groups);
+   if (klist_node_attached(>p->knode_bus))
+   klist_del(>p->knode_bus);
+
+   pr_debug("bus: '%s': remove device %s\n",
+dev->bus->name, dev_name(dev));
+   bus_put(dev->bus);
+}
+EXPORT_SYMBOL_GPL(bus_disconnect_device);
+
 /**
  * bus_remove_device - remove device from bus
  * @dev: device to be removed
diff --git a/include/linux/device.h b/include/linux/device.h
index 420228ab9c4b..9f098c32a4ad 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -268,6 +268,7 @@ void bus_sort_breadthfirst(struct bus_type *bus,
   int (*compare)(const struct device *a,
  const struct device *b));
 extern int bus_add_device(struct device *dev);
+extern void bus_disconnect_device(struct device *dev);
 extern int device_add_class_symlinks(struct device *dev);
 extern void device_remove_class_symlinks(struct device *dev);
 
-- 
2.23.0



[PATCH RFC 07/11] powerpc/pci: Don't reduce the host bridge bus range

2019-10-24 Thread Sergey Miroshnichenko
Currently the last possible bus number of the PHB is set to the last
used bus number during the boot. So when hotplugging a bridge later,
no new buses can be allocated because they are limited by this value.

Let the host bridge contain any number of buses up to 255.

Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/pci-common.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 1c448cf25506..5877ef7a39a0 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1631,7 +1631,6 @@ void pcibios_scan_phb(struct pci_controller *hose)
if (mode == PCI_PROBE_NORMAL) {
pci_bus_update_busn_res_end(bus, 255);
hose->last_busno = pci_scan_child_bus(bus);
-   pci_bus_update_busn_res_end(bus, hose->last_busno);
}
 
/* Platform gets a chance to do some global fixups before
-- 
2.23.0



[PATCH RFC 04/11] drivers: base: Make device_{add|remove}_class_symlinks() public

2019-10-24 Thread Sergey Miroshnichenko
When updating the /sys/devices/pci* entries affected by changes in the PCI
topology, their symlinks in /sys/bus/pci/devices/* must also be rebuilt.

Moving device_add_class_symlinks() and device_remove_class_symlinks() to a
public API allows the PCI subsystem to update the sysfs without destroying
the working affected devices.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/base/core.c| 6 --
 include/linux/device.h | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 7bd9cd366d41..23e689fc8478 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1922,7 +1922,7 @@ static void cleanup_glue_dir(struct device *dev, struct 
kobject *glue_dir)
mutex_unlock(_mutex);
 }
 
-static int device_add_class_symlinks(struct device *dev)
+int device_add_class_symlinks(struct device *dev)
 {
struct device_node *of_node = dev_of_node(dev);
int error;
@@ -1973,8 +1973,9 @@ static int device_add_class_symlinks(struct device *dev)
sysfs_remove_link(>kobj, "of_node");
return error;
 }
+EXPORT_SYMBOL_GPL(device_add_class_symlinks);
 
-static void device_remove_class_symlinks(struct device *dev)
+void device_remove_class_symlinks(struct device *dev)
 {
if (dev_of_node(dev))
sysfs_remove_link(>kobj, "of_node");
@@ -1991,6 +1992,7 @@ static void device_remove_class_symlinks(struct device 
*dev)
 #endif
sysfs_delete_link(>class->p->subsys.kobj, >kobj, 
dev_name(dev));
 }
+EXPORT_SYMBOL_GPL(device_remove_class_symlinks);
 
 /**
  * dev_set_name - set a device name
diff --git a/include/linux/device.h b/include/linux/device.h
index 4d8bbc8ae73d..420228ab9c4b 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -268,6 +268,8 @@ void bus_sort_breadthfirst(struct bus_type *bus,
   int (*compare)(const struct device *a,
  const struct device *b));
 extern int bus_add_device(struct device *dev);
+extern int device_add_class_symlinks(struct device *dev);
+extern void device_remove_class_symlinks(struct device *dev);
 
 /*
  * Bus notifiers: Get notified of addition/removal of devices
-- 
2.23.0



[PATCH RFC 03/11] drivers: base: Make bus_add_device() public

2019-10-24 Thread Sergey Miroshnichenko
Move the bus_add_device() to a public API, so it can be applied to devices
which are temporarily detached from their buses without being destroyed.

This will be used after changes in PCI topology after hotplugging a bridge:
buses may get their numbers changed, so their child devices must be
reattached and their sysfs and proc files recreated.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/base/base.h| 1 -
 drivers/base/bus.c | 1 +
 include/linux/device.h | 2 ++
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 0d32544b6f91..c93d302e6345 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -110,7 +110,6 @@ extern void container_dev_init(void);
 
 struct kobject *virtual_device_parent(struct device *dev);
 
-extern int bus_add_device(struct device *dev);
 extern void bus_probe_device(struct device *dev);
 extern void bus_remove_device(struct device *dev);
 
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index a1d1e8256324..8f3445cc533e 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -471,6 +471,7 @@ int bus_add_device(struct device *dev)
bus_put(dev->bus);
return error;
 }
+EXPORT_SYMBOL_GPL(bus_add_device);
 
 /**
  * bus_probe_device - probe drivers for a new device
diff --git a/include/linux/device.h b/include/linux/device.h
index 297239a08bb7..4d8bbc8ae73d 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -267,6 +267,8 @@ int bus_for_each_drv(struct bus_type *bus, struct 
device_driver *start,
 void bus_sort_breadthfirst(struct bus_type *bus,
   int (*compare)(const struct device *a,
  const struct device *b));
+extern int bus_add_device(struct device *dev);
+
 /*
  * Bus notifiers: Get notified of addition/removal of devices
  * and binding/unbinding of drivers to devices.
-- 
2.23.0



[PATCH RFC 02/11] PCI: proc: Nullify a freed pointer

2019-10-24 Thread Sergey Miroshnichenko
A PCI device may be detached from /proc/bus/pci/devices not only when it is
removed, but also when its bus has changed its number - in this case the
proc entry must be recreated to reflect the new PCI topology.

Nullify freed pointers to mark them as valid for allocating again.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/proc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index 5495537c60c2..c85654dd315b 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -443,6 +443,7 @@ int pci_proc_detach_device(struct pci_dev *dev)
 int pci_proc_detach_bus(struct pci_bus *bus)
 {
proc_remove(bus->procdir);
+   bus->procdir = NULL;
return 0;
 }
 
-- 
2.23.0



[PATCH RFC 01/11] PCI: sysfs: Nullify freed pointers

2019-10-24 Thread Sergey Miroshnichenko
After hotplugging a bridge the PCI topology will be changed: buses may have
their numbers changed. In this case all the affected sysfs entries/symlinks
must be recreated, because they have BDF address in their names.

Set the freed pointers to NULL, so the !NULL checks will be satisfied when
it's time to recreate the sysfs entries.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci-sysfs.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 793412954529..a238935c1193 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1129,12 +1129,14 @@ static void pci_remove_resource_files(struct pci_dev 
*pdev)
if (res_attr) {
sysfs_remove_bin_file(>dev.kobj, res_attr);
kfree(res_attr);
+   pdev->res_attr[i] = NULL;
}
 
res_attr = pdev->res_attr_wc[i];
if (res_attr) {
sysfs_remove_bin_file(>dev.kobj, res_attr);
kfree(res_attr);
+   pdev->res_attr_wc[i] = NULL;
}
}
 }
@@ -1175,8 +1177,11 @@ static int pci_create_attr(struct pci_dev *pdev, int 
num, int write_combine)
res_attr->size = pci_resource_len(pdev, num);
res_attr->private = (void *)(unsigned long)num;
retval = sysfs_create_bin_file(>dev.kobj, res_attr);
-   if (retval)
+   if (retval) {
kfree(res_attr);
+   if (pdev->res_attr[num] == res_attr)
+   pdev->res_attr[num] = NULL;
+   }
 
return retval;
 }
-- 
2.23.0



[PATCH RFC 00/11] PCI: hotplug: Movable bus numbers

2019-10-24 Thread Sergey Miroshnichenko
To allow hotplugging bridges, the kernel or BIOS/bootloader/firmware add
extra bus numbers per slot, but this range may be not enough for a large
bridge and/or nested bridges when hot-adding a chassis full of devices.

This patchset proposes an approach similar to movable BARs: bus numbers are
not reserved anymore; instead, the kernel moves the "tail" of the PCI tree
by one when a new bus is needed.

When something like this is going to happen:
   *LARGE*
 +-[0020:00]---00.0-[01-20]--+-00.0-[02-08]--+-00.0-[03]--   <--  *NESTED*
 |   |   +-01.0-[04]--*BRIDGE*
 |   |   +-02.0-[05]--
 |   |   +-03.0-[06]--
 |   |   +-04.0-[07]--
 |   |   \-05.0-[08]--
 ...

, this will result in the following:

 
+-[0020:00]---00.0-[01-22]--+-00.0-[02-22]--+-00.0-[03-1d]04.0-[04-1d]--+-00.0-[05]--
 |   |   |   
+-04.0-[06]--
 |   |   |   
+-09.0-[07]--
 |   |   |   
+-0c.0-[08-19]00.0-[09-19]--+-01.0-[0a]--
 |   |   |   |  
 ...
 |   |   |   |  
 \-11.0-[19]--
 |   |   |   ...
 |   |   |   
\-15.0-[1d]--
 |   |   +-01.0-[1e]--  <-- Renamed from 04
 |   |   +-02.0-[1f]--  <-- Renamed from 05
 |   |   +-03.0-[20]--  <-- Renamed from 06
 |   |   +-04.0-[21]--  <-- Renamed from 07
 |   |   \-05.0-[22]--  <-- Renamed from 08
 ...


This looks to be safe in the kernel, because drivers don't use the raw PCI
BDF ID, and we've tested that on our x86 and PowerNV machines: mass storage
with roots and network adapters just continue their work while their bus
numbers had moved.

But here comes the userspace:

 - procfs entries:

% ls -la /proc/bus/pci/*
/proc/bus/pci/00:
00.0
02.0
...
1f.4
1f.6

/proc/bus/pci/04:
00.0

/proc/bus/pci/40:
00.0

 - sysfs entries:

% ls -la /sys/devices/pci:00/
:00:00.0
:00:02.0
...
:00:1f.3
:00:1f.4
:00:1f.6

% ls -la /sys/devices/pci:00/:00:1c.6/:04:00.0/driver
driver -> ../../../../bus/pci/drivers/iwlwifi

 - sysfs symlinks:

% ls -la /sys/bus/pci/devices
:00:00.0 -> ../../../devices/pci:00/:00:00.0
:00:02.0 -> ../../../devices/pci:00/:00:02.0
...
:04:00.0 -> ../../../devices/pci:00/:00:1c.6/:04:00.0
:40:00.0 -> ../../../devices/pci:00/:00:1d.2/:40:00.0


These patches alter the kernel public API and some internals to be able to
remove these files before changing a bus number, and create new versions
of them after device has changed its BDF.

On one hand, this makes the hotplug predictable, independent of non-kernel
program components (BIOS, bootloader, etc.) and cross-platform, but this is
also a severe ABI violation.

Probably, udev should have a new action like "rename" in addition to
"add" and "remove".

Is it feasible to have this feature disabled by default, but with a chance
to enable by a kernel command line argument like this:

  pci=realloc,movable_buses

?

This code is follow-up of the "PCI: Allow BAR movement during hotplug"
series (v6).

Sergey Miroshnichenko (11):
  PCI: sysfs: Nullify freed pointers
  PCI: proc: Nullify a freed pointer
  drivers: base: Make bus_add_device() public
  drivers: base: Make device_{add|remove}_class_symlinks() public
  drivers: base: Add bus_disconnect_device()
  powerpc/pci: Enable assigning bus numbers instead of reading them from
DT
  powerpc/pci: Don't reduce the host bridge bus range
  PCI: Allow expanding the bridges
  PCI: hotplug: Add initial support for movable bus numbers
  PCI: hotplug: movable bus numbers: rename proc and sysfs entries
  PCI: hotplug: movable bus numbers: compact the gaps in numbering

 .../admin-guide/kernel-parameters.txt |   3 +
 arch/powerpc/kernel/pci-common.c  |   1 -
 arch/powerpc/kernel/pci_dn.c  |   5 +
 arch/powerpc/platforms/powernv/eeh-powernv.c  |   3 +-
 drivers/base/base.h   |   1 -
 drivers/base/bus.c|  37 +

[PATCH v6 29/30] PCI: pciehp: movable BARs: Trigger a domain rescan on hp events

2019-10-24 Thread Sergey Miroshnichenko
With movable BARs, adding a hotplugged device is not local to its bridge
anymore, but it affects the whole domain: BARs, bridge windows and bus
numbers can be substantially rearranged. So instead of trying to fit the
new devices into preallocated reserved gaps, initiate a full domain rescan.

The pci_rescan_bus() covers all the operations of the replaced functions:
 - assigning new bus numbers, as the pci_hp_add_bridge() does it;
 - allocating BARs (pci_assign_unassigned_bridge_resources());
 - configuring MPS settings (pcie_bus_configure_settings());
 - binding devices to their drivers (pci_bus_add_devices()).

CC: Lukas Wunner 
Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/hotplug/pciehp_pci.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c
index d17f3bf36f70..6d4c1ef38210 100644
--- a/drivers/pci/hotplug/pciehp_pci.c
+++ b/drivers/pci/hotplug/pciehp_pci.c
@@ -58,6 +58,11 @@ int pciehp_configure_device(struct controller *ctrl)
goto out;
}
 
+   if (pci_can_move_bars) {
+   pci_rescan_bus(parent);
+   goto out;
+   }
+
for_each_pci_bridge(dev, parent)
pci_hp_add_bridge(dev);
 
-- 
2.23.0



[PATCH v6 30/30] Revert "powerpc/powernv/pci: Work around races in PCI bridge enabling"

2019-10-24 Thread Sergey Miroshnichenko
This reverts commit db2173198b9513f7add8009f225afa1f1c79bcc6.

The root cause of this bug is fixed by the following two commits:

  1. "PCI: Fix race condition in pci_enable/disable_device()"
  2. "PCI: Enable bridge's I/O and MEM access for hotplugged devices"

The x86 is also affected by this bug if a PCIe bridge has been hotplugged
without pre-enabling by the BIOS.

CC: Benjamin Herrenschmidt 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 37 ---
 1 file changed, 37 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 33d5ed8c258f..f12f3a49d3bb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3119,49 +3119,12 @@ static void pnv_pci_ioda_create_dbgfs(void)
 #endif /* CONFIG_DEBUG_FS */
 }
 
-static void pnv_pci_enable_bridge(struct pci_bus *bus)
-{
-   struct pci_dev *dev = bus->self;
-   struct pci_bus *child;
-
-   /* Empty bus ? bail */
-   if (list_empty(>devices))
-   return;
-
-   /*
-* If there's a bridge associated with that bus enable it. This works
-* around races in the generic code if the enabling is done during
-* parallel probing. This can be removed once those races have been
-* fixed.
-*/
-   if (dev) {
-   int rc = pci_enable_device(dev);
-   if (rc)
-   pci_err(dev, "Error enabling bridge (%d)\n", rc);
-   pci_set_master(dev);
-   }
-
-   /* Perform the same to child busses */
-   list_for_each_entry(child, >children, node)
-   pnv_pci_enable_bridge(child);
-}
-
-static void pnv_pci_enable_bridges(void)
-{
-   struct pci_controller *hose;
-
-   list_for_each_entry(hose, _list, list_node)
-   pnv_pci_enable_bridge(hose->bus);
-}
-
 static void pnv_pci_ioda_fixup(void)
 {
pnv_pci_ioda_setup_PEs();
pnv_pci_ioda_setup_iommu_api();
pnv_pci_ioda_create_dbgfs();
 
-   pnv_pci_enable_bridges();
-
 #ifdef CONFIG_EEH
pnv_eeh_post_init();
 #endif
-- 
2.23.0



[PATCH v6 27/30] nvme-pci: Handle movable BARs

2019-10-24 Thread Sergey Miroshnichenko
Hotplugged devices can affect the existing ones by moving their BARs. The
PCI subsystem will inform the NVME driver about this by invoking the
.rescan_prepare() and .rescan_done() hooks, so the BARs can be re-mapped.

Tested under the "randrw" mode of the fio tool. Before the hotplugging:

  % sudo cat /proc/iomem
  ...
3fe8-3fe8007f : PCI Bus 0020:0b
  3fe8-3fe8007f : PCI Bus 0020:18
3fe8-3fe8000f : 0020:18:00.0
  3fe8-3fe8000f : nvme
3fe80010-3fe80017 : 0020:18:00.0
  ...

, then another NVME drive was hot-added, so BARs of the 0020:18:00.0 are
moved:

  % sudo cat /proc/iomem
...
3fe8-3fe800ff : PCI Bus 0020:0b
  3fe8-3fe8007f : PCI Bus 0020:10
3fe8-3fe83fff : 0020:10:00.0
  3fe8-3fe83fff : nvme
3fe80001-3fe80001 : 0020:10:00.0
  3fe80080-3fe800ff : PCI Bus 0020:18
3fe80080-3fe8008f : 0020:18:00.0
  3fe80080-3fe8008f : nvme
3fe80090-3fe80097 : 0020:18:00.0
...

During the rescanning, both READ and WRITE speeds drop to zero for a while
due to driver's pause, then restore.

Also tested with an NVME as a system drive.

Cc: linux-n...@lists.infradead.org
Cc: Christoph Hellwig 
Signed-off-by: Sergey Miroshnichenko 
---
 drivers/nvme/host/pci.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 869f462e6b6e..5f162ea5a5f1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1650,7 +1650,7 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned 
long size)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-   if (size <= dev->bar_mapped_size)
+   if (dev->bar && size <= dev->bar_mapped_size)
return 0;
if (size > pci_resource_len(pdev, 0))
return -ENOMEM;
@@ -3059,6 +3059,23 @@ static void nvme_error_resume(struct pci_dev *pdev)
flush_work(>ctrl.reset_work);
 }
 
+static void nvme_rescan_prepare(struct pci_dev *pdev)
+{
+   struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+   nvme_dev_disable(dev, false);
+   nvme_dev_unmap(dev);
+   dev->bar = NULL;
+}
+
+static void nvme_rescan_done(struct pci_dev *pdev)
+{
+   struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+   nvme_dev_map(dev);
+   nvme_reset_ctrl_sync(>ctrl);
+}
+
 static const struct pci_error_handlers nvme_err_handler = {
.error_detected = nvme_error_detected,
.slot_reset = nvme_slot_reset,
@@ -3135,6 +3152,8 @@ static struct pci_driver nvme_driver = {
 #endif
.sriov_configure = pci_sriov_configure_simple,
.err_handler= _err_handler,
+   .rescan_prepare = nvme_rescan_prepare,
+   .rescan_done= nvme_rescan_done,
 };
 
 static int __init nvme_init(void)
-- 
2.23.0



[PATCH v6 28/30] PCI/portdrv: Declare support of movable BARs

2019-10-24 Thread Sergey Miroshnichenko
A switch's BARs are not used by the portdrv driver, but they are still
considered immovable until the .rescan_prepare() and .rescan_done() hooks
are added. Add these hooks to increase the chances of allocating new BARs.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pcie/portdrv_pci.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 0a87091a0800..9dbddc7faaa7 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -197,6 +197,14 @@ static const struct pci_error_handlers 
pcie_portdrv_err_handler = {
.resume = pcie_portdrv_err_resume,
 };
 
+static void pcie_portdrv_rescan_prepare(struct pci_dev *pdev)
+{
+}
+
+static void pcie_portdrv_rescan_done(struct pci_dev *pdev)
+{
+}
+
 static struct pci_driver pcie_portdriver = {
.name   = "pcieport",
.id_table   = _pci_ids[0],
@@ -207,6 +215,9 @@ static struct pci_driver pcie_portdriver = {
 
.err_handler= _portdrv_err_handler,
 
+   .rescan_prepare = pcie_portdrv_rescan_prepare,
+   .rescan_done= pcie_portdrv_rescan_done,
+
.driver.pm  = PCIE_PORTDRV_PM_OPS,
 };
 
-- 
2.23.0



[PATCH v6 26/30] PCI: hotplug: movable BARs: Enable the feature by default

2019-10-24 Thread Sergey Miroshnichenko
This is the last patch in the series which implements the essentials of the
Movable BARs feature, so it is turned on by default now. Tested on:

 - x86_64 with "pci=realloc,pcie_bus_peer2peer" command line argument;
 - POWER8 PowerNV+PHB3 ppc64le with "pci=realloc,pcie_bus_peer2peer".

In case of problems, it can still be overridden by the following command
line option:

  pcie_movable_bars=off

CC: Oliver O'Halloran 
Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 85014c6b2817..6ec1b70e4a96 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -78,7 +78,7 @@ static void pci_dev_d3_sleep(struct pci_dev *dev)
 int pci_domains_supported = 1;
 #endif
 
-bool pci_can_move_bars;
+bool pci_can_move_bars = true;
 
 #define DEFAULT_CARDBUS_IO_SIZE(256)
 #define DEFAULT_CARDBUS_MEM_SIZE   (64*1024*1024)
-- 
2.23.0



[PATCH v6 25/30] PNP: Don't reserve BARs for PCI when enabled movable BARs

2019-10-24 Thread Sergey Miroshnichenko
When the Movable BARs feature is supported, the PCI subsystem is able to
distribute existing BARs and allocate the new ones itself, without the need
for the BIOS to reserve gaps.

CC: Rafael J. Wysocki 
Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pnp/system.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/pnp/system.c b/drivers/pnp/system.c
index 6950503741eb..5977bd11f4d4 100644
--- a/drivers/pnp/system.c
+++ b/drivers/pnp/system.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -58,6 +59,9 @@ static void reserve_resources_of_dev(struct pnp_dev *dev)
struct resource *res;
int i;
 
+   if (pci_can_move_bars)
+   return;
+
for (i = 0; (res = pnp_get_resource(dev, IORESOURCE_IO, i)); i++) {
if (res->flags & IORESOURCE_DISABLED)
continue;
-- 
2.23.0



[PATCH v6 23/30] powerpc/pci: hotplug: Add support for movable BARs

2019-10-24 Thread Sergey Miroshnichenko
Add pcibios_root_bus_rescan_prepare()/_done() hooks for the powerpc, so it
can reassign the PE numbers (which depend on BAR sizes and locations) and
update the EEH address cache during a PCI rescan.

New PE numbers are assigned during pci_setup_bridges(root) after the rescan
is done.

CC: Oliver O'Halloran 
CC: Sam Bobroff 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/pci-hotplug.c | 43 +++
 drivers/pci/probe.c   | 10 +++
 include/linux/pci.h   |  3 +++
 3 files changed, 56 insertions(+)

diff --git a/arch/powerpc/kernel/pci-hotplug.c 
b/arch/powerpc/kernel/pci-hotplug.c
index fc62c4bc47b1..42847f5b0f08 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static struct pci_bus *find_bus_among_children(struct pci_bus *bus,
   struct device_node *dn)
@@ -151,3 +152,45 @@ void pci_hp_add_devices(struct pci_bus *bus)
pcibios_finish_adding_to_bus(bus);
 }
 EXPORT_SYMBOL_GPL(pci_hp_add_devices);
+
+static void pci_hp_bus_rescan_prepare(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *child = dev->subordinate;
+
+   if (child)
+   pci_hp_bus_rescan_prepare(child);
+
+   iommu_del_device(>dev);
+   }
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   pcibios_release_device(dev);
+   }
+}
+
+static void pci_hp_bus_rescan_done(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *child = dev->subordinate;
+
+   pcibios_bus_add_device(dev);
+
+   if (child)
+   pci_hp_bus_rescan_done(child);
+   }
+}
+
+void pcibios_root_bus_rescan_prepare(struct pci_bus *root)
+{
+   pci_hp_bus_rescan_prepare(root);
+}
+
+void pcibios_root_bus_rescan_done(struct pci_bus *root)
+{
+   pci_hp_bus_rescan_done(root);
+}
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 73452aa81417..539f5d39bb6d 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3235,6 +3235,14 @@ static void pci_bus_rescan_done(struct pci_bus *bus)
pci_config_pm_runtime_put(bus->self);
 }
 
+void __weak pcibios_root_bus_rescan_prepare(struct pci_bus *root)
+{
+}
+
+void __weak pcibios_root_bus_rescan_done(struct pci_bus *root)
+{
+}
+
 static void pci_setup_bridges(struct pci_bus *bus)
 {
struct pci_dev *dev;
@@ -3430,6 +3438,7 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
root = root->parent;
 
if (pci_can_move_bars) {
+   pcibios_root_bus_rescan_prepare(root);
pci_bus_rescan_prepare(root);
pci_bus_update_immovable_range(root);
pci_bus_release_root_bridge_resources(root);
@@ -3440,6 +3449,7 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
 
pci_setup_bridges(root);
pci_bus_rescan_done(root);
+   pcibios_root_bus_rescan_done(root);
} else {
max = pci_scan_child_bus(bus);
pci_assign_unassigned_bus_resources(bus);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e1edcb3fad31..b5821134bdae 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1275,6 +1275,9 @@ unsigned int pci_rescan_bus(struct pci_bus *bus);
 void pci_lock_rescan_remove(void);
 void pci_unlock_rescan_remove(void);
 
+void pcibios_root_bus_rescan_prepare(struct pci_bus *root);
+void pcibios_root_bus_rescan_done(struct pci_bus *root);
+
 /* Vital Product Data routines */
 ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
 ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const 
void *buf);
-- 
2.23.0



[PATCH v6 24/30] powerpc/powernv/pci: Suppress an EEH error when reading an empty slot

2019-10-24 Thread Sergey Miroshnichenko
Reading an empty slot returns all ones, which triggers a false EEH
error event on PowerNV. A rescan is performed after all the PEs have
been unmapped, so the reserved PE index is used for unfreezing.

CC: Oliver O'Halloran 
CC: Sam Bobroff 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/platforms/powernv/pci.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index ffd546cf9204..e1b45dc96474 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -768,9 +768,16 @@ static int pnv_pci_read_config(struct pci_bus *bus,
 
*val = 0x;
pdn = pci_get_pdn_by_devfn(bus, devfn);
-   if (!pdn)
-   return pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn,
-   where, size, val);
+   if (!pdn) {
+   ret = pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn,
+  where, size, val);
+
+   if (!ret && (*val == EEH_IO_ERROR_VALUE(size)) && 
phb->unfreeze_pe)
+   phb->unfreeze_pe(phb, phb->ioda.reserved_pe_idx,
+OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+   return ret;
+   }
 
if (!pnv_pci_cfg_check(pdn))
return PCIBIOS_DEVICE_NOT_FOUND;
-- 
2.23.0



[PATCH v6 22/30] powerpc/pci: Create pci_dn on demand

2019-10-24 Thread Sergey Miroshnichenko
If a struct pci_dn hasn't yet been created for the PCIe device (there was
no DT node for it), allocate this structure and fill it with info read from
the device directly.

CC: Oliver O'Halloran 
CC: Sam Bobroff 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/pci_dn.c | 88 ++--
 1 file changed, 74 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 9524009ca1ae..ad0ecf48e943 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -20,6 +20,9 @@
 #include 
 #include 
 
+static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev,
+ struct pci_dn *parent);
+
 /*
  * The function is used to find the firmware data of one
  * specific PCI device, which is attached to the indicated
@@ -52,6 +55,9 @@ static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
dn = pci_bus_to_OF_node(pbus);
pdn = dn ? PCI_DN(dn) : NULL;
 
+   if (!pdn && pbus->self)
+   pdn = pbus->self->dev.archdata.pci_data;
+
return pdn;
 }
 
@@ -61,10 +67,13 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
struct device_node *dn = NULL;
struct pci_dn *parent, *pdn;
struct pci_dev *pdev = NULL;
+   bool pdev_found = false;
 
/* Fast path: fetch from PCI device */
list_for_each_entry(pdev, >devices, bus_list) {
if (pdev->devfn == devfn) {
+   pdev_found = true;
+
if (pdev->dev.archdata.pci_data)
return pdev->dev.archdata.pci_data;
 
@@ -73,6 +82,9 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
}
}
 
+   if (!pdev_found)
+   pdev = NULL;
+
/* Fast path: fetch from device node */
pdn = dn ? PCI_DN(dn) : NULL;
if (pdn)
@@ -85,9 +97,12 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
 
list_for_each_entry(pdn, >child_list, list) {
if (pdn->busno == bus->number &&
-pdn->devfn == devfn)
-return pdn;
-}
+   pdn->devfn == devfn) {
+   if (pdev)
+   pdev->dev.archdata.pci_data = pdn;
+   return pdn;
+   }
+   }
 
return NULL;
 }
@@ -117,17 +132,17 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
 
list_for_each_entry(pdn, >child_list, list) {
if (pdn->busno == pdev->bus->number &&
-   pdn->devfn == pdev->devfn)
+   pdn->devfn == pdev->devfn) {
+   pdev->dev.archdata.pci_data = pdn;
return pdn;
+   }
}
 
-   return NULL;
+   return pci_create_pdn_from_dev(pdev, parent);
 }
 
-#ifdef CONFIG_PCI_IOV
-static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
-  int vf_index,
-  int busno, int devfn)
+static struct pci_dn *pci_alloc_pdn(struct pci_dn *parent,
+   int busno, int devfn)
 {
struct pci_dn *pdn;
 
@@ -143,7 +158,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn 
*parent,
pdn->parent = parent;
pdn->busno = busno;
pdn->devfn = devfn;
-   pdn->vf_index = vf_index;
pdn->pe_number = IODA_INVALID_PE;
INIT_LIST_HEAD(>child_list);
INIT_LIST_HEAD(>list);
@@ -151,7 +165,51 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn 
*parent,
 
return pdn;
 }
-#endif
+
+static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev,
+ struct pci_dn *parent)
+{
+   struct pci_dn *pdn = NULL;
+   u32 class_code;
+   u16 device_id;
+   u16 vendor_id;
+
+   if (!parent)
+   return NULL;
+
+   pdn = pci_alloc_pdn(parent, pdev->bus->busn_res.start, pdev->devfn);
+   pci_info(pdev, "Create a new pdn for devfn %2x\n", pdev->devfn / 8);
+
+   if (!pdn) {
+   pci_err(pdev, "%s: Failed to allocate pdn\n", __func__);
+   return NULL;
+   }
+
+   #ifdef CONFIG_EEH
+   if (!eeh_dev_init(pdn)) {
+   kfree(pdn);
+   pci_err(pdev, "%s: Failed to allocate edev\n", __func__);
+   return NULL;
+   }
+   #endif /* CONFIG_EEH */
+
+   pci_bus_read_config_word(pdev->bus, pdev->devfn,
+PCI_VENDOR_ID, _id);
+   pdn->vendor_id = vendor_id;
+
+   pci_bus_read_config_word(pdev->bus, pdev->devfn,
+PCI_DEVICE_ID, _id);
+   

[PATCH v6 21/30] powerpc/pci: Access PCI config space directly w/o pci_dn

2019-10-24 Thread Sergey Miroshnichenko
To fetch an updated DT for the newly hotplugged device, OS must explicitly
request it from the firmware via the pnv_php driver.

If pnv_php wasn't triggered/loaded, it is still possible to discover new
devices if PCIe I/O does not stop in the absence of the pci_dn structure.

Reviewed-by: Oliver O'Halloran 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/rtas_pci.c   | 97 +++-
 arch/powerpc/platforms/powernv/pci.c | 64 --
 2 files changed, 109 insertions(+), 52 deletions(-)

diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
index ae5e43eaca48..912da28b3737 100644
--- a/arch/powerpc/kernel/rtas_pci.c
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -42,10 +42,26 @@ static inline int config_access_valid(struct pci_dn *dn, 
int where)
return 0;
 }
 
-int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
+static int rtas_read_raw_config(unsigned long buid, int busno, unsigned int 
devfn,
+   int where, int size, u32 *val)
 {
int returnval = -1;
-   unsigned long buid, addr;
+   unsigned long addr = rtas_config_addr(busno, devfn, where);
+   int ret;
+
+   if (buid) {
+   ret = rtas_call(ibm_read_pci_config, 4, 2, ,
+   addr, BUID_HI(buid), BUID_LO(buid), size);
+   } else {
+   ret = rtas_call(read_pci_config, 2, 2, , addr, size);
+   }
+   *val = returnval;
+
+   return ret;
+}
+
+int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
+{
int ret;
 
if (!pdn)
@@ -58,16 +74,8 @@ int rtas_read_config(struct pci_dn *pdn, int where, int 
size, u32 *val)
return PCIBIOS_SET_FAILED;
 #endif
 
-   addr = rtas_config_addr(pdn->busno, pdn->devfn, where);
-   buid = pdn->phb->buid;
-   if (buid) {
-   ret = rtas_call(ibm_read_pci_config, 4, 2, ,
-   addr, BUID_HI(buid), BUID_LO(buid), size);
-   } else {
-   ret = rtas_call(read_pci_config, 2, 2, , addr, size);
-   }
-   *val = returnval;
-
+   ret = rtas_read_raw_config(pdn->phb->buid, pdn->busno, pdn->devfn,
+  where, size, val);
if (ret)
return PCIBIOS_DEVICE_NOT_FOUND;
 
@@ -85,18 +93,44 @@ static int rtas_pci_read_config(struct pci_bus *bus,
 
pdn = pci_get_pdn_by_devfn(bus, devfn);
 
-   /* Validity of pdn is checked in here */
-   ret = rtas_read_config(pdn, where, size, val);
-   if (*val == EEH_IO_ERROR_VALUE(size) &&
-   eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
-   return PCIBIOS_DEVICE_NOT_FOUND;
+   if (pdn) {
+   /* Validity of pdn is checked in here */
+   ret = rtas_read_config(pdn, where, size, val);
+
+   if (*val == EEH_IO_ERROR_VALUE(size) &&
+   eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
+   ret = PCIBIOS_DEVICE_NOT_FOUND;
+   } else {
+   struct pci_controller *phb = pci_bus_to_host(bus);
+
+   ret = rtas_read_raw_config(phb->buid, bus->number, devfn,
+  where, size, val);
+   }
 
return ret;
 }
 
+static int rtas_write_raw_config(unsigned long buid, int busno, unsigned int 
devfn,
+int where, int size, u32 val)
+{
+   unsigned long addr = rtas_config_addr(busno, devfn, where);
+   int ret;
+
+   if (buid) {
+   ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr,
+   BUID_HI(buid), BUID_LO(buid), size, (ulong)val);
+   } else {
+   ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, 
(ulong)val);
+   }
+
+   if (ret)
+   return PCIBIOS_DEVICE_NOT_FOUND;
+
+   return PCIBIOS_SUCCESSFUL;
+}
+
 int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val)
 {
-   unsigned long buid, addr;
int ret;
 
if (!pdn)
@@ -109,15 +143,8 @@ int rtas_write_config(struct pci_dn *pdn, int where, int 
size, u32 val)
return PCIBIOS_SET_FAILED;
 #endif
 
-   addr = rtas_config_addr(pdn->busno, pdn->devfn, where);
-   buid = pdn->phb->buid;
-   if (buid) {
-   ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr,
-   BUID_HI(buid), BUID_LO(buid), size, (ulong) val);
-   } else {
-   ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, 
(ulong)val);
-   }
-
+   ret = rtas_write_raw_config(pdn->phb->buid, pdn->busno, pdn->devfn,
+   where, size, val);
if (ret)
return PCIBIOS_DEVICE_NOT_FOUND;
 
@@ -128,12 +155,20 @@ static int rtas_pci_write_config(struct pci_bus *bus,
 un

[PATCH v6 20/30] powerpc/pci: Fix crash with enabled movable BARs

2019-10-24 Thread Sergey Miroshnichenko
Add a check for the UNSET resource flag to skip the released BARs

CC: Alexey Kardashevskiy 
CC: Oliver O'Halloran 
CC: Sam Bobroff 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c28d0d9b7ee0..33d5ed8c258f 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2976,7 +2976,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
int index;
int64_t rc;
 
-   if (!res || !res->flags || res->start > res->end)
+   if (!res || !res->flags || res->start > res->end ||
+   (res->flags & IORESOURCE_UNSET))
return;
 
if (res->flags & IORESOURCE_IO) {
-- 
2.23.0



[PATCH v6 14/30] PCI: Make sure bridge windows include their fixed BARs

2019-10-24 Thread Sergey Miroshnichenko
When the time comes to select a start address for the bridge window during
the root bus rescan, it should not be just the lowest possible address: this
window must cover all the underlying fixed and immovable BARs. The lowest
address that satisfies this requirement is the start of the .realloc_range
field of struct pci_bus, which is calculated while preparing for the rescan.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/bus.c   |  2 +-
 drivers/pci/setup-res.c | 31 +--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 8e40b3e6da77..a1efa87e31b9 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -192,7 +192,7 @@ static int pci_bus_alloc_from_region(struct pci_bus *bus, 
struct resource *res,
 * this is an already-configured bridge window, its start
 * overrides "min".
 */
-   if (avail.start)
+   if (min_used < avail.start)
min_used = avail.start;
 
max = avail.end;
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index a1657a8bf93d..1570bbd620cd 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -248,9 +248,23 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
struct resource *res = dev->resource + resno;
resource_size_t min;
int ret;
+   resource_size_t start = (resource_size_t)-1;
+   resource_size_t end = 0;
 
min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM;
 
+   if (dev->subordinate && resno >= PCI_BRIDGE_RESOURCES) {
+   struct pci_bus *child_bus = dev->subordinate;
+   int b_resno = resno - PCI_BRIDGE_RESOURCES;
+   struct resource *immovable_range = 
_bus->immovable_range[b_resno];
+
+   if (immovable_range->start < immovable_range->end) {
+   start = immovable_range->start;
+   end = immovable_range->end;
+   min = child_bus->realloc_range[b_resno].start;
+   }
+   }
+
/*
 * First, try exact prefetching match.  Even if a 64-bit
 * prefetchable bridge window is below 4GB, we can't put a 32-bit
@@ -262,7 +276,7 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
 IORESOURCE_PREFETCH | IORESOURCE_MEM_64,
 pcibios_align_resource, dev);
if (ret == 0)
-   return 0;
+   goto check_fixed;
 
/*
 * If the prefetchable window is only 32 bits wide, we can put
@@ -274,7 +288,7 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
 IORESOURCE_PREFETCH,
 pcibios_align_resource, dev);
if (ret == 0)
-   return 0;
+   goto check_fixed;
}
 
/*
@@ -287,6 +301,19 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
ret = pci_bus_alloc_resource(bus, res, size, align, min, 0,
 pcibios_align_resource, dev);
 
+check_fixed:
+   if (ret == 0 && start < end) {
+   if (res->start > start || res->end < end) {
+   dev_err(>dev, "fixed area 0x%llx-0x%llx for %s 
doesn't fit in the allocated %pR (0x%llx-0x%llx)",
+   (unsigned long long)start, (unsigned long 
long)end,
+   dev_name(>dev),
+   res, (unsigned long long)res->start,
+   (unsigned long long)res->end);
+   release_resource(res);
+   return -1;
+   }
+   }
+
return ret;
 }
 
-- 
2.23.0



[PATCH v6 16/30] PCI: hotplug: movable BARs: Assign fixed and immovable BARs before others

2019-10-24 Thread Sergey Miroshnichenko
Reassign resources during rescan in two steps: first the fixed/immovable
BARs and bridge windows that have fixed areas, so the movable ones will not
steal these reserved areas; then the rest - so the movable BARs will divide
the rest of the space.

With this change, pci_assign_resource() is now able to assign all types of
BARs, so pdev_assign_fixed_resources() has become unused and is thus removed.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   |  2 ++
 drivers/pci/setup-bus.c | 78 -
 drivers/pci/setup-res.c |  7 ++--
 3 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 7cd108885598..9b5164d10499 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -289,6 +289,8 @@ void pci_bus_put(struct pci_bus *bus);
 
 bool pci_dev_bar_movable(struct pci_dev *dev, struct resource *res);
 
+int assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r);
+
 /* PCIe link information */
 #define PCIE_SPEED2STR(speed) \
((speed) == PCIE_SPEED_16_0GT ? "16 GT/s" : \
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index c7365998fbd6..675a612236d7 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -38,6 +38,15 @@ struct pci_dev_resource {
unsigned long flags;
 };
 
+enum assign_step {
+   assign_fixed_resources,
+   assign_float_resources,
+};
+
+static void _assign_requested_resources_sorted(struct list_head *head,
+  struct list_head *fail_head,
+  enum assign_step step);
+
 static void free_list(struct list_head *head)
 {
struct pci_dev_resource *dev_res, *tmp;
@@ -278,19 +287,47 @@ static void reassign_resources_sorted(struct list_head 
*realloc_head,
  */
 static void assign_requested_resources_sorted(struct list_head *head,
 struct list_head *fail_head)
+{
+   _assign_requested_resources_sorted(head, fail_head, 
assign_fixed_resources);
+   _assign_requested_resources_sorted(head, fail_head, 
assign_float_resources);
+}
+
+static void _assign_requested_resources_sorted(struct list_head *head,
+  struct list_head *fail_head,
+  enum assign_step step)
 {
struct resource *res;
struct pci_dev_resource *dev_res;
int idx;
 
list_for_each_entry(dev_res, head, list) {
+   bool is_fixed = false;
+
if (!pci_dev_bars_enabled(dev_res->dev))
continue;
 
res = dev_res->res;
+   if (!resource_size(res))
+   continue;
+
idx = res - _res->dev->resource[0];
-   if (resource_size(res) &&
-   pci_assign_resource(dev_res->dev, idx)) {
+
+   if (idx < PCI_BRIDGE_RESOURCES) {
+   is_fixed = !pci_dev_bar_movable(dev_res->dev, res);
+   } else {
+   int b_res_idx = pci_get_bridge_resource_idx(res);
+   struct resource *fixed_res =
+   
_res->dev->subordinate->immovable_range[b_res_idx];
+
+   is_fixed = (fixed_res->start < fixed_res->end);
+   }
+
+   if (assign_fixed_resources == step && !is_fixed)
+   continue;
+   else if (assign_float_resources == step && is_fixed)
+   continue;
+
+   if (pci_assign_resource(dev_res->dev, idx)) {
if (fail_head) {
/*
 * If the failed resource is a ROM BAR and
@@ -1335,7 +1372,7 @@ void pci_bus_size_bridges(struct pci_bus *bus)
 }
 EXPORT_SYMBOL(pci_bus_size_bridges);
 
-static void assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r)
+int assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r)
 {
int i;
struct resource *parent_r;
@@ -1352,35 +1389,14 @@ static void assign_fixed_resource_on_bus(struct pci_bus 
*b, struct resource *r)
!(r->flags & IORESOURCE_PREFETCH))
continue;
 
-   if (resource_contains(parent_r, r))
-   request_resource(parent_r, r);
-   }
-}
-
-/*
- * Try to assign any resources marked as IORESOURCE_PCI_FIXED, as they are
- * skipped by pbus_assign_resources_sorted().
- */
-static void pdev_assign_fixed_resources(struct pci_dev *dev)
-{
-   int i;
-
-   for (i = 0; i <  PCI_NUM_RESOURCES; i++) {
-   struct pci_bus *b;
-   struct resource *r = >resource[i];
-
-   if (r->parent || !(r->flags & IORESOURCE_PCI_FIXED) ||
-   !(r->flags

[PATCH v6 19/30] PCI: hotplug: movable BARs: Ignore the MEM BAR offsets from bootloader

2019-10-24 Thread Sergey Miroshnichenko
BAR allocation by BIOS/UEFI/bootloader/firmware may be non-optimal and
it may even clash with the kernel's BAR assignment algorithm.

For example, if no space was reserved for SR-IOV BARs, and this bridge
window is packed between immovable BARs (so it is unable to extend),
and if this window can't be moved, the next PCI rescan will fail, as
the kernel tries to find a space for all the BARs, including SR-IOV.

With this patch the kernel will use its own methods of BAR allocating
when possible, increasing the chances of successful hotplug.

Also add a workaround for implicitly used video BARs on x86.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/probe.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 94bbdf9b9dc1..73452aa81417 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -305,6 +305,16 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type 
type,
 pos, (unsigned long long)region.start);
}
 
+   if (pci_can_move_bars &&
+   !(res->flags & IORESOURCE_IO) &&
+   (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
+   pci_warn(dev, "ignore the current offset of BAR %llx-%llx\n",
+l64, l64 + sz64 - 1);
+   res->start = 0;
+   res->end = sz64 - 1;
+   res->flags |= IORESOURCE_SIZEALIGN;
+   }
+
goto out;
 
 
-- 
2.23.0



[PATCH v6 18/30] PCI: hotplug: Configure MPS for hot-added bridges during bus rescan

2019-10-24 Thread Sergey Miroshnichenko
Ensure that MPS settings are set up for bridges which are discovered during
a manually triggered rescan via sysfs. This sequence of bridge init (using
pci_rescan_bus()) will be used for pciehp hot-add events when BARs are
movable.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/probe.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index d0d00cb3e965..94bbdf9b9dc1 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3414,7 +3414,7 @@ static void pci_reassign_root_bus_resources(struct 
pci_bus *root)
 unsigned int pci_rescan_bus(struct pci_bus *bus)
 {
unsigned int max;
-   struct pci_bus *root = bus;
+   struct pci_bus *root = bus, *child;
 
while (!pci_is_root_bus(root))
root = root->parent;
@@ -3435,6 +3435,9 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
pci_assign_unassigned_bus_resources(bus);
}
 
+   list_for_each_entry(child, >children, node)
+   pcie_bus_configure_settings(child);
+
pci_bus_add_devices(bus);
 
return max;
-- 
2.23.0



[PATCH v6 17/30] PCI: hotplug: movable BARs: Don't reserve IO/mem bus space

2019-10-24 Thread Sergey Miroshnichenko
A hotplugged bridge with many hotplug-capable ports may request
reserving more IO space than the machine has. This could be overridden
with the "hpiosize=" kernel argument though.

But when BARs are movable, there is no need to reserve space anymore:
new BARs are allocated not from reserved gaps, but via rearranging the
existing BARs. Requesting a precise amount of space for bridge windows
increases the chances of adding the new bridge successfully.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 675a612236d7..a68ec726010e 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1285,7 +1285,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
 
case PCI_HEADER_TYPE_BRIDGE:
pci_bridge_check_ranges(bus);
-   if (bus->self->is_hotplug_bridge) {
+   if (bus->self->is_hotplug_bridge && !pci_can_move_bars) {
additional_io_size  = pci_hotplug_io_size;
additional_mem_size = pci_hotplug_mem_size;
}
-- 
2.23.0



[PATCH v6 15/30] PCI: Fix assigning the fixed prefetchable resources

2019-10-24 Thread Sergey Miroshnichenko
Allow matching IORESOURCE_PCI_FIXED prefetchable BARs to non-prefetchable
windows, so they follow the same rules as immovable BARs.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 653ba4d5f191..c7365998fbd6 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1339,15 +1339,20 @@ static void assign_fixed_resource_on_bus(struct pci_bus 
*b, struct resource *r)
 {
int i;
struct resource *parent_r;
-   unsigned long mask = IORESOURCE_IO | IORESOURCE_MEM |
-IORESOURCE_PREFETCH;
+   unsigned long mask = IORESOURCE_TYPE_BITS;
 
pci_bus_for_each_resource(b, parent_r, i) {
if (!parent_r)
continue;
 
-   if ((r->flags & mask) == (parent_r->flags & mask) &&
-   resource_contains(parent_r, r))
+   if ((r->flags & mask) != (parent_r->flags & mask))
+   continue;
+
+   if (parent_r->flags & IORESOURCE_PREFETCH &&
+   !(r->flags & IORESOURCE_PREFETCH))
+   continue;
+
+   if (resource_contains(parent_r, r))
request_resource(parent_r, r);
}
 }
-- 
2.23.0



[PATCH v6 13/30] PCI: hotplug: movable BARs: Compute limits for relocated bridge windows

2019-10-24 Thread Sergey Miroshnichenko
With movable BARs enabled, bridge windows are recalculated during each PCI
rescan. Some of the BARs below the bridge may be fixed/immovable: these
areas are represented by the .immovable_range field in struct pci_bus.

If a bridge window size is equal to its immovable range, it can only be
assigned to the start of this range. But if a bridge window size is larger,
and this difference in size is denoted as "delta", the window can start
from (immovable_range.start - delta) to (immovable_range.start), and it can
end from (immovable_range.end) to (immovable_range.end + delta). This range
(the new .realloc_range field in struct pci_bus) must then be compared with
immovable ranges of neighbouring bridges to guarantee no intersections.

This patch only calculates valid ranges for reallocated bridges during pci
rescan, and the next one will make use of these values during allocation.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 67 +
 include/linux/pci.h |  6 
 2 files changed, 73 insertions(+)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a7546e02ea7c..653ba4d5f191 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1819,6 +1819,72 @@ static enum enable_type pci_realloc_detect(struct 
pci_bus *bus,
 }
 #endif
 
+/*
+ * Calculate the address margins where the bridge windows may be allocated to 
fit all
+ * the fixed and immovable BARs beneath.
+ */
+static void pci_bus_update_realloc_range(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   struct pci_bus *parent = bus->parent;
+   int idx;
+
+   list_for_each_entry(dev, >devices, bus_list)
+   if (dev->subordinate)
+   pci_bus_update_realloc_range(dev->subordinate);
+
+   if (!parent || !bus->self)
+   return;
+
+   for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) {
+   struct resource *immovable_range = >immovable_range[idx];
+   resource_size_t window_size = resource_size(bus->resource[idx]);
+   resource_size_t realloc_start, realloc_end;
+
+   bus->realloc_range[idx].start = 0;
+   bus->realloc_range[idx].end = 0;
+
+   /* Check if there any immovable BARs under the bridge */
+   if (immovable_range->start >= immovable_range->end)
+   continue;
+
+   /* The lowest possible address where the bridge window can 
start */
+   realloc_start = immovable_range->end - window_size + 1;
+   /* The highest possible address where the bridge window can end 
*/
+   realloc_end = immovable_range->start + window_size - 1;
+
+   if (realloc_start > immovable_range->start)
+   realloc_start = immovable_range->start;
+
+   if (realloc_end < immovable_range->end)
+   realloc_end = immovable_range->end;
+
+   /*
+* Check that realloc range doesn't intersect with hard fixed 
ranges
+* of neighboring bridges
+*/
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *neighbor = dev->subordinate;
+   struct resource *n_imm_range;
+
+   if (!neighbor || neighbor == bus)
+   continue;
+
+   n_imm_range = >immovable_range[idx];
+
+   if (n_imm_range->start >= n_imm_range->end)
+   continue;
+
+   if (n_imm_range->end < immovable_range->start &&
+   n_imm_range->end > realloc_start)
+   realloc_start = n_imm_range->end;
+   }
+
+   bus->realloc_range[idx].start = realloc_start;
+   bus->realloc_range[idx].end = realloc_end;
+   }
+}
+
 /*
  * First try will not touch PCI bridge res.
  * Second and later try will clear small leaf bridge res.
@@ -1838,6 +1904,7 @@ void pci_assign_unassigned_root_bus_resources(struct 
pci_bus *bus)
 
if (pci_can_move_bars) {
__pci_bus_size_bridges(bus, NULL);
+   pci_bus_update_realloc_range(bus);
__pci_bus_assign_resources(bus, NULL, NULL);
 
goto dump;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ef41be0ce082..e1edcb3fad31 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -587,6 +587,12 @@ struct pci_bus {
 */
struct resource immovable_range[PCI_BRIDGE_RESOURCE_NUM];
 
+   /*
+* Acceptable address range, where the bridge window may reside, 
considering its
+* size, so it will cover all the fixed and immovable BARs below.
+*/
+   struct r

[PATCH v6 12/30] PCI: hotplug: movable BARs: Calculate immovable parts of bridge windows

2019-10-24 Thread Sergey Miroshnichenko
When movable BARs are enabled, and if a bridge contains a device with fixed
(IORESOURCE_PCI_FIXED) or immovable BARs, the corresponding windows can't be
moved too far away from their original positions - they must still contain
all the fixed/immovable BARs, like that:

  1) Window position before a bus rescan:

  | <--root bridge window--> |
  |  |
  | | <-- bridge window--> | |
  | | movable BARs | **fixed BAR** | |

  2) Possible valid outcome after rescan and move:

  | <--root bridge window--> |
  |  |
  || <-- bridge window--> |  |
  || **fixed BAR** | Movable BARs |  |

An immovable area of a bridge (separate for IO, MEM and MEM64 window types)
is a range that covers all the fixed and immovable BARs of direct children,
and all the fixed area of children bridges:

  | <--root bridge window--> |
  |  |
  |  | <--  bridge window level 1--> |   |
  |  |  immovable area of this bridge window |   |
  |  |   |   |
  |  | **fixed BAR**  | <--  bridge window level 2--> | BARs |   |
  |  || * fixed area of this bridge * |  |   |
  |  ||   |  |   |
  |  || ***fixed BAR*** |   | ***fixed BAR*** |  |   |

To store these areas, the .immovable_range field has been added to struct
pci_bus. It is filled recursively from leaves to the root before a rescan.

Also make pbus_size_io() and pbus_size_mem() return their usual result OR
the size of the immovable range of the corresponding type, whichever is
larger.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   | 14 +++
 drivers/pci/probe.c | 88 +
 drivers/pci/setup-bus.c | 17 
 include/linux/pci.h |  6 +++
 4 files changed, 125 insertions(+)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 55344f2c55bf..7cd108885598 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -401,6 +401,20 @@ static inline bool pci_dev_is_disconnected(const struct 
pci_dev *dev)
return dev->error_state == pci_channel_io_perm_failure;
 }
 
+static inline int pci_get_bridge_resource_idx(struct resource *r)
+{
+   int idx = 1;
+
+   if (r->flags & IORESOURCE_IO)
+   idx = 0;
+   else if (!(r->flags & IORESOURCE_PREFETCH))
+   idx = 1;
+   else if (r->flags & IORESOURCE_MEM_64)
+   idx = 2;
+
+   return idx;
+}
+
 /* pci_dev priv_flags */
 #define PCI_DEV_ADDED 0
 #define PCI_DEV_DISABLED_BARS 1
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 2d1157493e6a..d0d00cb3e965 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -545,6 +545,7 @@ void pci_read_bridge_bases(struct pci_bus *child)
 static struct pci_bus *pci_alloc_bus(struct pci_bus *parent)
 {
struct pci_bus *b;
+   int idx;
 
b = kzalloc(sizeof(*b), GFP_KERNEL);
if (!b)
@@ -561,6 +562,11 @@ static struct pci_bus *pci_alloc_bus(struct pci_bus 
*parent)
if (parent)
b->domain_nr = parent->domain_nr;
 #endif
+   for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) {
+   b->immovable_range[idx].start = 0;
+   b->immovable_range[idx].end = 0;
+   }
+
return b;
 }
 
@@ -3238,6 +3244,87 @@ static void pci_setup_bridges(struct pci_bus *bus)
pci_setup_bridge(bus);
 }
 
+static void pci_bus_update_immovable_range(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   int idx;
+   resource_size_t start, end;
+
+   for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) {
+   bus->immovable_range[idx].start = 0;
+   bus->immovable_range[idx].end = 0;
+   }
+
+   list_for_each_entry(dev, >devices, bus_list)
+   if (dev->subordinate)
+   pci_bus_update_immovable_range(dev->subordinate);
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   int i;
+   struct pci_bus *child = dev->subordinate;
+
+   for (i = 0; i < PCI_BRIDGE_RESOURCES; ++i) {
+   struct resource *r = >resource[i];
+
+   if (!r->flags || (r->flags & IORESOURCE_UNSET) || 
!r->parent)
+  

[PATCH v6 11/30] PCI: hotplug: movable BARs: Try to assign unassigned resources only once

2019-10-24 Thread Sergey Miroshnichenko
With enabled BAR movement, BARs and bridge windows can only be assigned to
their direct parents, so there can be only one variant of resource tree,
thus every retry within the pci_assign_unassigned_root_bus_resources() will
result in the same tree, and it is enough to try just once.

In case of failures the pci_reassign_root_bus_resources() disables BARs for
one of the hotplugged devices and tries the assignment again.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index cf325daae1b1..3deb1c343e89 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1819,6 +1819,13 @@ void pci_assign_unassigned_root_bus_resources(struct 
pci_bus *bus)
int pci_try_num = 1;
enum enable_type enable_local;
 
+   if (pci_can_move_bars) {
+   __pci_bus_size_bridges(bus, NULL);
+   __pci_bus_assign_resources(bus, NULL, NULL);
+
+   goto dump;
+   }
+
/* Don't realloc if asked to do so */
enable_local = pci_realloc_detect(bus, pci_realloc_enable);
if (pci_realloc_enabled(enable_local)) {
-- 
2.23.0



[PATCH v6 10/30] PCI: Prohibit assigning BARs and bridge windows to non-direct parents

2019-10-24 Thread Sergey Miroshnichenko
When movable BARs are enabled, the feature of resource relocating from
commit 2bbc6942273b5 ("PCI : ability to relocate assigned pci-resources")
is not used. Instead, inability to assign a resource is used as a signal
to retry BAR assignment with other configuration of bridge windows.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c |  2 ++
 drivers/pci/setup-res.c | 12 
 2 files changed, 14 insertions(+)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index ff33b47b1bb7..cf325daae1b1 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1355,6 +1355,8 @@ static void pdev_assign_fixed_resources(struct pci_dev 
*dev)
while (b && !r->parent) {
assign_fixed_resource_on_bus(b, r);
b = b->parent;
+   if (!r->parent && pci_can_move_bars)
+   break;
}
}
 }
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index d8ca40a97693..a1657a8bf93d 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -298,6 +298,18 @@ static int _pci_assign_resource(struct pci_dev *dev, int 
resno,
 
bus = dev->bus;
while ((ret = __pci_assign_resource(bus, dev, resno, size, min_align))) 
{
+   if (pci_can_move_bars) {
+   if (resno >= PCI_BRIDGE_RESOURCES &&
+   resno <= PCI_BRIDGE_RESOURCE_END) {
+   struct resource *res = dev->resource + resno;
+
+   res->start = 0;
+   res->end = 0;
+   res->flags = 0;
+   }
+   break;
+   }
+
if (!bus->parent || !bus->self->transparent)
break;
bus = bus->parent;
-- 
2.23.0



[PATCH v6 09/30] PCI: Include fixed and immovable BARs into the bus size calculating

2019-10-24 Thread Sergey Miroshnichenko
The only difference between the fixed/immovable and movable BARs is a size
and offset preservation after they are released (the corresponding struct
resource* detached from a bridge window for a while during a bus rescan).

Include fixed/immovable BARs into result of pbus_size_mem() and prohibit
assigning them to non-direct parents.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 4b538d132958..ff33b47b1bb7 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1011,12 +1011,20 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
struct resource *r = >resource[i];
resource_size_t r_size;
 
-   if (r->parent || (r->flags & IORESOURCE_PCI_FIXED) ||
+   if (r->parent ||
((r->flags & mask) != type &&
 (r->flags & mask) != type2 &&
 (r->flags & mask) != type3))
continue;
r_size = resource_size(r);
+
+   if (!pci_dev_bar_movable(dev, r)) {
+   if (pci_can_move_bars)
+   size += r_size;
+
+   continue;
+   }
+
 #ifdef CONFIG_PCI_IOV
/* Put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
-- 
2.23.0



[PATCH v6 06/30] PCI: hotplug: movable BARs: Recalculate all bridge windows during rescan

2019-10-24 Thread Sergey Miroshnichenko
When the movable BARs feature is enabled and a rescan has been requested,
release all the bridge windows and recalculate them from scratch, taking
into account all kinds of BARs: fixed, immovable, movable, new.

This increases the chances to find a memory space to fit BARs for newly
hotplugged devices, especially if no/not enough gaps were reserved by the
BIOS/bootloader/firmware.

The last step of writing the recalculated windows to the bridges is done
by the new pci_setup_bridges() function.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   |  1 +
 drivers/pci/probe.c | 22 ++
 drivers/pci/setup-bus.c | 16 
 3 files changed, 39 insertions(+)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 19bc50597d12..4a3f2b69285b 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -280,6 +280,7 @@ void __pci_bus_assign_resources(const struct pci_bus *bus,
struct list_head *realloc_head,
struct list_head *fail_head);
 bool pci_bus_clip_resource(struct pci_dev *dev, int idx);
+void pci_bus_release_root_bridge_resources(struct pci_bus *bus);
 
 void pci_reassigndev_resource_alignment(struct pci_dev *dev);
 void pci_disable_bridge_window(struct pci_dev *dev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 3d8c0f653378..d2dbec51c4df 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3200,6 +3200,25 @@ static void pci_bus_rescan_done(struct pci_bus *bus)
pci_config_pm_runtime_put(bus->self);
 }
 
+static void pci_setup_bridges(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *child;
+
+   if (!pci_dev_is_added(dev))
+   continue;
+
+   child = dev->subordinate;
+   if (child)
+   pci_setup_bridges(child);
+   }
+
+   if (bus->self)
+   pci_setup_bridge(bus);
+}
+
 /**
  * pci_rescan_bus - Scan a PCI bus for devices
  * @bus: PCI bus to scan
@@ -3221,8 +3240,11 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
pci_bus_rescan_prepare(root);
 
max = pci_scan_child_bus(root);
+
+   pci_bus_release_root_bridge_resources(root);
pci_assign_unassigned_root_bus_resources(root);
 
+   pci_setup_bridges(root);
pci_bus_rescan_done(root);
} else {
max = pci_scan_child_bus(bus);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index f2f02e6c9000..075e8185b936 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1635,6 +1635,22 @@ static void pci_bus_release_bridge_resources(struct 
pci_bus *bus,
pci_bridge_release_resources(bus, type);
 }
 
+void pci_bus_release_root_bridge_resources(struct pci_bus *root_bus)
+{
+   int i;
+   struct resource *r;
+
+   pci_bus_release_bridge_resources(root_bus, IORESOURCE_IO, 
whole_subtree);
+   pci_bus_release_bridge_resources(root_bus, IORESOURCE_MEM, 
whole_subtree);
+   pci_bus_release_bridge_resources(root_bus,
+IORESOURCE_MEM_64 | 
IORESOURCE_PREFETCH,
+whole_subtree);
+
+   pci_bus_for_each_resource(root_bus, r, i) {
+   pci_release_child_resources(root_bus, r);
+   }
+}
+
 static void pci_bus_dump_res(struct pci_bus *bus)
 {
struct resource *res;
-- 
2.23.0



[PATCH v6 08/30] PCI: hotplug: movable BARs: Don't allow added devices to steal resources

2019-10-24 Thread Sergey Miroshnichenko
When movable BARs are enabled, the PCI subsystem at first releases all the
bridge windows and then attempts to assign resources both to previously
working devices and to the newly hotplugged ones, with the same priority.

If a hotplugged device gets its BARs first, this may lead to lack of space
for already working devices, which is unacceptable. If that happens, mark
one of the new devices with the newly introduced flag PCI_DEV_DISABLED_BARS
(if it is not yet marked) and retry the BAR recalculation.

The worst case would be no BARs for hotplugged devices, while all the rest
just continue working.

The algorithm is simple and it doesn't retry different subsets of hot-added
devices in case of a failure, e.g. if there is no space to allocate BARs
for both hotplugged devices A and B, but there is enough for just A, then A
will be marked with PCI_DEV_DISABLED_BARS first, then (after the next
failure) - B. As a result, A will not get BARs while it could. This issue is
only relevant when hotplugging two or more devices simultaneously.

Add a new res_mask bitmask to the struct pci_dev for storing the indices of
assigned BARs.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   |  11 +
 drivers/pci/probe.c | 102 ++--
 drivers/pci/setup-bus.c |  15 ++
 include/linux/pci.h |   1 +
 4 files changed, 126 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4a3f2b69285b..55344f2c55bf 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -403,6 +403,7 @@ static inline bool pci_dev_is_disconnected(const struct 
pci_dev *dev)
 
 /* pci_dev priv_flags */
 #define PCI_DEV_ADDED 0
+#define PCI_DEV_DISABLED_BARS 1
 
 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
 {
@@ -414,6 +415,16 @@ static inline bool pci_dev_is_added(const struct pci_dev 
*dev)
return test_bit(PCI_DEV_ADDED, >priv_flags);
 }
 
+static inline void pci_dev_disable_bars(struct pci_dev *dev)
+{
+   assign_bit(PCI_DEV_DISABLED_BARS, >priv_flags, true);
+}
+
+static inline bool pci_dev_bars_enabled(const struct pci_dev *dev)
+{
+   return !test_bit(PCI_DEV_DISABLED_BARS, >priv_flags);
+}
+
 #ifdef CONFIG_PCIEAER
 #include 
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index d2dbec51c4df..2d1157493e6a 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3162,6 +3162,23 @@ bool pci_dev_bar_movable(struct pci_dev *dev, struct 
resource *res)
return pci_dev_movable(dev, res->child);
 }
 
+static unsigned int pci_dev_count_res_mask(struct pci_dev *dev)
+{
+   unsigned int res_mask = 0;
+   int i;
+
+   for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) {
+   struct resource *r = >resource[i];
+
+   if (!r->flags || (r->flags & IORESOURCE_UNSET) || !r->parent)
+   continue;
+
+   res_mask |= (1 << i);
+   }
+
+   return res_mask;
+}
+
 static void pci_bus_rescan_prepare(struct pci_bus *bus)
 {
struct pci_dev *dev;
@@ -3172,6 +3189,8 @@ static void pci_bus_rescan_prepare(struct pci_bus *bus)
list_for_each_entry(dev, >devices, bus_list) {
struct pci_bus *child = dev->subordinate;
 
+   dev->res_mask = pci_dev_count_res_mask(dev);
+
if (child)
pci_bus_rescan_prepare(child);
 
@@ -3207,7 +3226,7 @@ static void pci_setup_bridges(struct pci_bus *bus)
list_for_each_entry(dev, >devices, bus_list) {
struct pci_bus *child;
 
-   if (!pci_dev_is_added(dev))
+   if (!pci_dev_is_added(dev) || !pci_dev_bars_enabled(dev))
continue;
 
child = dev->subordinate;
@@ -3219,6 +3238,83 @@ static void pci_setup_bridges(struct pci_bus *bus)
pci_setup_bridge(bus);
 }
 
+static struct pci_dev *pci_find_next_new_device(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   if (!bus)
+   return NULL;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *child_bus = dev->subordinate;
+
+   if (!pci_dev_is_added(dev) && pci_dev_bars_enabled(dev))
+   return dev;
+
+   if (child_bus) {
+   struct pci_dev *next_new_dev;
+
+   next_new_dev = pci_find_next_new_device(child_bus);
+   if (next_new_dev)
+   return next_new_dev;
+   }
+   }
+
+   return NULL;
+}
+
+static bool pci_bus_check_all_bars_reassigned(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   bool ret = true;
+
+   if (!bus)
+   return false;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *child = dev->subordinate;
+   unsigned int res_mask = pci_

[PATCH v6 05/30] PCI: hotplug: movable BARs: Fix reassigning the released bridge windows

2019-10-24 Thread Sergey Miroshnichenko
When a bridge window is temporarily released during the rescan, its old
size is not relevant anymore - it will be recreated from pbus_size_*(), so
its start value should be zero.

If such window can't be reassigned, don't apply reset_resource(), so the
next retry may succeed.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 2c02eb1acf5d..f2f02e6c9000 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -295,7 +295,8 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
0 /* don't care */,
0 /* don't care */);
}
-   reset_resource(res);
+   if (!pci_can_move_bars)
+   reset_resource(res);
}
}
 }
@@ -1579,8 +1580,8 @@ static void pci_bridge_release_resources(struct pci_bus 
*bus,
type = old_flags = r->flags & PCI_RES_TYPE_MASK;
pci_info(dev, "resource %d %pR released\n",
 PCI_BRIDGE_RESOURCES + idx, r);
-   /* Keep the old size */
-   r->end = resource_size(r) - 1;
+   /* Don't keep the old size if the bridge will be recalculated */
+   r->end = pci_can_move_bars ? 0 : (resource_size(r) - 1);
r->start = 0;
r->flags = 0;
 
-- 
2.23.0



[PATCH v6 07/30] PCI: hotplug: movable BARs: Don't disable the released bridge windows

2019-10-24 Thread Sergey Miroshnichenko
On a hotplug event with enabled BAR movement, calculating the new bridge
windows takes some time. During this procedure, the structures that
represent these windows are released - marked for recalculation.

When new bridge windows are ready, they are written to the registers of
every bridge via pci_setup_bridges().

Currently, bridge's registers are updated immediately after releasing a
window to disable it. But if a driver doesn't yet support movable BARs, it
doesn't stop MEM transactions during the hotplug, so disabled bridge
windows will break them.

Let the bridge windows remain operating after releasing, as they will be
updated to the new values at the end of a hotplug event.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 075e8185b936..381ce964cb20 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1588,7 +1588,8 @@ static void pci_bridge_release_resources(struct pci_bus 
*bus,
/* Avoiding touch the one without PREF */
if (type & IORESOURCE_PREFETCH)
type = IORESOURCE_PREFETCH;
-   __pci_setup_bridge(bus, type);
+   if (!pci_can_move_bars)
+   __pci_setup_bridge(bus, type);
/* For next child res under same bridge */
r->flags = old_flags;
}
-- 
2.23.0



[PATCH v6 04/30] PCI: Define PCI-specific version of the release_child_resources()

2019-10-24 Thread Sergey Miroshnichenko
If the bridge resources are released with the standard
release_child_resources(), it resets the .start field of the children's BARs
to zero while leaving the STARTALIGN flag set, which makes the resource
invalid for reassignment.

Some resources must preserve their offset and size: those marked with the
PCI_FIXED and the immovable ones - which are bound by drivers without
support of the movable BARs feature.

Add the pci_release_child_resources() to replace release_child_resources()
in handling the described PCI-specific cases.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 54 -
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index e7dbe21705ba..2c02eb1acf5d 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1482,6 +1482,54 @@ static void __pci_bridge_assign_resources(const struct 
pci_dev *bridge,
(IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH |\
 IORESOURCE_MEM_64)
 
+/*
+ * Similar to generic release_child_resources(), but aware of immovable BARs 
and
+ * PCI_FIXED and STARTALIGN flags
+ */
+static void pci_release_child_resources(struct pci_bus *bus, struct resource 
*r)
+{
+   struct pci_dev *dev;
+
+   if (!bus || !r)
+   return;
+
+   if (r->flags & IORESOURCE_PCI_FIXED)
+   return;
+
+   r->child = NULL;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   int i;
+
+   for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+   struct resource *tmp = >resource[i];
+   resource_size_t size = resource_size(tmp);
+
+   if (!tmp->flags || tmp->parent != r)
+   continue;
+
+   tmp->parent = NULL;
+   tmp->sibling = NULL;
+
+   pci_release_child_resources(dev->subordinate, tmp);
+
+   tmp->flags &= ~IORESOURCE_STARTALIGN;
+   tmp->flags |= IORESOURCE_SIZEALIGN;
+
+   if (!pci_dev_bar_movable(dev, tmp)) {
+   pci_dbg(dev, "release immovable %pR (%s), keep 
its flags, base and size\n",
+   tmp, tmp->name);
+   continue;
+   }
+
+   pci_dbg(dev, "release %pR (%s)\n", tmp, tmp->name);
+
+   tmp->start = 0;
+   tmp->end = size - 1;
+   }
+   }
+}
+
 static void pci_bridge_release_resources(struct pci_bus *bus,
 unsigned long type)
 {
@@ -1522,7 +1570,11 @@ static void pci_bridge_release_resources(struct pci_bus 
*bus,
return;
 
/* If there are children, release them all */
-   release_child_resources(r);
+   if (pci_can_move_bars)
+   pci_release_child_resources(bus, r);
+   else
+   release_child_resources(r);
+
if (!release_resource(r)) {
type = old_flags = r->flags & PCI_RES_TYPE_MASK;
pci_info(dev, "resource %d %pR released\n",
-- 
2.23.0



[PATCH v6 00/30] PCI: Allow BAR movement during hotplug

2019-10-24 Thread Sergey Miroshnichenko
Currently PCI hotplug works on top of resources, which are usually reserved
not by the kernel, but by BIOS, bootloader, firmware, etc. These resources
are gaps in the address space where BARs of new devices may fit, and extra
bus numbers per port, so bridges can be hot-added. This series addresses the
former problem: it teaches the kernel how to redistribute the address space
at runtime, so hotplug becomes predictable and cross-platform. A follow-up
patchset will
propose a solution for bus numbers.

If the memory is arranged in a way that doesn't provide enough space for
BARs of a new hotplugged device, the kernel can pause the drivers of the
"obstructing" devices and move their BARs, so the new BARs can fit into the
freed spaces.

To rearrange the BARs and bridge windows these patches release all of them
after a rescan and re-assigns in the same way as during the initial PCIe
topology scan at system boot.

When a driver is un-paused by the kernel after the PCIe rescan, it should
ioremap() the new addresses of its BARs.

Drivers indicate their support of the feature by implementing the new hooks
.rescan_prepare() and .rescan_done() in the struct pci_driver. If a driver
doesn't yet support the feature, BARs of its devices will be considered as
immovable (by checking the pci_dev_movable_bars_supported(dev)) and handled
in the same way as resources with the IORESOURCE_PCI_FIXED flag.

If a driver doesn't yet support the feature, its devices are guaranteed to
have their BARs remaining untouched.

Tested on:
 - x86_64 with "pci=pcie_bus_peer2peer"
 - POWER8 PowerNV+OPAL+PHB3 ppc64le with "pci=pcie_bus_peer2peer".

This patchset is a part of our work on adding support for hotplugging
bridges full of other bridges, NVME drives, SAS HBAs and GPUs without
special requirements such as Hot-Plug Controller, reservation of bus
numbers or memory regions by firmware, etc.

Changes since v5:
 - Simplified the disable flag, now it is "pci=no_movable_bars";
 - More deliberate marking the BARs as immovable;
 - Mark as immovable BARs which are used by unbound drivers;
 - Ignoring BAR assignment by non-kernel program components, so the kernel
   is able now to distribute BARs in optimal and predictable way;
 - Move here PowerNV-specific patches from the older "powerpc/powernv/pci:
   Make hotplug self-sufficient, independent of FW and DT" series;
 - Fix EEH cache rebuilding and PE allocation for PowerNV during rescan.

Changes since v4:
 - Feature is enabled by default (turned on by one of the latest patches);
 - Add pci_dev_movable_bars_supported(dev) instead of marking the immovable
   BARs with the IORESOURCE_PCI_FIXED flag;
 - Set up PCIe bridges during rescan via sysfs, so MPS settings are now
   configured not only during system boot or pcihp events;
 - Allow movement of switch's BARs if claimed by portdrv;
 - Update EEH address caches after rescan for powerpc;
 - Don't disable completely hot-added devices which can't have BARs being
   fit - just disable their BARs, so they are still visible in lspci etc;
 - Clearer names: fixed_range_hard -> immovable_range, fixed_range_soft ->
   realloc_range;
 - Drop the patch for pci_restore_config_space() - fixed by properly using
   the runtime PM.

Changes since v3:
 - Rebased to the upstream, so the patches apply cleanly again.

Changes since v2:
 - Fixed double-assignment of bridge windows;
 - Fixed assignment of fixed prefetched resources;
 - Fixed releasing of fixed resources;
 - Fixed a debug message;
 - Removed auto-enabling the movable BARs for x86 - let's rely on the
   "pcie_movable_bars=force" option for now;
 - Reordered the patches - bugfixes first.

Changes since v1:
 - Add a "pcie_movable_bars={ off | force }" command line argument;
 - Handle the IORESOURCE_PCI_FIXED flag properly;
 - Don't move BARs of devices which don't support the feature;
 - Guarantee that new hotplugged devices will not steal memory from working
   devices by ignoring the failing new devices with the new PCI_DEV_IGNORE
   flag;
 - Add rescan_prepare()+rescan_done() to the struct pci_driver instead of
   using the reset_prepare()+reset_done() from struct pci_error_handlers;
 - Add a bugfix of a race condition;
 - Fixed hotplug in a non-pre-enabled (by BIOS/firmware) bridge;
 - Fix the compatibility of the feature with pm_runtime and D3-state;
 - Hotplug events from pciehp also can move BARs;
 - Add support of the feature to the NVME driver.

Sergey Miroshnichenko (30):
  PCI: Fix race condition in pci_enable/disable_device()
  PCI: Enable bridge's I/O and MEM access for hotplugged devices
  PCI: hotplug: Add a flag for the movable BARs feature
  PCI: Define PCI-specific version of the release_child_resources()
  PCI: hotplug: movable BARs: Fix reassigning the released bridge
windows
  PCI: hotplug: movable BARs: Recalculate all bridge windows during
rescan
  PCI: hotplug: movable BARs: Don't disable the released bridge windows
  

[PATCH v6 03/30] PCI: hotplug: Add a flag for the movable BARs feature

2019-10-24 Thread Sergey Miroshnichenko
When hot-adding a device, the bridge may have windows not big enough (or
fragmented too much) for newly requested BARs to fit in. And expanding
these bridge windows may be impossible because blocked by "neighboring"
BARs and bridge windows.

Still, it may be possible to allocate a memory region for new BARs with the
following procedure:

1) notify all the drivers which support movable BARs to pause and release
   the BARs; the rest of the drivers are guaranteed that their devices will
   not get BARs moved;

2) release all the bridge windows and movable BARs;

3) try to recalculate new bridge windows that will fit all the BAR types:
   - fixed;
   - immovable;
   - movable;
   - newly requested by hot-added devices;

4) if the previous step fails, disable BARs for one of the hot-added
   devices and retry from step 3;

5) notify the drivers, so they remap BARs and resume.

If bridge calculation and BAR assignment fail with hot-added devices,
BARs of these devices will be disabled, falling back to the same amount and
size of BARs as they were before the hotplug event. The kernel succeeded in
assigning them before, so the same algorithm will provide the same results
again.

This makes the prior reservation of memory by BIOS/bootloader/firmware not
required anymore for the PCI hotplug.

Drivers indicate their support of movable BARs by implementing the new
.rescan_prepare() and .rescan_done() hooks in the struct pci_driver. All
device's activity must be paused during a rescan, and iounmap()+ioremap()
must be applied to every used BAR.

If a device is not bound to a driver, its BARs are considered movable.

For a higher probability of the successful BAR reassignment, all the BARs
and bridge windows should be released before the rescan, not only those
with higher addresses.

One example when it is needed, BAR(I) is moved to free a gap for the new
BAR(II):

  Before:
  parent bridge window ===
    hotplug bridge window 
|   BAR(I)|   fixed BAR   |   fixed BAR   | fixed BAR |
   ^
   |
   new BAR(II)

  After:
  parent bridge window =
 --- hotplug bridge window ---
| new BAR(II) |   fixed BAR   |   fixed BAR   | fixed BAR | BAR(I)  |

Another example is a fragmented bridge window jammed between fixed BARs:

  Before:
 = parent bridge window 
 -- hotplug bridge window --
| fixed BAR |   | BAR(I) || BAR(II) || BAR(III) | fixed BAR |
   ^
   |
   new BAR(IV)

 After:
  parent bridge window =
 -- hotplug bridge window --
| fixed BAR | BAR(I) | BAR(II) | BAR(III) | new BAR(IV) | fixed BAR |

This patch is a preparation for future patches with actual implementation,
and for now it just does the following:
 - declares the feature;
 - defines the bool pci_can_move_bars and bool pci_dev_bar_movable(dev);
 - invokes the .rescan_prepare() and .rescan_done() driver notifiers;
 - disables the feature for the powerpc/pseries.

The feature is disabled by default until the final patch of the series.
It can be overridden per-arch using the pci_can_move_bars=false flag or by
the following command line option:

pci=no_movable_bars

CC: Sam Bobroff 
CC: Rajat Jain 
CC: Lukas Wunner 
CC: Oliver O'Halloran 
CC: David Laight 
Signed-off-by: Sergey Miroshnichenko 
---
 .../admin-guide/kernel-parameters.txt |  1 +
 arch/powerpc/platforms/pseries/setup.c|  2 +
 drivers/pci/pci.c |  4 +
 drivers/pci/pci.h |  2 +
 drivers/pci/probe.c   | 85 ++-
 include/linux/pci.h   |  4 +
 6 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index a84a83f8881e..c6243aaed0c9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3528,6 +3528,7 @@
may put more devices in an IOMMU group.
force_floating  [S390] Force usage of floating interrupts.
nomio   [S390] Do not use MIO instructions.
+   no_movable_bars Don't allow BARs to be moved during hotplug
 
pcie_aspm=  [PCIE] Forcibly enable or disable PCIe Active State 
Power
Management.
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 0a40201f315f..7cd12c5a2deb 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -920,6 +920,8 @@ static void __init pseries_init(void)
 {
  

[PATCH v6 02/30] PCI: Enable bridge's I/O and MEM access for hotplugged devices

2019-10-24 Thread Sergey Miroshnichenko
The PCI_COMMAND_IO and PCI_COMMAND_MEMORY bits of the bridge must be
updated not only when enabling the bridge for the first time, but also if a
hotplugged device requests these types of resources.

Originally these bits were set by the pci_enable_device_flags() only, which
exits early if the bridge is already pci_is_enabled(). So if the bridge was
empty initially (an edge case), then IO/MEM accesses to hotplugged devices fail.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 44d0d12c80cf..e85dc63c73fd 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1650,6 +1650,14 @@ static void pci_enable_bridge(struct pci_dev *dev)
pci_enable_bridge(bridge);
 
if (pci_is_enabled(dev)) {
+   int i, bars = 0;
+
+   for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++) {
+   if (dev->resource[i].flags & (IORESOURCE_MEM | 
IORESOURCE_IO))
+   bars |= (1 << i);
+   }
+   do_pci_enable_device(dev, bars);
+
if (!dev->is_busmaster)
pci_set_master(dev);
mutex_unlock(>enable_mutex);
-- 
2.23.0



[PATCH v6 01/30] PCI: Fix race condition in pci_enable/disable_device()

2019-10-24 Thread Sergey Miroshnichenko
This is yet another approach to fix an old [1-2] concurrency issue, when:
 - two or more devices are being hot-added into a bridge which was
   initially empty;
 - a bridge with two or more devices is being hot-added;
 - during boot, if BIOS/bootloader/firmware doesn't pre-enable bridges.

The problem is that a bridge is reported as enabled before the MEM/IO bits
are actually written to the PCI_COMMAND register, so another driver thread
starts memory requests through the not-yet-enabled bridge:

 CPU0CPU1

 pci_enable_device_mem() pci_enable_device_mem()
   pci_enable_bridge() pci_enable_bridge()
 pci_is_enabled()
   return false;
 atomic_inc_return(enable_cnt)
 Start actual enabling the bridge
 ... pci_is_enabled()
 ...   return true;
 ... Start memory requests <-- FAIL
 ...
 Set the PCI_COMMAND_MEMORY bit <-- Must wait for this

Protect the pci_enable/disable_device() and pci_enable_bridge(), which is
similar to the previous solution from commit 40f11adc7cd9 ("PCI: Avoid race
while enabling upstream bridges"), but adding per-device mutexes and
preventing the dev->enable_cnt from incrementing early.

CC: Srinath Mannam 
CC: Marta Rybczynska 
Signed-off-by: Sergey Miroshnichenko 

[1] 
https://lore.kernel.org/linux-pci/1501858648-8-1-git-send-email-srinath.man...@broadcom.com/T/#u
[RFC PATCH v3] pci: Concurrency issue during pci enable bridge

[2] 
https://lore.kernel.org/linux-pci/744877924.5841545.1521630049567.javamail.zim...@kalray.eu/T/#u
[RFC PATCH] nvme: avoid race-conditions when enabling devices
---
 drivers/pci/pci.c   | 26 ++
 drivers/pci/probe.c |  1 +
 include/linux/pci.h |  1 +
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index a97e2571a527..44d0d12c80cf 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1643,6 +1643,8 @@ static void pci_enable_bridge(struct pci_dev *dev)
struct pci_dev *bridge;
int retval;
 
+   mutex_lock(>enable_mutex);
+
bridge = pci_upstream_bridge(dev);
if (bridge)
pci_enable_bridge(bridge);
@@ -1650,6 +1652,7 @@ static void pci_enable_bridge(struct pci_dev *dev)
if (pci_is_enabled(dev)) {
if (!dev->is_busmaster)
pci_set_master(dev);
+   mutex_unlock(>enable_mutex);
return;
}
 
@@ -1658,11 +1661,14 @@ static void pci_enable_bridge(struct pci_dev *dev)
pci_err(dev, "Error enabling bridge (%d), continuing\n",
retval);
pci_set_master(dev);
+   mutex_unlock(>enable_mutex);
 }
 
 static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 {
struct pci_dev *bridge;
+   /* Enable-locking of bridges is performed within the 
pci_enable_bridge() */
+   bool need_lock = !dev->subordinate;
int err;
int i, bars = 0;
 
@@ -1678,8 +1684,13 @@ static int pci_enable_device_flags(struct pci_dev *dev, 
unsigned long flags)
dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
}
 
-   if (atomic_inc_return(>enable_cnt) > 1)
+   if (need_lock)
+   mutex_lock(>enable_mutex);
+   if (pci_is_enabled(dev)) {
+   if (need_lock)
+   mutex_unlock(>enable_mutex);
return 0;   /* already enabled */
+   }
 
bridge = pci_upstream_bridge(dev);
if (bridge)
@@ -1694,8 +1705,10 @@ static int pci_enable_device_flags(struct pci_dev *dev, 
unsigned long flags)
bars |= (1 << i);
 
err = do_pci_enable_device(dev, bars);
-   if (err < 0)
-   atomic_dec(>enable_cnt);
+   if (err >= 0)
+   atomic_inc(>enable_cnt);
+   if (need_lock)
+   mutex_unlock(>enable_mutex);
return err;
 }
 
@@ -1939,15 +1952,20 @@ void pci_disable_device(struct pci_dev *dev)
if (dr)
dr->enabled = 0;
 
+   mutex_lock(>enable_mutex);
dev_WARN_ONCE(>dev, atomic_read(>enable_cnt) <= 0,
  "disabling already-disabled device");
 
-   if (atomic_dec_return(>enable_cnt) != 0)
+   if (atomic_dec_return(>enable_cnt) != 0) {
+   mutex_unlock(>enable_mutex);
return;
+   }
 
do_pci_disable_device(dev);
 
dev->is_busmaster = 0;
+
+   mutex_unlock(>enable_mutex);
 }
 EXPORT_SYMBOL(pci_disable_device);
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 3d5271a7a849..d4f21e413638 100644
--- a/drivers/pci/probe.c
+++ b/d

Re: [PATCH v5 03/23] PCI: hotplug: Add a flag for the movable BARs feature

2019-10-16 Thread Sergey Miroshnichenko

On 10/16/19 1:14 AM, Bjorn Helgaas wrote:

On Mon, Sep 30, 2019 at 03:59:25PM +0300, Sergey Miroshnichenko wrote:

Hello Bjorn,

On 9/28/19 1:02 AM, Bjorn Helgaas wrote:

On Fri, Aug 16, 2019 at 07:50:41PM +0300, Sergey Miroshnichenko wrote:

When hot-adding a device, the bridge may have windows not big enough (or
fragmented too much) for newly requested BARs to fit in. And expanding
these bridge windows may be impossible because blocked by "neighboring"
BARs and bridge windows.

Still, it may be possible to allocate a memory region for new BARs with the
following procedure:

1) notify all the drivers which support movable BARs to pause and release
 the BARs; the rest of the drivers are guaranteed that their devices will
 not get BARs moved;

2) release all the bridge windows except of root bridges;

3) try to recalculate new bridge windows that will fit all the BAR types:
 - fixed;
 - immovable;
 - movable;
 - newly requested by hot-added devices;

4) if the previous step fails, disable BARs for one of the hot-added
 devices and retry from step 3;

5) notify the drivers, so they remap BARs and resume.


You don't do the actual recalculation in *this* patch, but since you
mention the procedure here, are we confident that we never make things
worse?

It's possible that a hot-add will trigger this attempt to move things
around, and it's possible that we won't find space for the new device
even if we move things around.  But are we certain that every device
that worked *before* the hot-add will still work *afterwards*?

Much of the assignment was probably done by the BIOS using different
algorithms than Linux has, so I think there's some chance that the
BIOS did a better job and if we lose that BIOS assignment, we might
not be able to recreate it.


If a hardware has some special constraints on BAR assignment that the
kernel is not aware of yet, the movable BARs may break things after a
hotplug event. So the feature must be disabled there (manually) until
the kernel gets support for those special needs.


I'm not talking about special constraints on BAR assignment.  (I'm not
sure what those constraints would be -- AFAIK the constraints for a
spec-compliant device are all discoverable via the BAR size and type
(or the Enhanced Allocation capability)).

What I'm concerned about is the case where we boot with a working
assignment, we hot-add a device, we move things around to try to
accommodate the new device, and not only do we fail to find resources
for the new device, we also fail to find a working assignment for the
devices that were present at boot.  We've moved things around from
what BIOS did, and since we use a different algorithm than the BIOS,
there's no guarantee that we'll be able to find the assignment BIOS
did.



If BAR assignment fails with a hot-added device, these patches will
disable BARs for this device and retry, falling back to the situation
where number of BARs and their size are the same as they were before
the hotplug event.

If all the BARs are immovable - they will just remain on their
positions. Nothing to break here I guess.

If almost all the BARs are immovable and there is one movable BAR,
after releasing the bridge windows there will be a free gap - right
where this movable BAR was. These patches are keeping the size of
released BARs, not requesting the size from the devices again - so the
device can't ask for a larger BAR. The space reserving is disabled by
this patchset, so the kernel will request the same size for the bridge
window containing this movable BAR. So there always will be a gap for
this BAR - in the same location it was before.

Based on these considerations I assume that the kernel is always able
to arrange BARs from scratch if a BIOS was able to make it before.

But! There is an implicit speculation that there will be the same
amount of BARs after the fallback (which is equivalent to a PCI rescan
triggered on unchanged topology). And two weeks ago I found that
this is not always true!

I was testing on a "new" x86_64 PC, where BIOS doesn't reserve a space
for SR-IOV BARs (of a network adapter). On the boot, the kernel wasn't
arranging BARs itself - it took values written by the BIOS. And the
bridge window was "jammed" between immovable BARs, so it can't expand.
BARs of this device are also immovable, so the bridge window can't be
moved away. During the PCI rescan, the kernel tried to allocate both
"regular" and SR-IOV BARs - and failed. Even without changes in the
PCI topology.

So in the next version of this series there will be one more patch,
that allows the kernel to ignore BIOS's setting for the "safe" (non-IO
and non-VGA) BARs, so these BARs will be arranged kernel-way - and
also those forgotten by the BIOS.


I'm not sure why the PCI_CLASS_DISPLAY_VGA special case is there; can
you add a comment about why that's needed?  Obviously we can't move
the 0xa legacy frame b

Re: [PATCH v5 03/23] PCI: hotplug: Add a flag for the movable BARs feature

2019-09-30 Thread Sergey Miroshnichenko

Hello David,

On 9/30/19 11:44 AM, David Laight wrote:

From: Bjorn Helgaas

Sent: 27 September 2019 23:02
On Fri, Aug 16, 2019 at 07:50:41PM +0300, Sergey Miroshnichenko wrote:

When hot-adding a device, the bridge may have windows not big enough (or
fragmented too much) for newly requested BARs to fit in. And expanding
these bridge windows may be impossible because blocked by "neighboring"
BARs and bridge windows.

Still, it may be possible to allocate a memory region for new BARs with the
following procedure:

1) notify all the drivers which support movable BARs to pause and release
the BARs; the rest of the drivers are guaranteed that their devices will
not get BARs moved;

2) release all the bridge windows except of root bridges;

3) try to recalculate new bridge windows that will fit all the BAR types:
- fixed;
- immovable;
- movable;
- newly requested by hot-added devices;

4) if the previous step fails, disable BARs for one of the hot-added
devices and retry from step 3;

5) notify the drivers, so they remap BARs and resume.


You don't do the actual recalculation in *this* patch, but since you
mention the procedure here, are we confident that we never make things
worse?

It's possible that a hot-add will trigger this attempt to move things
around, and it's possible that we won't find space for the new device
even if we move things around.  But are we certain that every device
that worked *before* the hot-add will still work *afterwards*?

Much of the assignment was probably done by the BIOS using different
algorithms than Linux has, so I think there's some chance that the
BIOS did a better job and if we lose that BIOS assignment, we might
not be able to recreate it.


Yep, removing everything and starting again is probably OTT and most of the 
churn won't help.

I think you need to work out what can be moved in order to make the required 
resources available
to each bus and then make the required changes.

In the simplest case you are trying to add resource below a bridge so need to 
'shuffle'
everything allocated after that bridge to later addresses (etc).



Thank you for the review and suggestions!

But a bridge window may be fragmented: its total free space is enough
to fit everything, but no sufficient gaps for the new BARs. And this
bridge window may be jammed between two immovable/fixed BARs.

Or there may be lots of empty spaces in lower addresses after un-plugs,
but everything is fixed/immovable on higher addresses.

I've spent some time thinking on an optimization technique which can
be efficient enough (touch as few BARs as possible) with as high
success rate as calculating from scratch - and concluded that it is
not worth it: if we only release the "obstructing" BARs and bridge
windows, a hotplug event will affect half of (n+m) on average, which
is still O(n+m), where n is the number of endpoints and m is the
number of bridges. And the windows of the root and other common bridges
would still need to be resized.

Calculating bridge windows from scratch is relatively straightforward
and fast, so I have just added support for fixed/immovable BARs there
and reused.


Many devices that support address reassignment might not need to be moved - so 
there is
no point remapping them.



And it's the same algorithm that allocated BARs in first place, so it
will reassign the same BARs for the non-affected part of the topology.


There is also the case when a device that is present but not currently in use 
could be taken
through a remove+insert sequence in order to change its resources.
Much easier to implement than 'remap while active'.
This would require a call into the driver (that can sleep) to request whether 
it is idle.
(and probably one at the end if the remove wasn't done).



Unbind+rebind the "immovable" drivers of non-opened devices may
increase the probability of successful BAR allocation, but I'm afraid
this will produce some amount of false hotplug-like events in the logs.
Probably also some undesired effects like spikes in power consumption
because of driver initialization.

Best regards,
Serge


David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)



Re: [PATCH v5 03/23] PCI: hotplug: Add a flag for the movable BARs feature

2019-09-30 Thread Sergey Miroshnichenko

Hello Bjorn,

On 9/28/19 1:02 AM, Bjorn Helgaas wrote:

On Fri, Aug 16, 2019 at 07:50:41PM +0300, Sergey Miroshnichenko wrote:

When hot-adding a device, the bridge may have windows not big enough (or
fragmented too much) for newly requested BARs to fit in. And expanding
these bridge windows may be impossible because blocked by "neighboring"
BARs and bridge windows.

Still, it may be possible to allocate a memory region for new BARs with the
following procedure:

1) notify all the drivers which support movable BARs to pause and release
the BARs; the rest of the drivers are guaranteed that their devices will
not get BARs moved;

2) release all the bridge windows except of root bridges;

3) try to recalculate new bridge windows that will fit all the BAR types:
- fixed;
- immovable;
- movable;
- newly requested by hot-added devices;

4) if the previous step fails, disable BARs for one of the hot-added
devices and retry from step 3;

5) notify the drivers, so they remap BARs and resume.


You don't do the actual recalculation in *this* patch, but since you
mention the procedure here, are we confident that we never make things
worse?

It's possible that a hot-add will trigger this attempt to move things
around, and it's possible that we won't find space for the new device
even if we move things around.  But are we certain that every device
that worked *before* the hot-add will still work *afterwards*?

Much of the assignment was probably done by the BIOS using different
algorithms than Linux has, so I think there's some chance that the
BIOS did a better job and if we lose that BIOS assignment, we might
not be able to recreate it.



If a hardware has some special constraints on BAR assignment that the
kernel is not aware of yet, the movable BARs may break things after a
hotplug event. So the feature must be disabled there (manually) until
the kernel get support for that special needs.

On x86 we had no choice - most of the machines we used just can't boot
with even an "empty" 16-port switch connected. So we hot-add it after
the boot, then trigger a rescan via 'echo 1 > /sys/bus/pci/rescan'.
And the reserved bridge windows weren't enough, and they can't expand
because they are blocked by the next device.


This makes the prior reservation of memory by BIOS/bootloader/firmware not
required anymore for the PCI hotplug.

Drivers indicate their support of movable BARs by implementing the new
.rescan_prepare() and .rescan_done() hooks in the struct pci_driver. All
device's activity must be paused during a rescan, and iounmap()+ioremap()
must be applied to every used BAR.

The platform also may need to prepare to BAR movement, so new hooks added:
pcibios_rescan_prepare(pci_dev) and pcibios_rescan_done(pci_dev).

This patch is a preparation for future patches with actual implementation,
and for now it just does the following:
  - declares the feature;
  - defines pci_movable_bars_enabled(), pci_dev_movable_bars_supported(dev);
  - invokes the .rescan_prepare() and .rescan_done() driver notifiers;
  - declares and invokes the pcibios_rescan_prepare()/_done() hooks;
  - adds the PCI_IMMOVABLE_BARS flag.

The feature is disabled by default (via PCI_IMMOVABLE_BARS) until the final
patch of the series. It can be overridden per-arch using this flag or by
the following command line option:

 pcie_movable_bars={ off | force }

CC: Sam Bobroff 
CC: Rajat Jain 
CC: Lukas Wunner 
CC: Oliver O'Halloran 
CC: David Laight 
Signed-off-by: Sergey Miroshnichenko 
---
  .../admin-guide/kernel-parameters.txt |  7 ++
  drivers/pci/pci-driver.c  |  2 +
  drivers/pci/pci.c | 24 ++
  drivers/pci/pci.h |  2 +
  drivers/pci/probe.c   | 86 ++-
  include/linux/pci.h   |  7 ++
  6 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 47d981a86e2f..e2274ee87a35 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3526,6 +3526,13 @@
nomsi   Do not use MSI for native PCIe PME signaling (this makes
all PCIe root ports use INTx for all services).
  
+	pcie_movable_bars=[PCIE]


This isn't a PCIe-specific feature, it's just a function of whether
drivers are smart enough, so we shouldn't tie it specifically to PCIe.
We could eventually do this for conventional PCI as well.


+   Override the movable BARs support detection:
+   off
+   Disable even if supported by the platform
+   force
+   Enable even if not explicitly declared as supported


What's the need for "force"?  If it's possible, I think we should
enable this functionality all the ti

Re: [PATCH v5 01/23] PCI: Fix race condition in pci_enable/disable_device()

2019-09-30 Thread Sergey Miroshnichenko

Hello Bjorn,

On 9/28/19 12:59 AM, Bjorn Helgaas wrote:

On Fri, Aug 16, 2019 at 07:50:39PM +0300, Sergey Miroshnichenko wrote:

This is yet another approach to fix an old [1-2] concurrency issue, when:
  - two or more devices are being hot-added into a bridge which was
initially empty;
  - a bridge with two or more devices is being hot-added;
  - during boot, if BIOS/bootloader/firmware doesn't pre-enable bridges.

The problem is that a bridge is reported as enabled before the MEM/IO bits
are actually written to the PCI_COMMAND register, so another driver thread
starts memory requests through the not-yet-enabled bridge:

  CPU0CPU1

  pci_enable_device_mem() pci_enable_device_mem()
pci_enable_bridge() pci_enable_bridge()
  pci_is_enabled()
return false;
  atomic_inc_return(enable_cnt)
  Start actual enabling the bridge
  ... pci_is_enabled()
  ...   return true;
  ... Start memory requests <-- FAIL
  ...
  Set the PCI_COMMAND_MEMORY bit <-- Must wait for this

Protect the pci_enable/disable_device() and pci_enable_bridge(), which is
similar to the previous solution from commit 40f11adc7cd9 ("PCI: Avoid race
while enabling upstream bridges"), but adding per-device mutexes and
preventing the dev->enable_cnt from incrementing early.


This isn't directly related to the movable BARs functionality; is it
here because you see the problem more frequently when moving BARs?



First two patches of this series (including this one) are fixes for
the boot and for the hotplug, not related to movable BARs.

Before these fixes, we were suffering from this issue on PowerNV until
commit db2173198b9513f7add8009f225afa1f1c79bcc6 "powerpc/powernv/pci:
Work around races in PCI bridge enabling" was backported to distros:
NVMEs randomly failed to start during system boot. So we've tested the
fixes with that commit reverted.

On x86 the BIOS does pre-enable the bridges, but they were still prone
to races when hot-added or initially "empty".

Serge


Re: [PATCH v5 18/23] powerpc/pci: Handle BAR movement

2019-09-06 Thread Sergey Miroshnichenko
Hi Oliver,

On 9/4/19 8:37 AM, Oliver O'Halloran wrote:
> On Fri, 2019-08-16 at 19:50 +0300, Sergey Miroshnichenko wrote:
>> Add pcibios_rescan_prepare()/_done() hooks for the powerpc platform. Now if
>> the device's driver supports movable BARs, pcibios_rescan_prepare() will be
>> called after the device is stopped, and pcibios_rescan_done() - before it
>> resumes. There are no memory requests to this device between the hooks, so
>> it is safe to rebuild the EEH address cache during that.
>>
>> CC: Oliver O'Halloran 
>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  arch/powerpc/kernel/pci-hotplug.c | 10 ++
>>  1 file changed, 10 insertions(+)
>>
>> diff --git a/arch/powerpc/kernel/pci-hotplug.c 
>> b/arch/powerpc/kernel/pci-hotplug.c
>> index 0b0cf8168b47..18cf13bba228 100644
>> --- a/arch/powerpc/kernel/pci-hotplug.c
>> +++ b/arch/powerpc/kernel/pci-hotplug.c
>> @@ -144,3 +144,13 @@ void pci_hp_add_devices(struct pci_bus *bus)
>>  pcibios_finish_adding_to_bus(bus);
>>  }
>>  EXPORT_SYMBOL_GPL(pci_hp_add_devices);
>> +
>> +void pcibios_rescan_prepare(struct pci_dev *pdev)
>> +{
>> +eeh_addr_cache_rmv_dev(pdev);
>> +}
>> +
>> +void pcibios_rescan_done(struct pci_dev *pdev)
>> +{
>> +eeh_addr_cache_insert_dev(pdev);
>> +}
> 
> Is this actually sufficient? The PE number for a device is largely
> determined by the location of the MMIO BARs. If you move a BAR far
> enough the PE number stored in the eeh_pe would need to be updated as
> well.
> 

Thanks for the hint! I've checked on our PowerNV: for bridges with MEM
only it allocates PE numbers starting from 0xff down, and when there
are MEM64 - starting from 0 up, one PE number per 4GiB.

PEs are allocated during the call to pnv_pci_setup_bridge(), and I've
added invocation of pci_setup_bridge() after a hotplug event in the
"Recalculate all bridge windows during rescan" patch of this series.

Currently, if a bus already has a PE, pnv_ioda_setup_bus_PE() takes it
and returns. I can see two ways to change it, both are not difficult to
implement:

 a.1) check if MEM64 BARs appeared below the bus - allocate and assign
  a new master PE with required number of slave PEs;

 a.2) if the bus now has more MEM64 than before - check if more slave
  PEs must be reserved;

 b) release all the PEs before a PCI rescan and allocate+assign them
again after - with this approach the "Hook up the writes to
PCI_SECONDARY_BUS register" patch may be eliminated.

Do you find any of these suitable?

Serge



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v5 16/23] PCI: hotplug: movable BARs: Don't reserve IO/mem bus space

2019-09-04 Thread Sergey Miroshnichenko
On 9/4/19 8:42 AM, Oliver O'Halloran wrote:
> On Fri, 2019-08-16 at 19:50 +0300, Sergey Miroshnichenko wrote:
>> A hotplugged bridge with many hotplug-capable ports may request
>> reserving more IO space than the machine has. This could be overridden
>> with the "hpiosize=" kernel argument though.
>>
>> But when BARs are movable, there is no need to reserve space anymore:
>> new BARs are allocated not from reserved gaps, but via rearranging the
>> existing BARs. Requesting a precise amount of space for bridge windows
>> increases the chances of adding the new bridge successfully.
> 
> It wouldn't hurt to reserve some memory space to prevent unnecessary
> BAR shuffling at runtime. If it turns out that we need more space then
> we can always fall back to re-assigning the whole tree.
> 

Hi Oliver,

Thank you for your comments!

We had an issue on a x86_64 PC with a small amount of IO space: after
hotplugging an empty bridge of 32 ports even a DEFAULT_HOTPLUG_IO_SIZE
(which is 256) was enough to exhaust the space. So another patch of
this series ("Don't allow added devices to steal resources") had
disabled the BAR allocating for this bridge. It took some time for me
to guess that "hpiosize=0" can solve that.

For MEM and MEM64 spaces it will be harder to reproduce the same, but
there can be a similar problem when fitting between two immovable BARs.

To implement a fallback it would need to add some flag indicating that
allocating this bridge with reserved spaces has failed, so its windows
should be recalculated without reserved spaces - and try again. Maybe
even two types of retrials: with and without the full re-assignment.
We've tried to avoid adding execution paths and code complexity.

Serge

>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  drivers/pci/setup-bus.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
>> index c7b7e30c6284..7d64ec8e7088 100644
>> --- a/drivers/pci/setup-bus.c
>> +++ b/drivers/pci/setup-bus.c
>> @@ -1287,7 +1287,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, 
>> struct list_head *realloc_head)
>>  
>>  case PCI_HEADER_TYPE_BRIDGE:
>>  pci_bridge_check_ranges(bus);
>> -if (bus->self->is_hotplug_bridge) {
>> +if (bus->self->is_hotplug_bridge && 
>> !pci_movable_bars_enabled()) {
>>  additional_io_size  = pci_hotplug_io_size;
>>  additional_mem_size = pci_hotplug_mem_size;
>>  }
> 


[PATCH v5 23/23] PCI: pciehp: movable BARs: Trigger a domain rescan on hp events

2019-08-16 Thread Sergey Miroshnichenko
With movable BARs, adding a hotplugged device is not local to its bridge
anymore, but it affects the whole domain: BARs, bridge windows and bus
numbers can be substantially rearranged. So instead of trying to fit the
new devices into preallocated reserved gaps, initiate a full domain rescan.

The pci_rescan_bus() covers all the operations of the replaced functions:
 - assigning new bus numbers, as the pci_hp_add_bridge() does it;
 - allocating BARs (pci_assign_unassigned_bridge_resources());
 - configuring MPS settings (pcie_bus_configure_settings());
 - binding devices to their drivers (pci_bus_add_devices()).

CC: Lukas Wunner 
Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/hotplug/pciehp_pci.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c
index d17f3bf36f70..66c4e6d88fe3 100644
--- a/drivers/pci/hotplug/pciehp_pci.c
+++ b/drivers/pci/hotplug/pciehp_pci.c
@@ -58,6 +58,11 @@ int pciehp_configure_device(struct controller *ctrl)
goto out;
}
 
+   if (pci_movable_bars_enabled()) {
+   pci_rescan_bus(parent);
+   goto out;
+   }
+
for_each_pci_bridge(dev, parent)
pci_hp_add_bridge(dev);
 
-- 
2.21.0



[PATCH v5 22/23] PCI/portdrv: Declare support of movable BARs

2019-08-16 Thread Sergey Miroshnichenko
Switch's BARs are not used by the portdrv driver, but they are still
considered as immovable until the .rescan_prepare() and .rescan_done()
hooks are added. Add these hooks to increase chances to allocate new BARs.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pcie/portdrv_pci.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 0a87091a0800..9dbddc7faaa7 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -197,6 +197,14 @@ static const struct pci_error_handlers 
pcie_portdrv_err_handler = {
.resume = pcie_portdrv_err_resume,
 };
 
+static void pcie_portdrv_rescan_prepare(struct pci_dev *pdev)
+{
+}
+
+static void pcie_portdrv_rescan_done(struct pci_dev *pdev)
+{
+}
+
 static struct pci_driver pcie_portdriver = {
.name   = "pcieport",
.id_table   = _pci_ids[0],
@@ -207,6 +215,9 @@ static struct pci_driver pcie_portdriver = {
 
.err_handler= _portdrv_err_handler,
 
+   .rescan_prepare = pcie_portdrv_rescan_prepare,
+   .rescan_done= pcie_portdrv_rescan_done,
+
.driver.pm  = PCIE_PORTDRV_PM_OPS,
 };
 
-- 
2.21.0



[PATCH v5 21/23] nvme-pci: Handle movable BARs

2019-08-16 Thread Sergey Miroshnichenko
Hotplugged devices can affect the existing ones by moving their BARs. The
PCI subsystem will inform the NVME driver about this by invoking the
.rescan_prepare() and .rescan_done() hooks, so the BARs can be re-mapped.

Tested under the "randrw" mode of the fio tool. Before the hotplugging:

  % sudo cat /proc/iomem
  ...
3fe8-3fe8007f : PCI Bus 0020:0b
  3fe8-3fe8007f : PCI Bus 0020:18
3fe8-3fe8000f : 0020:18:00.0
  3fe8-3fe8000f : nvme
3fe80010-3fe80017 : 0020:18:00.0
  ...

, then another NVME drive was hot-added, so BARs of the 0020:18:00.0 are
moved:

  % sudo cat /proc/iomem
...
3fe8-3fe800ff : PCI Bus 0020:0b
  3fe8-3fe8007f : PCI Bus 0020:10
3fe8-3fe83fff : 0020:10:00.0
  3fe8-3fe83fff : nvme
3fe80001-3fe80001 : 0020:10:00.0
  3fe80080-3fe800ff : PCI Bus 0020:18
3fe80080-3fe8008f : 0020:18:00.0
  3fe80080-3fe8008f : nvme
3fe80090-3fe80097 : 0020:18:00.0
...

During the rescanning, both READ and WRITE speeds drop to zero for a while
due to driver's pause, then restore.

Cc: linux-n...@lists.infradead.org
Cc: Christoph Hellwig 
Signed-off-by: Sergey Miroshnichenko 
---
 drivers/nvme/host/pci.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index db160cee42ad..a805d80082ca 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1645,7 +1645,7 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned 
long size)
 {
struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-   if (size <= dev->bar_mapped_size)
+   if (dev->bar && size <= dev->bar_mapped_size)
return 0;
if (size > pci_resource_len(pdev, 0))
return -ENOMEM;
@@ -2980,6 +2980,23 @@ static void nvme_error_resume(struct pci_dev *pdev)
flush_work(>ctrl.reset_work);
 }
 
+static void nvme_rescan_prepare(struct pci_dev *pdev)
+{
+   struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+   nvme_dev_disable(dev, false);
+   nvme_dev_unmap(dev);
+   dev->bar = NULL;
+}
+
+static void nvme_rescan_done(struct pci_dev *pdev)
+{
+   struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+   nvme_dev_map(dev);
+   nvme_reset_ctrl_sync(>ctrl);
+}
+
 static const struct pci_error_handlers nvme_err_handler = {
.error_detected = nvme_error_detected,
.slot_reset = nvme_slot_reset,
@@ -3049,6 +3066,8 @@ static struct pci_driver nvme_driver = {
 #endif
.sriov_configure = pci_sriov_configure_simple,
.err_handler= _err_handler,
+   .rescan_prepare = nvme_rescan_prepare,
+   .rescan_done= nvme_rescan_done,
 };
 
 static int __init nvme_init(void)
-- 
2.21.0



[PATCH v5 20/23] PCI: hotplug: movable BARs: Enable the feature by default

2019-08-16 Thread Sergey Miroshnichenko
This is the last patch in the series which implements the essentials of the
Movable BARs feature, so it is turned on by default now. Tested on:

 - x86_64 with "pci=realloc,assign-busses,use_crs,pcie_bus_peer2peer"
   command line argument;
 - POWER8 PowerNV+PHB3 ppc64le with "pci=realloc,pcie_bus_peer2peer".

In case of problems it can still be overridden by the following command
line option:

pcie_movable_bars=off

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci-driver.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index d11909e79263..a8124e47bf6e 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -1688,8 +1688,6 @@ static int __init pci_driver_init(void)
 {
int ret;
 
-   pci_add_flags(PCI_IMMOVABLE_BARS);
-
ret = bus_register(_bus_type);
if (ret)
return ret;
-- 
2.21.0



[PATCH v5 19/23] PCI: hotplug: Configure MPS for hot-added bridges during bus rescan

2019-08-16 Thread Sergey Miroshnichenko
Ensure that MPS settings are set up for bridges which are discovered
during a manually triggered rescan via sysfs. This sequence of bridge
init (using pci_rescan_bus()) will be used for pciehp hot-add events
when BARs are movable.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/probe.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 5f52a19738aa..4bb10d27cb3a 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3688,7 +3688,7 @@ static void pci_reassign_root_bus_resources(struct 
pci_bus *root)
 unsigned int pci_rescan_bus(struct pci_bus *bus)
 {
unsigned int max;
-   struct pci_bus *root = bus;
+   struct pci_bus *root = bus, *child;
 
while (!pci_is_root_bus(root))
root = root->parent;
@@ -3708,6 +3708,9 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
pci_assign_unassigned_bus_resources(bus);
}
 
+   list_for_each_entry(child, >children, node)
+   pcie_bus_configure_settings(child);
+
pci_bus_add_devices(bus);
 
return max;
-- 
2.21.0



[PATCH v5 18/23] powerpc/pci: Handle BAR movement

2019-08-16 Thread Sergey Miroshnichenko
Add pcibios_rescan_prepare()/_done() hooks for the powerpc platform. Now if
the device's driver supports movable BARs, pcibios_rescan_prepare() will be
called after the device is stopped, and pcibios_rescan_done() - before it
resumes. There are no memory requests to this device between the hooks, so
it is safe to rebuild the EEH address cache during that time.

CC: Oliver O'Halloran 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/pci-hotplug.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/kernel/pci-hotplug.c 
b/arch/powerpc/kernel/pci-hotplug.c
index 0b0cf8168b47..18cf13bba228 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -144,3 +144,13 @@ void pci_hp_add_devices(struct pci_bus *bus)
pcibios_finish_adding_to_bus(bus);
 }
 EXPORT_SYMBOL_GPL(pci_hp_add_devices);
+
+void pcibios_rescan_prepare(struct pci_dev *pdev)
+{
+   eeh_addr_cache_rmv_dev(pdev);
+}
+
+void pcibios_rescan_done(struct pci_dev *pdev)
+{
+   eeh_addr_cache_insert_dev(pdev);
+}
-- 
2.21.0



[PATCH v5 15/23] PCI: hotplug: movable BARs: Assign fixed and immovable BARs before others

2019-08-16 Thread Sergey Miroshnichenko
Reassign resources during rescan in two steps: first the fixed/immovable
BARs and bridge windows that have fixed areas, so the movable ones will not
steal these reserved areas; then the rest - so the movable BARs will divide
the rest of the space.

With this change, pci_assign_resource() is now able to assign all types of
BARs, so pdev_assign_fixed_resources() became unused and is thus removed.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   |  2 ++
 drivers/pci/setup-bus.c | 79 -
 drivers/pci/setup-res.c |  8 +++--
 3 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 12add575faf1..e1fcc46f9c40 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -260,6 +260,8 @@ void pci_disable_bridge_window(struct pci_dev *dev);
 
 bool pci_dev_movable_bars_supported(struct pci_dev *dev);
 
+int assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r);
+
 /* PCIe link information */
 #define PCIE_SPEED2STR(speed) \
((speed) == PCIE_SPEED_16_0GT ? "16 GT/s" : \
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 6f12411357f3..c7b7e30c6284 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -38,6 +38,15 @@ struct pci_dev_resource {
unsigned long flags;
 };
 
+enum assign_step {
+   assign_fixed_resources,
+   assign_float_resources,
+};
+
+static void _assign_requested_resources_sorted(struct list_head *head,
+  struct list_head *fail_head,
+  enum assign_step step);
+
 static void free_list(struct list_head *head)
 {
struct pci_dev_resource *dev_res, *tmp;
@@ -278,19 +287,48 @@ static void reassign_resources_sorted(struct list_head 
*realloc_head,
  */
 static void assign_requested_resources_sorted(struct list_head *head,
 struct list_head *fail_head)
+{
+   _assign_requested_resources_sorted(head, fail_head, 
assign_fixed_resources);
+   _assign_requested_resources_sorted(head, fail_head, 
assign_float_resources);
+}
+
+static void _assign_requested_resources_sorted(struct list_head *head,
+  struct list_head *fail_head,
+  enum assign_step step)
 {
struct resource *res;
struct pci_dev_resource *dev_res;
int idx;
 
list_for_each_entry(dev_res, head, list) {
+   bool is_fixed = false;
+
if (!pci_dev_bars_enabled(dev_res->dev))
continue;
 
res = dev_res->res;
+   if (!resource_size(res))
+   continue;
+
idx = res - _res->dev->resource[0];
-   if (resource_size(res) &&
-   pci_assign_resource(dev_res->dev, idx)) {
+
+   if (idx < PCI_BRIDGE_RESOURCES) {
+   is_fixed = (res->flags & IORESOURCE_PCI_FIXED) ||
+   !pci_dev_movable_bars_supported(dev_res->dev);
+   } else {
+   int b_res_idx = pci_get_bridge_resource_idx(res);
+   struct resource *fixed_res =
+   
_res->dev->subordinate->immovable_range[b_res_idx];
+
+   is_fixed = (fixed_res->start < fixed_res->end);
+   }
+
+   if (assign_fixed_resources == step && !is_fixed)
+   continue;
+   else if (assign_float_resources == step && is_fixed)
+   continue;
+
+   if (pci_assign_resource(dev_res->dev, idx)) {
if (fail_head) {
/*
 * If the failed resource is a ROM BAR and
@@ -1336,7 +1374,7 @@ void pci_bus_size_bridges(struct pci_bus *bus)
 }
 EXPORT_SYMBOL(pci_bus_size_bridges);
 
-static void assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r)
+int assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r)
 {
int i;
struct resource *parent_r;
@@ -1353,35 +1391,14 @@ static void assign_fixed_resource_on_bus(struct pci_bus 
*b, struct resource *r)
!(r->flags & IORESOURCE_PREFETCH))
continue;
 
-   if (resource_contains(parent_r, r))
-   request_resource(parent_r, r);
-   }
-}
-
-/*
- * Try to assign any resources marked as IORESOURCE_PCI_FIXED, as they are
- * skipped by pbus_assign_resources_sorted().
- */
-static void pdev_assign_fixed_resources(struct pci_dev *dev)
-{
-   int i;
-
-   for (i = 0; i <  PCI_NUM_RESOURCES; i++) {
-   struct pci_bus *b;
-   struct resource *r = >resource[i];
-
-   if (r-

[PATCH v5 17/23] powerpc/pci: Fix crash with enabled movable BARs

2019-08-16 Thread Sergey Miroshnichenko
Add a check for the UNSET resource flag to skip the released BARs

CC: Alexey Kardashevskiy 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index d8080558d020..362eac42f463 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2986,7 +2986,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
int index;
int64_t rc;
 
-   if (!res || !res->flags || res->start > res->end)
+   if (!res || !res->flags || res->start > res->end ||
+   (res->flags & IORESOURCE_UNSET))
return;
 
if (res->flags & IORESOURCE_IO) {
-- 
2.21.0



[PATCH v5 16/23] PCI: hotplug: movable BARs: Don't reserve IO/mem bus space

2019-08-16 Thread Sergey Miroshnichenko
A hotplugged bridge with many hotplug-capable ports may request
reserving more IO space than the machine has. This could be overridden
with the "hpiosize=" kernel argument though.

But when BARs are movable, there is no need to reserve space anymore:
new BARs are allocated not from reserved gaps, but via rearranging the
existing BARs. Requesting a precise amount of space for bridge windows
increases the chances of adding the new bridge successfully.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index c7b7e30c6284..7d64ec8e7088 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1287,7 +1287,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
 
case PCI_HEADER_TYPE_BRIDGE:
pci_bridge_check_ranges(bus);
-   if (bus->self->is_hotplug_bridge) {
+   if (bus->self->is_hotplug_bridge && 
!pci_movable_bars_enabled()) {
additional_io_size  = pci_hotplug_io_size;
additional_mem_size = pci_hotplug_mem_size;
}
-- 
2.21.0



[PATCH v5 13/23] PCI: Make sure bridge windows include their fixed BARs

2019-08-16 Thread Sergey Miroshnichenko
When the time comes to select a start address for the bridge window during
the root bus rescan, it should not be just the lowest possible address: this
window must cover all the underlying fixed and immovable BARs. The lowest
address that satisfies this requirement is the start of the .realloc_range
field of struct pci_bus, which is calculated during the preparation for the
rescan.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/bus.c   |  2 +-
 drivers/pci/setup-res.c | 28 ++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 495059d923f7..7aae830751e9 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -192,7 +192,7 @@ static int pci_bus_alloc_from_region(struct pci_bus *bus, 
struct resource *res,
 * this is an already-configured bridge window, its start
 * overrides "min".
 */
-   if (avail.start)
+   if (min_used < avail.start)
min_used = avail.start;
 
max = avail.end;
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 732d18f60f1b..7357bcc12a53 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -248,9 +248,20 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
struct resource *res = dev->resource + resno;
resource_size_t min;
int ret;
+   resource_size_t start = (resource_size_t)-1;
+   resource_size_t end = 0;
 
min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM;
 
+   if (dev->subordinate && resno >= PCI_BRIDGE_RESOURCES) {
+   struct pci_bus *child_bus = dev->subordinate;
+   int b_resno = resno - PCI_BRIDGE_RESOURCES;
+   struct resource *immovable_range = 
_bus->immovable_range[b_resno];
+
+   if (immovable_range->start < immovable_range->end)
+   min = child_bus->realloc_range[b_resno].start;
+   }
+
/*
 * First, try exact prefetching match.  Even if a 64-bit
 * prefetchable bridge window is below 4GB, we can't put a 32-bit
@@ -262,7 +273,7 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
 IORESOURCE_PREFETCH | IORESOURCE_MEM_64,
 pcibios_align_resource, dev);
if (ret == 0)
-   return 0;
+   goto check_fixed;
 
/*
 * If the prefetchable window is only 32 bits wide, we can put
@@ -274,7 +285,7 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
 IORESOURCE_PREFETCH,
 pcibios_align_resource, dev);
if (ret == 0)
-   return 0;
+   goto check_fixed;
}
 
/*
@@ -287,6 +298,19 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
ret = pci_bus_alloc_resource(bus, res, size, align, min, 0,
 pcibios_align_resource, dev);
 
+check_fixed:
+   if (ret == 0 && start < end) {
+   if (res->start > start || res->end < end) {
+   dev_err(>dev, "fixed area 0x%llx-0x%llx for %s 
doesn't fit in the allocated %pR (0x%llx-0x%llx)",
+   (unsigned long long)start, (unsigned long 
long)end,
+   dev_name(>dev),
+   res, (unsigned long long)res->start,
+   (unsigned long long)res->end);
+   release_resource(res);
+   return -1;
+   }
+   }
+
return ret;
 }
 
-- 
2.21.0



[PATCH v5 14/23] PCI: Fix assigning the fixed prefetchable resources

2019-08-16 Thread Sergey Miroshnichenko
Allow matching IORESOURCE_PCI_FIXED prefetchable BARs to non-prefetchable
windows, so they follow the same rules as immovable BARs.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 586aaa9578b2..6f12411357f3 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1340,15 +1340,20 @@ static void assign_fixed_resource_on_bus(struct pci_bus 
*b, struct resource *r)
 {
int i;
struct resource *parent_r;
-   unsigned long mask = IORESOURCE_IO | IORESOURCE_MEM |
-IORESOURCE_PREFETCH;
+   unsigned long mask = IORESOURCE_TYPE_BITS;
 
pci_bus_for_each_resource(b, parent_r, i) {
if (!parent_r)
continue;
 
-   if ((r->flags & mask) == (parent_r->flags & mask) &&
-   resource_contains(parent_r, r))
+   if ((r->flags & mask) != (parent_r->flags & mask))
+   continue;
+
+   if (parent_r->flags & IORESOURCE_PREFETCH &&
+   !(r->flags & IORESOURCE_PREFETCH))
+   continue;
+
+   if (resource_contains(parent_r, r))
request_resource(parent_r, r);
}
 }
-- 
2.21.0



[PATCH v5 12/23] PCI: hotplug: movable BARs: Compute limits for relocated bridge windows

2019-08-16 Thread Sergey Miroshnichenko
With enabled movable BARs, bridge windows are recalculated during each pci
rescan. Some of the BARs below the bridge may be fixed/immovable: these
areas are represented by the .immovable_range field in struct pci_bus.

If a bridge window size is equal to the size of its immovable range, it can
only be assigned to the start of this range. But if a bridge window size is
larger, and this difference in size is denoted as "delta", the window can
start anywhere from (immovable_range.start - delta) to
(immovable_range.start), and it can end anywhere from (immovable_range.end)
to (immovable_range.end + delta). This range (the new .realloc_range field in
struct pci_bus) must then be compared with the immovable ranges of
neighbouring bridges to guarantee no intersections.

This patch only calculates valid ranges for reallocated bridges during pci
rescan, and the next one will make use of these values during allocation.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 67 +
 include/linux/pci.h |  6 
 2 files changed, 73 insertions(+)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 420510a1a257..586aaa9578b2 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1819,6 +1819,72 @@ static enum enable_type pci_realloc_detect(struct 
pci_bus *bus,
 }
 #endif
 
+/*
+ * Calculate the address margins where the bridge windows may be allocated to 
fit all
+ * the fixed and immovable BARs beneath.
+ */
+static void pci_bus_update_realloc_range(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   struct pci_bus *parent = bus->parent;
+   int idx;
+
+   list_for_each_entry(dev, >devices, bus_list)
+   if (dev->subordinate)
+   pci_bus_update_realloc_range(dev->subordinate);
+
+   if (!parent || !bus->self)
+   return;
+
+   for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) {
+   struct resource *immovable_range = >immovable_range[idx];
+   resource_size_t window_size = resource_size(bus->resource[idx]);
+   resource_size_t realloc_start, realloc_end;
+
+   bus->realloc_range[idx].start = 0;
+   bus->realloc_range[idx].end = 0;
+
+   /* Check if there any immovable BARs under the bridge */
+   if (immovable_range->start >= immovable_range->end)
+   continue;
+
+   /* The lowest possible address where the bridge window can 
start */
+   realloc_start = immovable_range->end - window_size + 1;
+   /* The highest possible address where the bridge window can end 
*/
+   realloc_end = immovable_range->start + window_size - 1;
+
+   if (realloc_start > immovable_range->start)
+   realloc_start = immovable_range->start;
+
+   if (realloc_end < immovable_range->end)
+   realloc_end = immovable_range->end;
+
+   /*
+* Check that realloc range doesn't intersect with hard fixed 
ranges
+* of neighboring bridges
+*/
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *neighbor = dev->subordinate;
+   struct resource *n_imm_range;
+
+   if (!neighbor || neighbor == bus)
+   continue;
+
+   n_imm_range = >immovable_range[idx];
+
+   if (n_imm_range->start >= n_imm_range->end)
+   continue;
+
+   if (n_imm_range->end < immovable_range->start &&
+   n_imm_range->end > realloc_start)
+   realloc_start = n_imm_range->end;
+   }
+
+   bus->realloc_range[idx].start = realloc_start;
+   bus->realloc_range[idx].end = realloc_end;
+   }
+}
+
 /*
  * First try will not touch PCI bridge res.
  * Second and later try will clear small leaf bridge res.
@@ -1838,6 +1904,7 @@ void pci_assign_unassigned_root_bus_resources(struct 
pci_bus *bus)
 
if (pci_movable_bars_enabled()) {
__pci_bus_size_bridges(bus, NULL);
+   pci_bus_update_realloc_range(bus);
__pci_bus_assign_resources(bus, NULL, NULL);
 
goto dump;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index efafbf816fe6..bf6638cf2525 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -587,6 +587,12 @@ struct pci_bus {
 */
struct resource immovable_range[PCI_BRIDGE_RESOURCE_NUM];
 
+   /*
+* Acceptable address range, where the bridge window may reside, 
considering its
+* size, so it will cover all the fixed and immovable BARs below.
+*/
+   struct r

[PATCH v5 11/23] PCI: hotplug: movable BARs: Calculate immovable parts of bridge windows

2019-08-16 Thread Sergey Miroshnichenko
When movable BARs are enabled, and if a bridge contains a device with fixed
(IORESOURCE_PCI_FIXED) or immovable BARs, the corresponding windows can't be
moved too far away from their original positions - they must still contain
all the fixed/immovable BARs, like this:

  1) Window position before a bus rescan:

  | <--root bridge window--> |
  |  |
  | | <-- bridge window--> | |
  | | movable BARs | **fixed BAR** | |

  2) Possible valid outcome after rescan and move:

  | <--root bridge window--> |
  |  |
  || <-- bridge window--> |  |
  || **fixed BAR** | Movable BARs |  |

An immovable area of a bridge (separate for IO, MEM and MEM64 window types)
is a range that covers all the fixed and immovable BARs of direct children,
and all the fixed areas of child bridges:

  | <--root bridge window--> |
  |  |
  |  | <--  bridge window level 1--> |   |
  |  |  immovable area of this bridge window |   |
  |  |   |   |
  |  | **fixed BAR**  | <--  bridge window level 2--> | BARs |   |
  |  || * fixed area of this bridge * |  |   |
  |  ||   |  |   |
  |  || ***fixed BAR*** |   | ***fixed BAR*** |  |   |

To store these areas, the .immovable_range field has been added to struct
pci_bus. It is filled recursively from leaves to the root before a rescan.

Also make pbus_size_io() and pbus_size_mem() return their usual result OR
the size of an immovable range of the corresponding type, depending on which
one is larger.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   | 14 +++
 drivers/pci/probe.c | 88 +
 drivers/pci/setup-bus.c | 17 
 include/linux/pci.h |  6 +++
 4 files changed, 125 insertions(+)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 53249cbc21b6..12add575faf1 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -371,6 +371,20 @@ static inline bool pci_dev_is_disconnected(const struct 
pci_dev *dev)
return dev->error_state == pci_channel_io_perm_failure;
 }
 
+static inline int pci_get_bridge_resource_idx(struct resource *r)
+{
+   int idx = 1;
+
+   if (r->flags & IORESOURCE_IO)
+   idx = 0;
+   else if (!(r->flags & IORESOURCE_PREFETCH))
+   idx = 1;
+   else if (r->flags & IORESOURCE_MEM_64)
+   idx = 2;
+
+   return idx;
+}
+
 /* pci_dev priv_flags */
 #define PCI_DEV_ADDED 0
 #define PCI_DEV_DISABLED_BARS 1
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index bf0a7d1c5d09..5f52a19738aa 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -550,6 +550,7 @@ void pci_read_bridge_bases(struct pci_bus *child)
 static struct pci_bus *pci_alloc_bus(struct pci_bus *parent)
 {
struct pci_bus *b;
+   int idx;
 
b = kzalloc(sizeof(*b), GFP_KERNEL);
if (!b)
@@ -566,6 +567,11 @@ static struct pci_bus *pci_alloc_bus(struct pci_bus 
*parent)
if (parent)
b->domain_nr = parent->domain_nr;
 #endif
+   for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) {
+   b->immovable_range[idx].start = 0;
+   b->immovable_range[idx].end = 0;
+   }
+
return b;
 }
 
@@ -3512,6 +3518,87 @@ static void pci_setup_bridges(struct pci_bus *bus)
pci_setup_bridge(bus);
 }
 
+static void pci_bus_update_immovable_range(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   int idx;
+   resource_size_t start, end;
+
+   for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) {
+   bus->immovable_range[idx].start = 0;
+   bus->immovable_range[idx].end = 0;
+   }
+
+   list_for_each_entry(dev, >devices, bus_list)
+   if (dev->subordinate)
+   pci_bus_update_immovable_range(dev->subordinate);
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   int i;
+   bool dev_is_movable = pci_dev_movable_bars_supported(dev);
+   struct pci_bus *child = dev->subordinate;
+
+   for (i = 0; i < PCI_BRIDGE_RESOURCES; ++i) {
+   struct resource *r = >resource[i];
+
+   if (!r->flags || (r->flags & 

[PATCH v5 09/23] PCI: Prohibit assigning BARs and bridge windows to non-direct parents

2019-08-16 Thread Sergey Miroshnichenko
When movable BARs are enabled, the feature of resource relocating from
commit 2bbc6942273b5 ("PCI : ability to relocate assigned pci-resources")
is not used. Instead, inability to assign a resource is used as a signal
to retry BAR assignment with other configuration of bridge windows.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c |  2 ++
 drivers/pci/setup-res.c | 12 
 2 files changed, 14 insertions(+)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 2c250efca512..aee330047121 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1356,6 +1356,8 @@ static void pdev_assign_fixed_resources(struct pci_dev 
*dev)
while (b && !r->parent) {
assign_fixed_resource_on_bus(b, r);
b = b->parent;
+   if (!r->parent && pci_movable_bars_enabled())
+   break;
}
}
 }
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index d8ca40a97693..732d18f60f1b 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -298,6 +298,18 @@ static int _pci_assign_resource(struct pci_dev *dev, int 
resno,
 
bus = dev->bus;
while ((ret = __pci_assign_resource(bus, dev, resno, size, min_align))) 
{
+   if (pci_movable_bars_enabled()) {
+   if (resno >= PCI_BRIDGE_RESOURCES &&
+   resno <= PCI_BRIDGE_RESOURCE_END) {
+   struct resource *res = dev->resource + resno;
+
+   res->start = 0;
+   res->end = 0;
+   res->flags = 0;
+   }
+   break;
+   }
+
if (!bus->parent || !bus->self->transparent)
break;
bus = bus->parent;
-- 
2.21.0



[PATCH v5 10/23] PCI: hotplug: movable BARs: Try to assign unassigned resources only once

2019-08-16 Thread Sergey Miroshnichenko
With enabled BAR movement, BARs and bridge windows can only be assigned to
their direct parents, so there can be only one variant of resource tree,
thus every retry within the pci_assign_unassigned_root_bus_resources() will
result in the same tree, and it is enough to try just once.

In case of failures the pci_reassign_root_bus_resources() disables BARs for
one of the hotplugged devices and tries the assignment again.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index aee330047121..33f709095675 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1819,6 +1819,13 @@ void pci_assign_unassigned_root_bus_resources(struct 
pci_bus *bus)
int pci_try_num = 1;
enum enable_type enable_local;
 
+   if (pci_movable_bars_enabled()) {
+   __pci_bus_size_bridges(bus, NULL);
+   __pci_bus_assign_resources(bus, NULL, NULL);
+
+   goto dump;
+   }
+
/* Don't realloc if asked to do so */
enable_local = pci_realloc_detect(bus, pci_realloc_enable);
if (pci_realloc_enabled(enable_local)) {
-- 
2.21.0



[PATCH v5 07/23] PCI: hotplug: movable BARs: Don't allow added devices to steal resources

2019-08-16 Thread Sergey Miroshnichenko
When movable BARs are enabled, the PCI subsystem at first releases all the
bridge windows and then attempts to assign resources both to previously
working devices and to the newly hotplugged ones, with the same priority.

If a hotplugged device gets its BARs first, this may lead to lack of space
for already working devices, which is unacceptable. If that happens, mark
one of the new devices with the newly introduced flag PCI_DEV_DISABLED_BARS
(if it is not yet marked) and retry the BAR recalculation.

The worst case would be no BARs for hotplugged devices, while all the rest
just continue working.

The algorithm is simple and it doesn't retry different subsets of hot-added
devices in case of a failure, e.g. if there is no space to allocate BARs
for both hotplugged devices A and B, but there is enough for just A, then A
will be marked with PCI_DEV_DISABLED_BARS first, then (after the next
failure) - B. As a result, A will not get BARs while it could. This issue is
only relevant when hotplugging two or more devices simultaneously.

Add a new res_mask bitmask to the struct pci_dev for storing the indices of
assigned BARs.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   |  11 +
 drivers/pci/probe.c | 101 ++--
 drivers/pci/setup-bus.c |  15 ++
 include/linux/pci.h |   1 +
 4 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index a0ec696512eb..53249cbc21b6 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -373,6 +373,7 @@ static inline bool pci_dev_is_disconnected(const struct 
pci_dev *dev)
 
 /* pci_dev priv_flags */
 #define PCI_DEV_ADDED 0
+#define PCI_DEV_DISABLED_BARS 1
 
 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
 {
@@ -384,6 +385,16 @@ static inline bool pci_dev_is_added(const struct pci_dev 
*dev)
return test_bit(PCI_DEV_ADDED, >priv_flags);
 }
 
+static inline void pci_dev_disable_bars(struct pci_dev *dev)
+{
+   assign_bit(PCI_DEV_DISABLED_BARS, >priv_flags, true);
+}
+
+static inline bool pci_dev_bars_enabled(const struct pci_dev *dev)
+{
+   return !test_bit(PCI_DEV_DISABLED_BARS, >priv_flags);
+}
+
 #ifdef CONFIG_PCIEAER
 #include 
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a26bf740e9ab..bf0a7d1c5d09 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3428,6 +3428,23 @@ void __weak pcibios_rescan_done(struct pci_dev *dev)
 {
 }
 
+static unsigned int pci_dev_count_res_mask(struct pci_dev *dev)
+{
+   unsigned int res_mask = 0;
+   int i;
+
+   for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) {
+   struct resource *r = >resource[i];
+
+   if (!r->flags || (r->flags & IORESOURCE_UNSET) || !r->parent)
+   continue;
+
+   res_mask |= (1 << i);
+   }
+
+   return res_mask;
+}
+
 static void pci_bus_rescan_prepare(struct pci_bus *bus)
 {
struct pci_dev *dev;
@@ -3438,6 +3455,8 @@ static void pci_bus_rescan_prepare(struct pci_bus *bus)
list_for_each_entry(dev, >devices, bus_list) {
struct pci_bus *child = dev->subordinate;
 
+   dev->res_mask = pci_dev_count_res_mask(dev);
+
if (child)
pci_bus_rescan_prepare(child);
 
@@ -3481,7 +3500,7 @@ static void pci_setup_bridges(struct pci_bus *bus)
list_for_each_entry(dev, >devices, bus_list) {
struct pci_bus *child;
 
-   if (!pci_dev_is_added(dev) || pci_dev_is_ignored(dev))
+   if (!pci_dev_is_added(dev) || !pci_dev_bars_enabled(dev))
continue;
 
child = dev->subordinate;
@@ -3493,6 +3512,83 @@ static void pci_setup_bridges(struct pci_bus *bus)
pci_setup_bridge(bus);
 }
 
+static struct pci_dev *pci_find_next_new_device(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   if (!bus)
+   return NULL;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *child_bus = dev->subordinate;
+
+   if (!pci_dev_is_added(dev) && pci_dev_bars_enabled(dev))
+   return dev;
+
+   if (child_bus) {
+   struct pci_dev *next_new_dev;
+
+   next_new_dev = pci_find_next_new_device(child_bus);
+   if (next_new_dev)
+   return next_new_dev;
+   }
+   }
+
+   return NULL;
+}
+
+static bool pci_bus_check_all_bars_reassigned(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   bool ret = true;
+
+   if (!bus)
+   return false;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *child = dev->subordinate;
+   unsigned int res_mask = pci_dev_count_res_mask(dev);
+
+  

[PATCH v5 08/23] PCI: Include fixed and immovable BARs into the bus size calculating

2019-08-16 Thread Sergey Miroshnichenko
The only difference between the fixed/immovable and movable BARs is that the
former preserve their size and offset after they are released (the
corresponding struct resource* is detached from a bridge window for a while
during a bus rescan).

Include fixed/immovable BARs into result of pbus_size_mem() and prohibit
assigning them to non-direct parents.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 1a731002ce18..2c250efca512 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1011,12 +1011,21 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
struct resource *r = >resource[i];
resource_size_t r_size;
 
-   if (r->parent || (r->flags & IORESOURCE_PCI_FIXED) ||
+   if (r->parent ||
((r->flags & mask) != type &&
 (r->flags & mask) != type2 &&
 (r->flags & mask) != type3))
continue;
r_size = resource_size(r);
+
+   if ((r->flags & IORESOURCE_PCI_FIXED) ||
+   !pci_dev_movable_bars_supported(dev)) {
+   if (pci_movable_bars_enabled())
+   size += r_size;
+
+   continue;
+   }
+
 #ifdef CONFIG_PCI_IOV
/* Put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
-- 
2.21.0



[PATCH v5 06/23] PCI: hotplug: movable BARs: Recalculate all bridge windows during rescan

2019-08-16 Thread Sergey Miroshnichenko
When the movable BARs feature is enabled and a rescan has been requested,
release all the bridge windows and recalculate them from scratch, taking
into account all kinds of BARs: fixed, immovable, movable, new.

This increases the chances of finding memory space to fit BARs for newly
hotplugged devices, especially if no/not enough gaps were reserved by the
BIOS/bootloader/firmware.

The last step of writing the recalculated windows to the bridges is done
by the new pci_setup_bridges() function.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   |  1 +
 drivers/pci/probe.c | 22 ++
 drivers/pci/setup-bus.c | 16 
 3 files changed, 39 insertions(+)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index be7acc477c64..a0ec696512eb 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -253,6 +253,7 @@ void __pci_bus_assign_resources(const struct pci_bus *bus,
struct list_head *realloc_head,
struct list_head *fail_head);
 bool pci_bus_clip_resource(struct pci_dev *dev, int idx);
+void pci_bus_release_root_bridge_resources(struct pci_bus *bus);
 
 void pci_reassigndev_resource_alignment(struct pci_dev *dev);
 void pci_disable_bridge_window(struct pci_dev *dev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 60e3b48d2251..a26bf740e9ab 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3474,6 +3474,25 @@ static void pci_bus_rescan_done(struct pci_bus *bus)
pci_config_pm_runtime_put(bus->self);
 }
 
+static void pci_setup_bridges(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   struct pci_bus *child;
+
+   if (!pci_dev_is_added(dev) || pci_dev_is_ignored(dev))
+   continue;
+
+   child = dev->subordinate;
+   if (child)
+   pci_setup_bridges(child);
+   }
+
+   if (bus->self)
+   pci_setup_bridge(bus);
+}
+
 /**
  * pci_rescan_bus - Scan a PCI bus for devices
  * @bus: PCI bus to scan
@@ -3495,8 +3514,11 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
pci_bus_rescan_prepare(root);
 
max = pci_scan_child_bus(root);
+
+   pci_bus_release_root_bridge_resources(root);
pci_assign_unassigned_root_bus_resources(root);
 
+   pci_setup_bridges(root);
pci_bus_rescan_done(root);
} else {
max = pci_scan_child_bus(bus);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 7c2c57f77c6f..04f626e1ac18 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1635,6 +1635,22 @@ static void pci_bus_release_bridge_resources(struct 
pci_bus *bus,
pci_bridge_release_resources(bus, type);
 }
 
+void pci_bus_release_root_bridge_resources(struct pci_bus *root_bus)
+{
+   int i;
+   struct resource *r;
+
+   pci_bus_release_bridge_resources(root_bus, IORESOURCE_IO, 
whole_subtree);
+   pci_bus_release_bridge_resources(root_bus, IORESOURCE_MEM, 
whole_subtree);
+   pci_bus_release_bridge_resources(root_bus,
+IORESOURCE_MEM_64 | 
IORESOURCE_PREFETCH,
+whole_subtree);
+
+   pci_bus_for_each_resource(root_bus, r, i) {
+   pci_release_child_resources(root_bus, r);
+   }
+}
+
 static void pci_bus_dump_res(struct pci_bus *bus)
 {
struct resource *res;
-- 
2.21.0



[PATCH v5 05/23] PCI: hotplug: movable BARs: Fix reassigning the released bridge windows

2019-08-16 Thread Sergey Miroshnichenko
When a bridge window is temporarily released during the rescan, its old
size is not relevant anymore - it will be recreated from pbus_size_*(), so
its start value should be zero.

If such window can't be reassigned, don't apply reset_resource(), so the
next retry may succeed.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 6cb8b293c576..7c2c57f77c6f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -295,7 +295,8 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
0 /* don't care */,
0 /* don't care */);
}
-   reset_resource(res);
+   if (!pci_movable_bars_enabled())
+   reset_resource(res);
}
}
 }
@@ -1579,8 +1580,8 @@ static void pci_bridge_release_resources(struct pci_bus 
*bus,
type = old_flags = r->flags & PCI_RES_TYPE_MASK;
pci_info(dev, "resource %d %pR released\n",
 PCI_BRIDGE_RESOURCES + idx, r);
-   /* Keep the old size */
-   r->end = resource_size(r) - 1;
+   /* Don't keep the old size if the bridge will be recalculated */
+   r->end = pci_movable_bars_enabled() ? 0 : (resource_size(r) - 
1);
r->start = 0;
r->flags = 0;
 
-- 
2.21.0



[PATCH v5 04/23] PCI: Define PCI-specific version of the release_child_resources()

2019-08-16 Thread Sergey Miroshnichenko
If the bridge resources are released with the standard
release_child_resources(), it resets the .start field of the children's BARs
to zero but leaves the STARTALIGN flag set, which makes the resource invalid
for reassignment.

Some resources must preserve their offset and size: those marked with the
PCI_FIXED and the immovable ones - which are bound by drivers without
support of the movable BARs feature.

Add the pci_release_child_resources() to replace release_child_resources()
in handling the described PCI-specific cases.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 54 -
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1fa6519be..6cb8b293c576 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1482,6 +1482,55 @@ static void __pci_bridge_assign_resources(const struct 
pci_dev *bridge,
(IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH |\
 IORESOURCE_MEM_64)
 
+/*
+ * Similar to generic release_child_resources(), but aware of immovable BARs 
and
+ * PCI_FIXED and STARTALIGN flags
+ */
+static void pci_release_child_resources(struct pci_bus *bus, struct resource 
*r)
+{
+   struct pci_dev *dev;
+
+   if (!bus || !r)
+   return;
+
+   if (r->flags & IORESOURCE_PCI_FIXED)
+   return;
+
+   r->child = NULL;
+
+   list_for_each_entry(dev, >devices, bus_list) {
+   int i;
+
+   for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+   struct resource *tmp = >resource[i];
+   resource_size_t size = resource_size(tmp);
+
+   if (!tmp->flags || tmp->parent != r)
+   continue;
+
+   tmp->parent = NULL;
+   tmp->sibling = NULL;
+
+   pci_release_child_resources(dev->subordinate, tmp);
+
+   if ((tmp->flags & IORESOURCE_PCI_FIXED) ||
+   !pci_dev_movable_bars_supported(dev)) {
+   pci_dbg(dev, "release immovable %pR (%s), keep 
its flags, base and size\n",
+   tmp, tmp->name);
+   continue;
+   }
+
+   pci_dbg(dev, "release %pR (%s)\n", tmp, tmp->name);
+
+   tmp->start = 0;
+   tmp->end = size - 1;
+
+   tmp->flags &= ~IORESOURCE_STARTALIGN;
+   tmp->flags |= IORESOURCE_SIZEALIGN;
+   }
+   }
+}
+
 static void pci_bridge_release_resources(struct pci_bus *bus,
 unsigned long type)
 {
@@ -1522,7 +1571,10 @@ static void pci_bridge_release_resources(struct pci_bus 
*bus,
return;
 
/* If there are children, release them all */
-   release_child_resources(r);
+   if (pci_movable_bars_enabled())
+   pci_release_child_resources(bus, r);
+   else
+   release_child_resources(r);
if (!release_resource(r)) {
type = old_flags = r->flags & PCI_RES_TYPE_MASK;
pci_info(dev, "resource %d %pR released\n",
-- 
2.21.0



[PATCH v5 03/23] PCI: hotplug: Add a flag for the movable BARs feature

2019-08-16 Thread Sergey Miroshnichenko
When hot-adding a device, the bridge may have windows not big enough (or
fragmented too much) for newly requested BARs to fit in. And expanding
these bridge windows may be impossible because blocked by "neighboring"
BARs and bridge windows.

Still, it may be possible to allocate a memory region for new BARs with the
following procedure:

1) notify all the drivers which support movable BARs to pause and release
   the BARs; the rest of the drivers are guaranteed that their devices will
   not get BARs moved;

2) release all the bridge windows except those of root bridges;

3) try to recalculate new bridge windows that will fit all the BAR types:
   - fixed;
   - immovable;
   - movable;
   - newly requested by hot-added devices;

4) if the previous step fails, disable BARs for one of the hot-added
   devices and retry from step 3;

5) notify the drivers, so they remap BARs and resume.

This makes the prior reservation of memory by BIOS/bootloader/firmware not
required anymore for the PCI hotplug.

Drivers indicate their support of movable BARs by implementing the new
.rescan_prepare() and .rescan_done() hooks in the struct pci_driver. All
device's activity must be paused during a rescan, and iounmap()+ioremap()
must be applied to every used BAR.

The platform may also need to prepare for BAR movement, so new hooks are
added: pcibios_rescan_prepare(pci_dev) and pcibios_rescan_done(pci_dev).

This patch is a preparation for future patches with actual implementation,
and for now it just does the following:
 - declares the feature;
 - defines pci_movable_bars_enabled(), pci_dev_movable_bars_supported(dev);
 - invokes the .rescan_prepare() and .rescan_done() driver notifiers;
 - declares and invokes the pcibios_rescan_prepare()/_done() hooks;
 - adds the PCI_IMMOVABLE_BARS flag.

The feature is disabled by default (via PCI_IMMOVABLE_BARS) until the final
patch of the series. It can be overridden per-arch using this flag or by
the following command line option:

pcie_movable_bars={ off | force }

CC: Sam Bobroff 
CC: Rajat Jain 
CC: Lukas Wunner 
CC: Oliver O'Halloran 
CC: David Laight 
Signed-off-by: Sergey Miroshnichenko 
---
 .../admin-guide/kernel-parameters.txt |  7 ++
 drivers/pci/pci-driver.c  |  2 +
 drivers/pci/pci.c | 24 ++
 drivers/pci/pci.h |  2 +
 drivers/pci/probe.c   | 86 ++-
 include/linux/pci.h   |  7 ++
 6 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 47d981a86e2f..e2274ee87a35 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3526,6 +3526,13 @@
nomsi   Do not use MSI for native PCIe PME signaling (this makes
all PCIe root ports use INTx for all services).
 
+   pcie_movable_bars=[PCIE]
+   Override the movable BARs support detection:
+   off
+   Disable even if supported by the platform
+   force
+   Enable even if not explicitly declared as supported
+
pcmv=   [HW,PCMCIA] BadgePAD 4
 
pd_ignore_unused
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index a8124e47bf6e..d11909e79263 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -1688,6 +1688,8 @@ static int __init pci_driver_init(void)
 {
int ret;
 
+   pci_add_flags(PCI_IMMOVABLE_BARS);
+
ret = bus_register(_bus_type);
if (ret)
return ret;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 61d951766087..3a504f58ac60 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -139,6 +139,30 @@ static int __init pcie_port_pm_setup(char *str)
 }
 __setup("pcie_port_pm=", pcie_port_pm_setup);
 
+static bool pcie_movable_bars_off;
+static bool pcie_movable_bars_force;
+static int __init pcie_movable_bars_setup(char *str)
+{
+   if (!strcmp(str, "off"))
+   pcie_movable_bars_off = true;
+   else if (!strcmp(str, "force"))
+   pcie_movable_bars_force = true;
+   return 1;
+}
+__setup("pcie_movable_bars=", pcie_movable_bars_setup);
+
+bool pci_movable_bars_enabled(void)
+{
+   if (pcie_movable_bars_off)
+   return false;
+
+   if (pcie_movable_bars_force)
+   return true;
+
+   return !pci_has_flag(PCI_IMMOVABLE_BARS);
+}
+EXPORT_SYMBOL(pci_movable_bars_enabled);
+
 /* Time to wait after a reset for device to become responsive */
 #define PCIE_RESET_READY_POLL_MS 6
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d22d1b807701..be7acc477c64 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -257,6 +257,8 @@ bool pci_bus_

[PATCH v5 01/23] PCI: Fix race condition in pci_enable/disable_device()

2019-08-16 Thread Sergey Miroshnichenko
This is yet another approach to fix an old [1-2] concurrency issue, when:
 - two or more devices are being hot-added into a bridge which was
   initially empty;
 - a bridge with two or more devices is being hot-added;
 - during boot, if BIOS/bootloader/firmware doesn't pre-enable bridges.

The problem is that a bridge is reported as enabled before the MEM/IO bits
are actually written to the PCI_COMMAND register, so another driver thread
starts memory requests through the not-yet-enabled bridge:

 CPU0CPU1

 pci_enable_device_mem() pci_enable_device_mem()
   pci_enable_bridge() pci_enable_bridge()
 pci_is_enabled()
   return false;
 atomic_inc_return(enable_cnt)
 Start actual enabling the bridge
 ... pci_is_enabled()
 ...   return true;
 ... Start memory requests <-- FAIL
 ...
 Set the PCI_COMMAND_MEMORY bit <-- Must wait for this

Protect pci_enable/disable_device() and pci_enable_bridge(), similarly to
the previous solution from commit 40f11adc7cd9 ("PCI: Avoid race while
enabling upstream bridges"), but with per-device mutexes, and prevent
dev->enable_cnt from being incremented early.

CC: Srinath Mannam 
CC: Marta Rybczynska 
Signed-off-by: Sergey Miroshnichenko 

[1] 
https://lore.kernel.org/linux-pci/1501858648-8-1-git-send-email-srinath.man...@broadcom.com/T/#u
[RFC PATCH v3] pci: Concurrency issue during pci enable bridge

[2] 
https://lore.kernel.org/linux-pci/744877924.5841545.1521630049567.javamail.zim...@kalray.eu/T/#u
[RFC PATCH] nvme: avoid race-conditions when enabling devices
---
 drivers/pci/pci.c   | 26 ++
 drivers/pci/probe.c |  1 +
 include/linux/pci.h |  1 +
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 1b27b5af3d55..e7f8c354e644 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1645,6 +1645,8 @@ static void pci_enable_bridge(struct pci_dev *dev)
struct pci_dev *bridge;
int retval;
 
+   mutex_lock(>enable_mutex);
+
bridge = pci_upstream_bridge(dev);
if (bridge)
pci_enable_bridge(bridge);
@@ -1652,6 +1654,7 @@ static void pci_enable_bridge(struct pci_dev *dev)
if (pci_is_enabled(dev)) {
if (!dev->is_busmaster)
pci_set_master(dev);
+   mutex_unlock(>enable_mutex);
return;
}
 
@@ -1660,11 +1663,14 @@ static void pci_enable_bridge(struct pci_dev *dev)
pci_err(dev, "Error enabling bridge (%d), continuing\n",
retval);
pci_set_master(dev);
+   mutex_unlock(>enable_mutex);
 }
 
 static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 {
struct pci_dev *bridge;
+   /* Enable-locking of bridges is performed within the 
pci_enable_bridge() */
+   bool need_lock = !dev->subordinate;
int err;
int i, bars = 0;
 
@@ -1680,8 +1686,13 @@ static int pci_enable_device_flags(struct pci_dev *dev, 
unsigned long flags)
dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
}
 
-   if (atomic_inc_return(>enable_cnt) > 1)
+   if (need_lock)
+   mutex_lock(>enable_mutex);
+   if (pci_is_enabled(dev)) {
+   if (need_lock)
+   mutex_unlock(>enable_mutex);
return 0;   /* already enabled */
+   }
 
bridge = pci_upstream_bridge(dev);
if (bridge)
@@ -1696,8 +1707,10 @@ static int pci_enable_device_flags(struct pci_dev *dev, 
unsigned long flags)
bars |= (1 << i);
 
err = do_pci_enable_device(dev, bars);
-   if (err < 0)
-   atomic_dec(>enable_cnt);
+   if (err >= 0)
+   atomic_inc(>enable_cnt);
+   if (need_lock)
+   mutex_unlock(>enable_mutex);
return err;
 }
 
@@ -1941,15 +1954,20 @@ void pci_disable_device(struct pci_dev *dev)
if (dr)
dr->enabled = 0;
 
+   mutex_lock(>enable_mutex);
dev_WARN_ONCE(>dev, atomic_read(>enable_cnt) <= 0,
  "disabling already-disabled device");
 
-   if (atomic_dec_return(>enable_cnt) != 0)
+   if (atomic_dec_return(>enable_cnt) != 0) {
+   mutex_unlock(>enable_mutex);
return;
+   }
 
do_pci_disable_device(dev);
 
dev->is_busmaster = 0;
+
+   mutex_unlock(>enable_mutex);
 }
 EXPORT_SYMBOL(pci_disable_device);
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a3c7338fad86..2e58ece820e8 100644
--- a/drivers/pci/probe.c
+++ b/d

[PATCH v5 02/23] PCI: Enable bridge's I/O and MEM access for hotplugged devices

2019-08-16 Thread Sergey Miroshnichenko
The PCI_COMMAND_IO and PCI_COMMAND_MEMORY bits of the bridge must be
updated not only when enabling the bridge for the first time, but also if a
hotplugged device requests these types of resources.

Originally these bits were set by the pci_enable_device_flags() only, which
exits early if the bridge is already pci_is_enabled(). So if the bridge was
empty initially (an edge case), then hotplugged devices fail to access IO/MEM.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e7f8c354e644..61d951766087 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1652,6 +1652,14 @@ static void pci_enable_bridge(struct pci_dev *dev)
pci_enable_bridge(bridge);
 
if (pci_is_enabled(dev)) {
+   int i, bars = 0;
+
+   for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++) {
+   if (dev->resource[i].flags & (IORESOURCE_MEM | 
IORESOURCE_IO))
+   bars |= (1 << i);
+   }
+   do_pci_enable_device(dev, bars);
+
if (!dev->is_busmaster)
pci_set_master(dev);
mutex_unlock(>enable_mutex);
-- 
2.21.0



[PATCH v5 00/23] PCI: Allow BAR movement during hotplug

2019-08-16 Thread Sergey Miroshnichenko
If the firmware or kernel has arranged memory for PCIe devices in a way
that doesn't provide enough space for BARs of a new hotplugged device, the
kernel can pause the drivers of the "obstructing" devices and move their
BARs, so the new BARs can fit into the freed spaces.

To rearrange the BARs and bridge windows these patches release all of them
after a rescan and re-assigns in the same way as during the initial PCIe
topology scan at system boot.

When a driver is un-paused by the kernel after the PCIe rescan, it should
check if its BARs had moved, and ioremap() them.

Drivers indicate their support of the feature by implementing the new hooks
.rescan_prepare() and .rescan_done() in the struct pci_driver. If a driver
doesn't yet support the feature, BARs of its devices will be considered as
immovable (by checking the pci_dev_movable_bars_supported(dev)) and handled
in the same way as resources with the IORESOURCE_PCI_FIXED flag.

If a driver doesn't yet support the feature, its devices are guaranteed to
have their BARs remaining untouched.

Tested on:
 - x86_64 with "pci=realloc,assign-busses,use_crs,pcie_bus_peer2peer";
 - POWER8 PowerNV+OPAL+PHB3 ppc64le with [1] applied and the following:
   "pci=realloc,pcie_bus_peer2peer";
 - both platforms [with extra patches (yet to be submitted) for movable bus
   numbers]: manually initiated (via sysfs) rescan has found and turned on
   a hotplugged bridge.

Not so many platforms and test cases were covered, so all who are
interested are highly welcome to test on your setups - the more exotic the
better!

This patchset is a part of our work on adding support for hotplugging
bridges full of other bridges, NVME drives, SAS HBAs and GPUs without
special requirements such as Hot-Plug Controller, reservation of bus
numbers or memory regions by firmware, etc. The next patchset to submit
will implement the movable bus numbers.

[1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-August/195272.html
[PATCH v6 0/5] powerpc/powernv/pci: Make hotplug self-sufficient, 
independent of FW and DT

Changes since v4:
 - Feature is enabled by default (turned on by one of the latest patches);
 - Add pci_dev_movable_bars_supported(dev) instead of marking the immovable
   BARs with the IORESOURCE_PCI_FIXED flag;
 - Set up PCIe bridges during rescan via sysfs, so MPS settings are now
   configured not only during system boot or pcihp events;
 - Allow movement of switch's BARs if claimed by portdrv;
 - Update EEH address caches after rescan for powerpc;
 - Don't completely disable hot-added devices whose BARs can't be fit -
   just disable their BARs, so they are still visible in lspci etc;
 - Clearer names: fixed_range_hard -> immovable_range, fixed_range_soft ->
   realloc_range;
 - Drop the patch for pci_restore_config_space() - fixed by properly using
   the runtime PM.

Changes since v3:
 - Rebased to the upstream, so the patches apply cleanly again.

Changes since v2:
 - Fixed double-assignment of bridge windows;
 - Fixed assignment of fixed prefetched resources;
 - Fixed releasing of fixed resources;
 - Fixed a debug message;
 - Removed auto-enabling the movable BARs for x86 - let's rely on the
   "pcie_movable_bars=force" option for now;
 - Reordered the patches - bugfixes first.

Changes since v1:
 - Add a "pcie_movable_bars={ off | force }" command line argument;
 - Handle the IORESOURCE_PCI_FIXED flag properly;
 - Don't move BARs of devices which don't support the feature;
 - Guarantee that new hotplugged devices will not steal memory from working
   devices by ignoring the failing new devices with the new PCI_DEV_IGNORE
   flag;
 - Add rescan_prepare()+rescan_done() to the struct pci_driver instead of
   using the reset_prepare()+reset_done() from struct pci_error_handlers;
 - Add a bugfix of a race condition;
 - Fixed hotplug in a non-pre-enabled (by BIOS/firmware) bridge;
 - Fix the compatibility of the feature with pm_runtime and D3-state;
 - Hotplug events from pciehp also can move BARs;
 - Add support of the feature to the NVME driver.

Sergey Miroshnichenko (23):
  PCI: Fix race condition in pci_enable/disable_device()
  PCI: Enable bridge's I/O and MEM access for hotplugged devices
  PCI: hotplug: Add a flag for the movable BARs feature
  PCI: Define PCI-specific version of the release_child_resources()
  PCI: hotplug: movable BARs: Fix reassigning the released bridge
windows
  PCI: hotplug: movable BARs: Recalculate all bridge windows during
rescan
  PCI: hotplug: movable BARs: Don't allow added devices to steal
resources
  PCI: Include fixed and immovable BARs into the bus size calculating
  PCI: Prohibit assigning BARs and bridge windows to non-direct parents
  PCI: hotplug: movable BARs: Try to assign unassigned resources only
once
  PCI: hotplug: movable BARs: Calculate immovable parts of bridge
windows
  PCI: hotplug: movable BARs: Compute limits for relocated br

[PATCH v6 5/5] powerpc/pci: Enable assigning bus numbers instead of reading them from DT

2019-08-16 Thread Sergey Miroshnichenko
If the firmware indicates support of reassigning bus numbers via the PHB's
"ibm,supported-movable-bdfs" property in DT, PowerNV will not depend on PCI
topology info from DT anymore.

This makes it possible to re-enumerate the fabric, assign the new bus numbers
and switch from the pnv_php module to the standard pciehp driver for PCI
hotplug functionality.

Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/pci_dn.c | 5 +
 arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 261d61460eac..90f8d46550df 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -542,6 +542,11 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb)
phb->pci_data = pdn;
}
 
+   if (of_get_property(dn, "ibm,supported-movable-bdfs", NULL)) {
+   pci_add_flags(PCI_REASSIGN_ALL_BUS);
+   return;
+   }
+
/* Update dn->phb ptrs for new phb and children devices */
pci_traverse_device_nodes(dn, add_pdn, phb);
 }
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 620a986209f5..eb01f16c4e60 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -41,7 +41,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
 {
struct pci_dn *pdn = pci_get_pdn(pdev);
 
-   if (!pdev->is_virtfn)
+   if (!pdev->is_virtfn && !pci_has_flag(PCI_REASSIGN_ALL_BUS))
return;
 
/*
-- 
2.21.0



[PATCH v6 4/5] powerpc/powernv/pci: Hook up the writes to PCI_SECONDARY_BUS register

2019-08-16 Thread Sergey Miroshnichenko
Writing a new value to the PCI_SECONDARY_BUS register of the bridge means
that its children will become addressable on another address (new B in BDF)
or even un-addressable if the secondary bus is set to zero.

On PowerNV, device PEs are heavily BDF-dependent, so they must be updated
on every such change of their address.

Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/platforms/powernv/pci.c | 118 ++-
 1 file changed, 116 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index a5b04410c8b4..e9b4ed0f97a3 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -717,13 +717,127 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
where, size, val);
 }
 
+static void invalidate_children_pes(struct pci_dn *pdn)
+{
+   struct pnv_phb *phb = pdn->phb->private_data;
+   struct pci_dn *child;
+   bool found_pe = false;
+   int pe_num;
+   int pe_bus;
+
+   list_for_each_entry(child, >child_list, list) {
+   struct pnv_ioda_pe *pe = (child->pe_number != IODA_INVALID_PE) ?
+   >ioda.pe_array[child->pe_number] :
+   NULL;
+
+   if (!child->busno)
+   continue;
+
+   if ((child->class_code >> 8) == PCI_CLASS_BRIDGE_PCI)
+   invalidate_children_pes(child);
+
+   if (pe) {
+   u8 rid_bus = (pe->rid >> 8) & 0xff;
+
+   if (rid_bus) {
+   pe_num = child->pe_number;
+   pe_bus = rid_bus;
+   found_pe = true;
+   }
+
+   pe->rid &= 0xff;
+   }
+
+   child->busno = 0;
+   }
+
+   if (found_pe) {
+   u16 rid = pe_bus << 8;
+
+   opal_pci_set_pe(phb->opal_id, pe_num, rid, 7, 0, 0, 
OPAL_UNMAP_PE);
+   }
+}
+
+static u8 pre_hook_new_sec_bus(struct pci_dn *pdn, u8 new_secondary_bus)
+{
+   u32 old_secondary_bus = 0;
+
+   if ((pdn->class_code >> 8) != PCI_CLASS_BRIDGE_PCI)
+   return 0;
+
+   pnv_pci_cfg_read(pdn, PCI_SECONDARY_BUS, 1, _secondary_bus);
+   old_secondary_bus &= 0xff;
+
+   if (old_secondary_bus != new_secondary_bus)
+   invalidate_children_pes(pdn);
+
+   return old_secondary_bus;
+}
+
+static void update_children_pes(struct pci_dn *pdn, u8 new_secondary_bus)
+{
+   struct pnv_phb *phb = pdn->phb->private_data;
+   struct pci_dn *child;
+   bool found_pe = false;
+   int pe_num;
+
+   if (!new_secondary_bus)
+   return;
+
+   list_for_each_entry(child, >child_list, list) {
+   struct pnv_ioda_pe *pe = (child->pe_number != IODA_INVALID_PE) ?
+   >ioda.pe_array[child->pe_number] :
+   NULL;
+
+   if (child->busno)
+   continue;
+
+   child->busno = new_secondary_bus;
+
+   if (pe) {
+   pe->rid |= (child->busno << 8);
+   pe_num = child->pe_number;
+   found_pe = true;
+   }
+   }
+
+   if (found_pe) {
+   u16 rid = new_secondary_bus << 8;
+
+   opal_pci_set_pe(phb->opal_id, pe_num, rid, 7, 0, 0, 
OPAL_MAP_PE);
+   }
+}
+
+static void post_hook_new_sec_bus(struct pci_dn *pdn, u8 new_secondary_bus)
+{
+   if ((pdn->class_code >> 8) != PCI_CLASS_BRIDGE_PCI)
+   return;
+
+   update_children_pes(pdn, new_secondary_bus);
+}
+
 int pnv_pci_cfg_write(struct pci_dn *pdn,
  int where, int size, u32 val)
 {
struct pnv_phb *phb = pdn->phb->private_data;
+   u8 old_secondary_bus = 0, new_secondary_bus = 0;
+   int rc;
+
+   if (where == PCI_SECONDARY_BUS) {
+   new_secondary_bus = val & 0xff;
+   old_secondary_bus = pre_hook_new_sec_bus(pdn, 
new_secondary_bus);
+   } else if (where == PCI_PRIMARY_BUS && size > 1) {
+   new_secondary_bus = (val >> 8) & 0xff;
+   old_secondary_bus = pre_hook_new_sec_bus(pdn, 
new_secondary_bus);
+   }
 
-   return pnv_pci_cfg_write_raw(phb->opal_id, pdn->busno, pdn->devfn,
-where, size, val);
+   rc = pnv_pci_cfg_write_raw(phb->opal_id, pdn->busno, pdn->devfn,
+  where, size, val);
+
+   if (new_secondary_bus && old_secondary_bus != new_secondary_bus)
+   post_hook_new_sec_bus(pdn, new_secondary_bus);
+
+   return rc;
 }
 
 #if CONFIG_EEH
-- 
2.21.0



[PATCH v6 3/5] powerpc/pci: Create pci_dn on demand

2019-08-16 Thread Sergey Miroshnichenko
If a struct pci_dn hasn't yet been created for the PCIe device (there was
no DT node for it), allocate this structure and fill with info read from
the device directly.

Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/pci_dn.c | 88 ++--
 1 file changed, 74 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index e1a0ab2caafe..261d61460eac 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -20,6 +20,9 @@
 #include 
 #include 
 
+static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev,
+ struct pci_dn *parent);
+
 /*
  * The function is used to find the firmware data of one
  * specific PCI device, which is attached to the indicated
@@ -52,6 +55,9 @@ struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
dn = pci_bus_to_OF_node(pbus);
pdn = dn ? PCI_DN(dn) : NULL;
 
+   if (!pdn && pbus->self)
+   pdn = pbus->self->dev.archdata.pci_data;
+
return pdn;
 }
 
@@ -61,10 +67,13 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
struct device_node *dn = NULL;
struct pci_dn *parent, *pdn;
struct pci_dev *pdev = NULL;
+   bool pdev_found = false;
 
/* Fast path: fetch from PCI device */
list_for_each_entry(pdev, >devices, bus_list) {
if (pdev->devfn == devfn) {
+   pdev_found = true;
+
if (pdev->dev.archdata.pci_data)
return pdev->dev.archdata.pci_data;
 
@@ -73,6 +82,9 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
}
}
 
+   if (!pdev_found)
+   pdev = NULL;
+
/* Fast path: fetch from device node */
pdn = dn ? PCI_DN(dn) : NULL;
if (pdn)
@@ -85,9 +97,12 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
 
list_for_each_entry(pdn, >child_list, list) {
if (pdn->busno == bus->number &&
-pdn->devfn == devfn)
-return pdn;
-}
+   pdn->devfn == devfn) {
+   if (pdev)
+   pdev->dev.archdata.pci_data = pdn;
+   return pdn;
+   }
+   }
 
return NULL;
 }
@@ -117,17 +132,17 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
 
list_for_each_entry(pdn, >child_list, list) {
if (pdn->busno == pdev->bus->number &&
-   pdn->devfn == pdev->devfn)
+   pdn->devfn == pdev->devfn) {
+   pdev->dev.archdata.pci_data = pdn;
return pdn;
+   }
}
 
-   return NULL;
+   return pci_create_pdn_from_dev(pdev, parent);
 }
 
-#ifdef CONFIG_PCI_IOV
-static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
-  int vf_index,
-  int busno, int devfn)
+static struct pci_dn *pci_alloc_pdn(struct pci_dn *parent,
+   int busno, int devfn)
 {
struct pci_dn *pdn;
 
@@ -143,7 +158,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn 
*parent,
pdn->parent = parent;
pdn->busno = busno;
pdn->devfn = devfn;
-   pdn->vf_index = vf_index;
pdn->pe_number = IODA_INVALID_PE;
INIT_LIST_HEAD(>child_list);
INIT_LIST_HEAD(>list);
@@ -151,7 +165,51 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn 
*parent,
 
return pdn;
 }
-#endif
+
+static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev,
+ struct pci_dn *parent)
+{
+   struct pci_dn *pdn = NULL;
+   u32 class_code;
+   u16 device_id;
+   u16 vendor_id;
+
+   if (!parent)
+   return NULL;
+
+   pdn = pci_alloc_pdn(parent, pdev->bus->busn_res.start, pdev->devfn);
+   pci_info(pdev, "Create a new pdn for devfn %2x\n", pdev->devfn / 8);
+
+   if (!pdn) {
+   pci_err(pdev, "%s: Failed to allocate pdn\n", __func__);
+   return NULL;
+   }
+
+   #ifdef CONFIG_EEH
+   if (!eeh_dev_init(pdn)) {
+   kfree(pdn);
+   pci_err(pdev, "%s: Failed to allocate edev\n", __func__);
+   return NULL;
+   }
+   #endif /* CONFIG_EEH */
+
+   pci_bus_read_config_word(pdev->bus, pdev->devfn,
+PCI_VENDOR_ID, _id);
+   pdn->vendor_id = vendor_id;
+
+   pci_bus_read_config_word(pdev->bus, pdev->devfn,
+PCI_DEVICE_ID, _id);
+   pdn->device_id = device_id;
+
+   pci

[PATCH v6 2/5] powerpc/powernv/pci: Suppress an EEH error when reading an empty slot

2019-08-16 Thread Sergey Miroshnichenko
Reading an empty slot returns all ones, which triggers a false
EEH error event on PowerNV. This patch unfreezes the bus where
it has happened.

Reviewed-by: Oliver O'Halloran 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/include/asm/ppc-pci.h   |  1 +
 arch/powerpc/kernel/pci_dn.c |  2 +-
 arch/powerpc/platforms/powernv/pci.c | 31 +---
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h 
b/arch/powerpc/include/asm/ppc-pci.h
index cec2d6409515..8b51c8577b94 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -36,6 +36,7 @@ void *traverse_pci_dn(struct pci_dn *root,
  void *(*fn)(struct pci_dn *, void *),
  void *data);
 extern void pci_devs_phb_init_dynamic(struct pci_controller *phb);
+struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus);
 
 /* From rtas_pci.h */
 extern void init_pci_config_tokens (void);
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index c4c8c237a106..e1a0ab2caafe 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -27,7 +27,7 @@
  * one of PF's bridge. For other devices, their firmware
  * data is linked to that of their bridge.
  */
-static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
+struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
 {
struct pci_bus *pbus;
struct device_node *dn;
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 8d6c094f074e..a5b04410c8b4 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -756,6 +756,21 @@ static inline pnv_pci_cfg_check(struct pci_dn *pdn)
 }
 #endif /* CONFIG_EEH */
 
+static int get_bus_pe_number(struct pci_bus *bus)
+{
+   struct pci_dn *pdn = pci_bus_to_pdn(bus);
+   struct pci_dn *child;
+
+   if (!pdn)
+   return IODA_INVALID_PE;
+
+   list_for_each_entry(child, &pdn->child_list, list)
+   if (child->pe_number != IODA_INVALID_PE)
+   return child->pe_number;
+
+   return IODA_INVALID_PE;
+}
+
 static int pnv_pci_read_config(struct pci_bus *bus,
   unsigned int devfn,
   int where, int size, u32 *val)
@@ -767,9 +782,19 @@ static int pnv_pci_read_config(struct pci_bus *bus,
 
*val = 0xFFFFFFFF;
pdn = pci_get_pdn_by_devfn(bus, devfn);
-   if (!pdn)
-   return pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn,
-   where, size, val);
+   if (!pdn) {
+   int pe_number = get_bus_pe_number(bus);
+
+   ret = pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn,
+  where, size, val);
+
+   if (!ret && (*val == EEH_IO_ERROR_VALUE(size)) && 
phb->unfreeze_pe)
+   phb->unfreeze_pe(phb, (pe_number == IODA_INVALID_PE) ?
+phb->ioda.reserved_pe_idx : pe_number,
+OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+   return ret;
+   }
 
if (!pnv_pci_cfg_check(pdn))
return PCIBIOS_DEVICE_NOT_FOUND;
-- 
2.21.0



[PATCH v6 1/5] powerpc/pci: Access PCI config space directly w/o pci_dn

2019-08-16 Thread Sergey Miroshnichenko
To fetch an updated DT for the newly hotplugged device, the OS must explicitly
request it from the firmware via the pnv_php driver.

If pnv_php wasn't triggered/loaded, it is still possible to discover new
devices, as long as PCIe I/O is not stopped by the absence of the pci_dn
structure.

Reviewed-by: Oliver O'Halloran 
Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/kernel/rtas_pci.c   | 97 +++-
 arch/powerpc/platforms/powernv/pci.c | 64 --
 2 files changed, 109 insertions(+), 52 deletions(-)

diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
index ae5e43eaca48..912da28b3737 100644
--- a/arch/powerpc/kernel/rtas_pci.c
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -42,10 +42,26 @@ static inline int config_access_valid(struct pci_dn *dn, 
int where)
return 0;
 }
 
-int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
+static int rtas_read_raw_config(unsigned long buid, int busno, unsigned int 
devfn,
+   int where, int size, u32 *val)
 {
int returnval = -1;
-   unsigned long buid, addr;
+   unsigned long addr = rtas_config_addr(busno, devfn, where);
+   int ret;
+
+   if (buid) {
+   ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval,
+   addr, BUID_HI(buid), BUID_LO(buid), size);
+   } else {
+   ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size);
+   }
+   *val = returnval;
+
+   return ret;
+}
+
+int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
+{
int ret;
 
if (!pdn)
@@ -58,16 +74,8 @@ int rtas_read_config(struct pci_dn *pdn, int where, int 
size, u32 *val)
return PCIBIOS_SET_FAILED;
 #endif
 
-   addr = rtas_config_addr(pdn->busno, pdn->devfn, where);
-   buid = pdn->phb->buid;
-   if (buid) {
-   ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval,
-   addr, BUID_HI(buid), BUID_LO(buid), size);
-   } else {
-   ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size);
-   }
-   *val = returnval;
-
+   ret = rtas_read_raw_config(pdn->phb->buid, pdn->busno, pdn->devfn,
+  where, size, val);
if (ret)
return PCIBIOS_DEVICE_NOT_FOUND;
 
@@ -85,18 +93,44 @@ static int rtas_pci_read_config(struct pci_bus *bus,
 
pdn = pci_get_pdn_by_devfn(bus, devfn);
 
-   /* Validity of pdn is checked in here */
-   ret = rtas_read_config(pdn, where, size, val);
-   if (*val == EEH_IO_ERROR_VALUE(size) &&
-   eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
-   return PCIBIOS_DEVICE_NOT_FOUND;
+   if (pdn) {
+   /* Validity of pdn is checked in here */
+   ret = rtas_read_config(pdn, where, size, val);
+
+   if (*val == EEH_IO_ERROR_VALUE(size) &&
+   eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
+   ret = PCIBIOS_DEVICE_NOT_FOUND;
+   } else {
+   struct pci_controller *phb = pci_bus_to_host(bus);
+
+   ret = rtas_read_raw_config(phb->buid, bus->number, devfn,
+  where, size, val);
+   }
 
return ret;
 }
 
+static int rtas_write_raw_config(unsigned long buid, int busno, unsigned int 
devfn,
+int where, int size, u32 val)
+{
+   unsigned long addr = rtas_config_addr(busno, devfn, where);
+   int ret;
+
+   if (buid) {
+   ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr,
+   BUID_HI(buid), BUID_LO(buid), size, (ulong)val);
+   } else {
+   ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, 
(ulong)val);
+   }
+
+   if (ret)
+   return PCIBIOS_DEVICE_NOT_FOUND;
+
+   return PCIBIOS_SUCCESSFUL;
+}
+
 int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val)
 {
-   unsigned long buid, addr;
int ret;
 
if (!pdn)
@@ -109,15 +143,8 @@ int rtas_write_config(struct pci_dn *pdn, int where, int 
size, u32 val)
return PCIBIOS_SET_FAILED;
 #endif
 
-   addr = rtas_config_addr(pdn->busno, pdn->devfn, where);
-   buid = pdn->phb->buid;
-   if (buid) {
-   ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr,
-   BUID_HI(buid), BUID_LO(buid), size, (ulong) val);
-   } else {
-   ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, 
(ulong)val);
-   }
-
+   ret = rtas_write_raw_config(pdn->phb->buid, pdn->busno, pdn->devfn,
+   where, size, val);
if (ret)
return PCIBIOS_DEVICE_NOT_FOUND;
 
@@ -128,12 +155,20 @@ static int rtas_pci_write_config(struct pci_bus *bus,
 un

[PATCH v6 0/5] powerpc/powernv/pci: Make hotplug self-sufficient, independent of FW and DT

2019-08-16 Thread Sergey Miroshnichenko
Allow switching from the pnv_php module to the standard pciehp driver for
PowerNV, if the platform supports it: it can be a server working on top of
the skiboot with the [1] patchset applied.

Add the ability to discover hot-added devices which weren't added to the
Device Tree (by the pnv_php via an explicit OPAL call when a hotplug event
was intercepted) by direct access to the bus.

Sync the changes in PCIe topology (bus numbers and PEs) with the skiboot.

Tested on POWER8 PowerNV+PHB3 ppc64le (our Vesnin server) with:
 - the pciehp driver active;
 - the pnv_php driver disabled;
 - the "pci=pcie_bus_peer2peer,realloc" kernel command line argument;
 - controlled hotplug of a network card with SR-IOV works;
 - activating of SR-IOV on a network card works;
 - [with extra patches for movable BARs and bus numbers] manually initiated
   (via sysfs) rescan has found and turned on a hotplugged bridge.

[1] https://lists.ozlabs.org/pipermail/skiboot/2019-August/015140.html
[Skiboot] [PATCH v3 0/5] core/pci: Track changes of topology by an OS

Changes since v5:
 - Activates on "ibm,supported-movable-bdfs" property in DT from skiboot
   instead of the "pci=realloc" flag;
 - Removed the code refactoring patches - will send them separately.

Changes since v4:
 - Fixed failing build when EEH is disabled in a kernel config;
 - Unfreeze the bus on EEH_IO_ERROR_VALUE(size), not only 0xffffffff;
 - Replaced the 0xff magic constant with phb->ioda.reserved_pe_idx;
 - Renamed create_pdn() -> pci_create_pdn_from_dev();
 - Renamed add_one_dev_pci_data(..., vf_index, ...) -> pci_alloc_pdn();
 - Renamed add_dev_pci_data() -> pci_create_vf_pdns();
 - Renamed remove_dev_pci_data() -> pci_destroy_vf_pdns();
 - Removed the patch fixing uninitialized IOMMU group - now it is fixed in
   commit 8f5b27347e88 ("powerpc/powernv/sriov: Register IOMMU groups for
   VFs")

Changes since v3:
 - Subject changed;
 - Don't disable EEH during rescan anymore - instead just unfreeze the
   target buses deliberately;
 - Add synchronization with the firmware when changing the PCIe topology;
 - Fixed for VFs;
 - Code cleanup.

Changes since v2:
 - Don't reassign bus numbers on PowerNV by default (to retain the default
   behavior), but only when pci=realloc is passed;
 - Less code affected;
 - pci_add_device_node_info is refactored with add_one_dev_pci_data;
 - Minor code cleanup.

Changes since v1:
 - Fixed build for ppc64le and ppc64be when CONFIG_PCI_IOV is disabled;
 - Fixed build for ppc64e when CONFIG_EEH is disabled;
 - Fixed code style warnings.

Sergey Miroshnichenko (5):
  powerpc/pci: Access PCI config space directly w/o pci_dn
  powerpc/powernv/pci: Suppress an EEH error when reading an empty slot
  powerpc/pci: Create pci_dn on demand
  powerpc/powernv/pci: Hook up the writes to PCI_SECONDARY_BUS register
  powerpc/pci: Enable assigning bus numbers instead of reading them from
DT

 arch/powerpc/include/asm/ppc-pci.h   |   1 +
 arch/powerpc/kernel/pci_dn.c |  95 +++--
 arch/powerpc/kernel/rtas_pci.c   |  97 ++---
 arch/powerpc/platforms/powernv/eeh-powernv.c |   2 +-
 arch/powerpc/platforms/powernv/pci.c | 205 +--
 5 files changed, 331 insertions(+), 69 deletions(-)

-- 
2.21.0



Re: [PATCH v5 5/8] powerpc/pci/IOV: Add support for runtime enabling the VFs

2019-05-14 Thread Sergey Miroshnichenko
On 4/30/19 9:00 AM, Oliver O'Halloran wrote:
> On Mon, 2019-03-11 at 14:52 +0300, Sergey Miroshnichenko wrote:
> 
>> When called within pcibios_sriov_enable(), the pci_sriov_get_totalvfs(pdev)
>> returns zero, because the device is yet preparing to enable the VFs.
> 
> I don't think this is correct. The earliest pcibios_sriov_enable() can
> be called is during a driver probe function. The totalvfs field is
> initialised by pci_iov_init() which is called before the device has
> been added to the bus. If it's returning zero then maybe the driver
> limited the number of VFs to zero?
> 
> That said, you need to reset numvfs to zero before changing the value. 
> So limiting the number of pci_dns that are created to the number
> actually required rather than totalvfs doesn't hurt.
> 
>> With this patch it becomes possible to enable VFs via sysfs "sriov_numvfs"
>> on PowerNV.
> 
> I tested on a few of our lab systems with random kernel versions
> spanning from 4.15 to 5.0 and sriov_numvfs seemed to work fine on all
> of them. Is there a specific configuration you're testing that needed
> this change?
> 

Thanks a lot for the review and testing!

I've just received back the hardware (Mellanox ConnectX-4 -
drivers/net/ethernet/mellanox/mlx5), and got surprised: the issue with
pci_sriov_get_totalvfs(pdev) returning zero can't be reproduced anymore :/
I've rechecked the code and don't know how this could even have happened.
I'm sorry about that; if it happens again, I will have to investigate deeper.

The PCI subsystem doesn't allow the number of VFs to be changed from one
non-zero value to another: sriov_disable() must be called first. I guess we
can rely on that and not reset numvfs to zero explicitly.

I'll change the patch description and resend it in v6 with other fixes of this 
patchset.

Best regards,
Serge

>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  arch/powerpc/include/asm/pci-bridge.h |  4 +--
>>  arch/powerpc/kernel/pci_dn.c  | 32 ++-
>>  arch/powerpc/platforms/powernv/pci-ioda.c |  4 +--
>>  arch/powerpc/platforms/pseries/pci.c  |  4 +--
>>  4 files changed, 25 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/pci-bridge.h 
>> b/arch/powerpc/include/asm/pci-bridge.h
>> index fc188e0e9179..6479bc96e0b6 100644
>> --- a/arch/powerpc/include/asm/pci-bridge.h
>> +++ b/arch/powerpc/include/asm/pci-bridge.h
>> @@ -225,8 +225,8 @@ struct pci_dn {
>>  extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
>> int devfn);
>>  extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
>> -extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev);
>> -extern void remove_dev_pci_data(struct pci_dev *pdev);
>> +extern struct pci_dn *pci_create_vf_pdns(struct pci_dev *pdev, int num_vfs);
>> +extern void pci_destroy_vf_pdns(struct pci_dev *pdev);
>>  extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
>> struct device_node *dn);
>>  extern void pci_remove_device_node_info(struct device_node *dn);
>> diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
>> index 7f12882d8882..7fa362f8038d 100644
>> --- a/arch/powerpc/kernel/pci_dn.c
>> +++ b/arch/powerpc/kernel/pci_dn.c
>> @@ -222,18 +222,19 @@ static struct pci_dn *pci_create_pdn_from_dev(struct 
>> pci_dev *pdev,
>>  return pdn;
>>  }
>>  
>> -struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
>> +struct pci_dn *pci_create_vf_pdns(struct pci_dev *pdev, int num_vfs)
>>  {
>> +struct pci_dn *pdn = pci_get_pdn(pdev);
>> +
>>  #ifdef CONFIG_PCI_IOV
>> -struct pci_dn *parent, *pdn;
>> +struct pci_dn *parent;
>>  int i;
>>  
>>  /* Only support IOV for now */
>>  if (!pdev->is_physfn)
>> -return pci_get_pdn(pdev);
>> +return pdn;
>>  
>>  /* Check if VFs have been populated */
>> -pdn = pci_get_pdn(pdev);
>>  if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
>>  return NULL;
>>  
>> @@ -242,33 +243,38 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
>>  if (!parent)
>>  return NULL;
>>  
>> -for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
>> +for (i = 0; i < num_vfs; i++) {
>>  struct eeh_dev *edev __maybe_unused;
>> +struct pci_dn *vpdn;
>>  
>> -pdn = pci_alloc_pdn(parent,
>> -pci_iov_virtfn_

Re: [PATCH RFC v4 12/21] PCI: Don't allow hotplugged devices to steal resources

2019-03-27 Thread Sergey Miroshnichenko
On 3/26/19 11:55 PM, Bjorn Helgaas wrote:
> On Mon, Mar 11, 2019 at 04:31:13PM +0300, Sergey Miroshnichenko wrote:
>> When movable BARs are enabled, the PCI subsystem at first releases
>> all the bridge windows and then performs an attempt to assign new
>> requested resources and re-assign the existing ones.
> 
> s/performs an attempt/attempts/
> 
> I guess "new requested resources" means "resources to newly hotplugged
> devices"?
> 

Yes, that's exactly what I've tried to express :) Will rephrase that in v5.

>> If a hotplugged device gets its resources first, there could be no
>> space left to re-assign resources of already working devices, which
>> is unacceptable. If this happens, this patch marks one of the new
>> devices with the new introduced flag PCI_DEV_IGNORE and retries the
>> resource assignment.
>>
>> This patch adds a new res_mask bitmask to the struct pci_dev for
>> storing the indices of assigned resources.
>>
>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  drivers/pci/bus.c   |   5 ++
>>  drivers/pci/pci.h   |  11 +
>>  drivers/pci/probe.c | 100 +++-
>>  drivers/pci/setup-bus.c |  15 ++
>>  include/linux/pci.h |   1 +
>>  5 files changed, 130 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
>> index 5cb40b2518f9..a9784144d6f2 100644
>> --- a/drivers/pci/bus.c
>> +++ b/drivers/pci/bus.c
>> @@ -311,6 +311,11 @@ void pci_bus_add_device(struct pci_dev *dev)
>>  {
>>  int retval;
>>  
>> +if (pci_dev_is_ignored(dev)) {
>> +pci_warn(dev, "%s: don't enable the ignored device\n", 
>> __func__);
>> +return;
> 
> I'm not sure about this.  Even if we're unable to assign space for all
> the device's BARs, it still should respond to config accesses, and I
> think it should show up in sysfs and lspci.
> 

I agree, that would be better.

Also, this patch introduces a new issue to think about: how to recover BARs for
such devices when their neighbors have been removed and there is enough space
now.

>> +}
>> +
>>  /*
>>   * Can not put in pci_device_add yet because resources
>>   * are not assigned yet for some devices.
>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> index e06e8692a7b1..56b905068ac5 100644
>> --- a/drivers/pci/pci.h
>> +++ b/drivers/pci/pci.h
>> @@ -366,6 +366,7 @@ static inline bool pci_dev_is_disconnected(const struct 
>> pci_dev *dev)
>>  
>>  /* pci_dev priv_flags */
>>  #define PCI_DEV_ADDED 0
>> +#define PCI_DEV_IGNORE 1
>>  
>>  static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
>>  {
>> @@ -377,6 +378,16 @@ static inline bool pci_dev_is_added(const struct 
>> pci_dev *dev)
>>  return test_bit(PCI_DEV_ADDED, >priv_flags);
>>  }
>>  
>> +static inline void pci_dev_ignore(struct pci_dev *dev, bool ignore)
>> +{
>> +assign_bit(PCI_DEV_IGNORE, >priv_flags, ignore);
>> +}
>> +
>> +static inline bool pci_dev_is_ignored(const struct pci_dev *dev)
>> +{
>> +return test_bit(PCI_DEV_IGNORE, >priv_flags);
>> +}
>> +
>>  #ifdef CONFIG_PCIEAER
>>  #include 
>>  
>> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
>> index 692752c71f71..62f4058a001f 100644
>> --- a/drivers/pci/probe.c
>> +++ b/drivers/pci/probe.c
>> @@ -3248,6 +3248,23 @@ unsigned int pci_rescan_bus_bridge_resize(struct 
>> pci_dev *bridge)
>>  return max;
>>  }
>>  
>> +static unsigned int pci_dev_res_mask(struct pci_dev *dev)
>> +{
>> +unsigned int res_mask = 0;
>> +int i;
>> +
>> +for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) {
>> +struct resource *r = >resource[i];
>> +
>> +if (!r->flags || (r->flags & IORESOURCE_UNSET) || !r->parent)
>> +continue;
>> +
>> +res_mask |= (1 << i);
>> +}
>> +
>> +return res_mask;
>> +}
>> +
>>  static void pci_bus_rescan_prepare(struct pci_bus *bus)
>>  {
>>  struct pci_dev *dev;
>> @@ -3257,6 +3274,8 @@ static void pci_bus_rescan_prepare(struct pci_bus *bus)
>>  list_for_each_entry(dev, >devices, bus_list) {
>>  struct pci_bus *child = dev->subordinate;
>>  
>> +dev->res_mask = pci_dev_res_mask(dev);
>> +
>>  if (child) {
>>  pci_bus_re

Re: [PATCH RFC v4 11/21] PCI: Release and reassign the root bridge resources during rescan

2019-03-27 Thread Sergey Miroshnichenko
On 3/26/19 11:41 PM, Bjorn Helgaas wrote:
> On Mon, Mar 11, 2019 at 04:31:12PM +0300, Sergey Miroshnichenko wrote:
>> When the movable BARs feature is enabled, don't rely on the memory gaps
>> reserved by the BIOS/bootloader/firmware, but instead rearrange the BARs
>> and bridge windows starting from the root.
>>
>> Endpoint device's BARs, after being released, are resorted and written
>> back by the pci_assign_unassigned_root_bus_resources().
>>
>> The last step of writing the recalculated windows to the bridges is done
>> by the new pci_setup_bridges() function.
>>
>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  drivers/pci/pci.h   |  1 +
>>  drivers/pci/probe.c | 22 ++
>>  drivers/pci/setup-bus.c | 11 ++-
>>  3 files changed, 33 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> index 224d88634115..e06e8692a7b1 100644
>> --- a/drivers/pci/pci.h
>> +++ b/drivers/pci/pci.h
>> @@ -248,6 +248,7 @@ void __pci_bus_assign_resources(const struct pci_bus 
>> *bus,
>>  struct list_head *realloc_head,
>>  struct list_head *fail_head);
>>  bool pci_bus_clip_resource(struct pci_dev *dev, int idx);
>> +void pci_bus_release_root_bridge_resources(struct pci_bus *bus);
>>  
>>  void pci_reassigndev_resource_alignment(struct pci_dev *dev);
>>  void pci_disable_bridge_window(struct pci_dev *dev);
>> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
>> index 1cf6ec960236..692752c71f71 100644
>> --- a/drivers/pci/probe.c
>> +++ b/drivers/pci/probe.c
>> @@ -3299,6 +3299,25 @@ static void pci_bus_rescan_done(struct pci_bus *bus)
>>  pm_runtime_put(>dev);
>>  }
>>  
>> +static void pci_setup_bridges(struct pci_bus *bus)
>> +{
>> +struct pci_dev *dev;
>> +
>> +list_for_each_entry(dev, >devices, bus_list) {
>> +struct pci_bus *child;
>> +
>> +if (!pci_dev_is_added(dev) || pci_dev_is_ignored(dev))
>> +continue;
>> +
>> +child = dev->subordinate;
>> +if (child)
>> +pci_setup_bridges(child);
>> +}
>> +
>> +if (bus->self)
>> +pci_setup_bridge(bus);
>> +}
>> +
>>  /**
>>   * pci_rescan_bus - Scan a PCI bus for devices
>>   * @bus: PCI bus to scan
>> @@ -3321,8 +3340,11 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
>>  pci_bus_rescan_prepare(root);
>>  
>>  max = pci_scan_child_bus(root);
>> +
>> +pci_bus_release_root_bridge_resources(root);
>>  pci_assign_unassigned_root_bus_resources(root);
>>  
>> +pci_setup_bridges(root);
>>  pci_bus_rescan_done(root);
>>  } else {
>>  max = pci_scan_child_bus(bus);
>> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
>> index be7d4e6d7b65..36a1907d9509 100644
>> --- a/drivers/pci/setup-bus.c
>> +++ b/drivers/pci/setup-bus.c
>> @@ -1584,7 +1584,7 @@ static void pci_bridge_release_resources(struct 
>> pci_bus *bus,
>>  pci_printk(KERN_DEBUG, dev, "resource %d %pR released\n",
>>  PCI_BRIDGE_RESOURCES + idx, r);
>>  /* keep the old size */
>> -r->end = resource_size(r) - 1;
>> +r->end = pci_movable_bars_enabled() ? 0 : (resource_size(r) - 
>> 1);
> 
> Doesn't this mean we're throwing away the information about the BAR
> size, and we'll have to size the BAR again somewhere?  I would like to
> avoid that.  But I don't know yet where you rely on this, so maybe
> it's not possible to avoid it.
> 

This resource is not a BAR but a bridge window. I'm freeing it intentionally,
so that pbus_size_mem() can later recalculate a new size.

Serge

>>  r->start = 0;
>>  r->flags = 0;
>>  
>> @@ -1637,6 +1637,15 @@ static void pci_bus_release_bridge_resources(struct 
>> pci_bus *bus,
>>  pci_bridge_release_resources(bus, type);
>>  }
>>  
>> +void pci_bus_release_root_bridge_resources(struct pci_bus *root_bus)
>> +{
>> +pci_bus_release_bridge_resources(root_bus, IORESOURCE_IO, 
>> whole_subtree);
>> +pci_bus_release_bridge_resources(root_bus, IORESOURCE_MEM, 
>> whole_subtree);
>> +pci_bus_release_bridge_resources(root_bus,
>> + IORESOURCE_MEM_64 | 
>> IORESOURCE_PREFETCH,
>> + whole_subtree);
>> +}
>> +
>>  static void pci_bus_dump_res(struct pci_bus *bus)
>>  {
>>  struct resource *res;
>> -- 
>> 2.20.1
>>


Re: [PATCH RFC v4 09/21] PCI: Mark immovable BARs with PCI_FIXED

2019-03-27 Thread Sergey Miroshnichenko
On 3/27/19 8:03 PM, David Laight wrote:
> From: Bjorn Helgaas
>> Sent: 26 March 2019 20:29
>>
>> On Mon, Mar 11, 2019 at 04:31:10PM +0300, Sergey Miroshnichenko wrote:
>>> If a PCIe device driver doesn't yet have support for movable BARs,
>>> mark device's BARs with IORESOURCE_PCI_FIXED.
>>
>> I'm hesitant about using IORESOURCE_PCI_FIXED for this purpose.  That
>> was originally added to describe resources that can not be changed
>> because they're hardwired in the device, e.g., legacy resources and
>> Enhanced Allocation resources.
>>
>> In general, I think the bits in res->flags should tell us things about
>> the hardware.  This particular use would be something about the
>> *driver*, and I think we should figure that out by looking at
>> dev->driver.
> 
> There will also be drivers that don't support BARs being moved,
> but may be in a state (ie not actually open) where they can go
> through a remove-rescan sequence to allow the BAR be moved.
> 
> This might even be true if the open count is non-zero.
> 
>   David
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 
> 1PT, UK
> Registration No: 1397386 (Wales)
> 

This approach with IORESOURCE_PCI_FIXED was used because struct resource
doesn't have a pointer to its device (and so to its driver). But now that you
have mentioned it, I can see that every place where I use the FIXED flag to
mark immovable resources also has the corresponding struct pci_dev *dev nearby.

So, replacing every

if (r->flags & IORESOURCE_PCI_FIXED)

with

if (!dev->driver->rescan_prepare)

or something like

if (pci_dev_movable_bars_capable(dev))

will reduce this huge patchset a little, and will also make irrelevant a case
I had completely forgotten about: IORESOURCE_PCI_FIXED must be unset when
removing (rmmod) the "immovable" driver.

Thanks a lot! I'll rework the changes in this way and resend it as v5.

Serge


Re: [PATCH RFC v4 08/21] nvme-pci: Handle movable BARs

2019-03-27 Thread Sergey Miroshnichenko
On 3/26/19 11:20 PM, Bjorn Helgaas wrote:
> [+cc Keith, Jens, Christoph, Sagi, linux-nvme, LKML]
> 
> On Mon, Mar 11, 2019 at 04:31:09PM +0300, Sergey Miroshnichenko wrote:
>> Hotplugged devices can affect the existing ones by moving their BARs.
>> PCI subsystem will inform the NVME driver about this by invoking
>> reset_prepare()+reset_done(), then iounmap()+ioremap() must be called.
> 
> Do you mean the PCI core will invoke ->rescan_prepare() and
> ->rescan_done() (as opposed to *reset*)?
> 

Yes, of course, sorry for the confusion!

These are new callbacks, so that drivers can explicitly declare their support
for movable BARs, and the PCI core can detect when they don't and note that the
corresponding BARs can't be moved for now.

>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  drivers/nvme/host/pci.c | 29 +++--
>>  1 file changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 92bad1c810ac..ccea3033a67a 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -106,6 +106,7 @@ struct nvme_dev {
>>  unsigned int num_vecs;
>>  int q_depth;
>>  u32 db_stride;
>> +resource_size_t current_phys_bar;
>>  void __iomem *bar;
>>  unsigned long bar_mapped_size;
>>  struct work_struct remove_work;
>> @@ -1672,13 +1673,16 @@ static int nvme_remap_bar(struct nvme_dev *dev, 
>> unsigned long size)
>>  {
>>  struct pci_dev *pdev = to_pci_dev(dev->dev);
>>  
>> -if (size <= dev->bar_mapped_size)
>> +if (dev->bar &&
>> +dev->current_phys_bar == pci_resource_start(pdev, 0) &&
>> +size <= dev->bar_mapped_size)
>>  return 0;
>>  if (size > pci_resource_len(pdev, 0))
>>  return -ENOMEM;
>>  if (dev->bar)
>>  iounmap(dev->bar);
>> -dev->bar = ioremap(pci_resource_start(pdev, 0), size);
>> +dev->current_phys_bar = pci_resource_start(pdev, 0);
>> +dev->bar = ioremap(dev->current_phys_bar, size);
> 
> dev->current_phys_bar is different from pci_resource_start() in the
> case where the PCI core has moved the nvme BAR, but nvme has not yet
> remapped it.
> 
> I'm not sure it's worth keeping track of current_phys_bar, as opposed
> to always unmapping and remapping.  Is this a performance path?  I
> think there are advantages to always exercising the same code path,
> regardless of whether the BAR happened to be moved, e.g., if there's a
> bug in the "BAR moved" path, it may be a heisenbug because whether we
> exercise that path depends on the current configuration.
> 
> If you do need to cache current_phys_bar, maybe this, so it's a little
> easier to see that you're not changing the ioremap() itself:
> 
>   dev->bar = ioremap(pci_resource_start(pdev, 0), size);
>   dev->current_phys_bar = pci_resource_start(pdev, 0);
> 

Oh, I see now. A rescan is a rather rare event, and unconditional remapping is
simpler, and so a bit more resistant to bugs.

>>  if (!dev->bar) {
>>  dev->bar_mapped_size = 0;
>>  return -ENOMEM;
>> @@ -2504,6 +2508,8 @@ static void nvme_reset_work(struct work_struct *work)
>>  if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
>>  goto out;
>>  
>> +nvme_remap_bar(dev, db_bar_size(dev, 0));
> 
> How is this change connected to rescan?  This looks reset-related.
> 

Thanks for catching that! This has also slipped in from an early stage of this
patchset, when reset_done() (which is rescan_done() now) just initiated an NVME
reset.

Best regards,
Serge

>>  /*
>>   * If we're called to reset a live controller first shut it down before
>>   * moving on.
>> @@ -2910,6 +2916,23 @@ static void nvme_error_resume(struct pci_dev *pdev)
>>  flush_work(>ctrl.reset_work);
>>  }
>>  
>> +void nvme_rescan_prepare(struct pci_dev *pdev)
>> +{
>> +struct nvme_dev *dev = pci_get_drvdata(pdev);
>> +
>> +nvme_dev_disable(dev, false);
>> +nvme_dev_unmap(dev);
>> +dev->bar = NULL;
>> +}
>> +
>> +void nvme_rescan_done(struct pci_dev *pdev)
>> +{
>> +struct nvme_dev *dev = pci_get_drvdata(pdev);
>> +
>> +nvme_dev_map(dev);
>> +nvme_reset_ctrl_sync(>ctrl);
>> +}
>> +
>>  static const struct pci_error_handlers nvme_err_handler = {
>>  .error_detected = nvme_error_detected,
>>  .slot_reset = nvme_slot_reset,
>> @@ -2974,6 +2997,8 @@ static struct pci_driver nvme_driver = {
>>  },
>>  .sriov_configure = pci_sriov_configure_simple,
>>  .err_handler= _err_handler,
>> +.rescan_prepare = nvme_rescan_prepare,
>> +.rescan_done= nvme_rescan_done,
>>  };
>>  
>>  static int __init nvme_init(void)
>> -- 
>> 2.20.1
>>


Re: [PATCH RFC v4 05/21] PCI: hotplug: Add a flag for the movable BARs feature

2019-03-27 Thread Sergey Miroshnichenko
On 3/26/19 10:24 PM, Bjorn Helgaas wrote:
> On Mon, Mar 11, 2019 at 04:31:06PM +0300, Sergey Miroshnichenko wrote:
>> If a new PCIe device has been hot-plugged between the two active ones
>> without big enough gap between their BARs, 
> 
> Just to speak precisely here, a hot-added device is not "between" two
> active ones because the new device has zeros in its BARs.
> 
> BARs from different devices can be interleaved arbitrarily, subject to
> bridge window constraints, so we can really only speak about a *BAR*
> (not the entire device) being between two other BARs.
> 
> Also, I don't think there's anything here that is PCIe-specific, so we
> should talk about "PCI", not "PCIe".
> 

I agree, that should be rephrased. This patchset intends to solve the problem
where a bridge window is not big enough (or is too fragmented) to fit new BARs,
and can't be expanded enough because it is blocked by "neighboring" BARs.

>> these BARs should be moved
>> if their drivers support this feature. The drivers should be notified
>> and paused during the procedure:
>>
>> 1) dev 8 (new)
>>|
>>v
>> .. |  dev 3  |  dev 3  |  dev 5  |  dev 7  |
>> .. |  BAR 0  |  BAR 1  |  BAR 0  |  BAR 0  |
>>
>> 2) dev 8
>>  |
>>  v
>> .. |  dev 3  |  dev 3  | -->   --> |  dev 5  |  dev 7  |
>> .. |  BAR 0  |  BAR 1  | -->   --> |  BAR 0  |  BAR 0  |
>>
>>  3)
>>
>> .. |  dev 3  |  dev 3  |  dev 8  |  dev 8  |  dev 5  |  dev 7  |
>> .. |  BAR 0  |  BAR 1  |  BAR 0  |  BAR 1  |  BAR 0  |  BAR 0  |
>>
>> Thus, prior reservation of memory regions by BIOS/bootloader/firmware
>> is not required anymore for the PCIe hotplug.
>>
>> The PCI_MOVABLE_BARS flag is set by the platform if this feature is
>> supported and tested, but can be overridden by the following command
>> line option:
>> pcie_movable_bars={ off | force }
> 
> A chicken switch to turn this functionality off is OK, but I think it
> should be enabled by default.  There isn't anything about this that's
> platform-specific, is there?
> 

I'm a bit hesitant to assume that; I was once surprised that bus numbers can't
be assigned arbitrarily on some platforms [1], so there are probably some
similar restrictions on BARs too.

I was going to propose adding pci_add_flags(PCI_MOVABLE_BARS) into
arch/.../init.c for tested platforms, so that fewer people would be upset by
their BARs suddenly breaking. But this logic can be reversed:
pci_clear_flags(PCI_MOVABLE_BARS) for platforms where movable BARs can't work.

Serge

[1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2018-September/178103.html

>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  .../admin-guide/kernel-parameters.txt |  7 ++
>>  drivers/pci/pci.c | 24 +++
>>  include/linux/pci.h   |  2 ++
>>  3 files changed, 33 insertions(+)
>>
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
>> b/Documentation/admin-guide/kernel-parameters.txt
>> index 2b8ee90bb644..d40eaf993f80 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -3417,6 +3417,13 @@
>>  nomsi   Do not use MSI for native PCIe PME signaling (this makes
>>  all PCIe root ports use INTx for all services).
>>  
>> +pcie_movable_bars=[PCIE]
>> +Override the movable BARs support detection:
>> +off
>> +Disable even if supported by the platform
>> +force
>> +Enable even if not explicitly declared as supported
>> +
>>  pcmv=   [HW,PCMCIA] BadgePAD 4
>>  
>>  pd_ignore_unused
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 69898fe5255e..4dac49a887ec 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -139,6 +139,30 @@ static int __init pcie_port_pm_setup(char *str)
>>  }
>>  __setup("pcie_port_pm=", pcie_port_pm_setup);
>>  
>> +static bool pcie_movable_bars_off;
>> +static bool pcie_movable_bars_force;
>> +static int __init pcie_movable_bars_setup(char *str)
>> +{
>> +if (!strcmp(str, "off"))
>> +pcie_movable_bars_off = true;
>> +else if (!strcmp(str, "force"))
>> +pcie_movable_bars_force = tru

Re: [PATCH RFC v4 03/21] PCI: Enable bridge's I/O and MEM access for hotplugged devices

2019-03-27 Thread Sergey Miroshnichenko
On 3/26/19 10:13 PM, Bjorn Helgaas wrote:
> On Mon, Mar 11, 2019 at 04:31:04PM +0300, Sergey Miroshnichenko wrote:
>> After updating the bridge window resources, the PCI_COMMAND_IO and
>> PCI_COMMAND_MEMORY bits of the bridge must be addressed as well.
>>
>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  drivers/pci/pci.c | 8 
>>  1 file changed, 8 insertions(+)
>>
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 895201d4c9e6..69898fe5255e 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -1622,6 +1622,14 @@ static void pci_enable_bridge(struct pci_dev *dev)
>>  pci_enable_bridge(bridge);
>>  
>>  if (pci_is_enabled(dev)) {
>> +int i, bars = 0;
>> +
>> +for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++) {
>> +if (dev->resource[i].flags & (IORESOURCE_MEM | 
>> IORESOURCE_IO))
>> +bars |= (1 << i);
>> +}
>> +do_pci_enable_device(dev, bars);
> 
> In what situation is this needed, exactly?  This code already exists
> in pci_enable_device_flags().  Why isn't that enough?
> 
> I guess maybe there's some case where we enable the bridge, then
> assign bridge windows, then enable a downstream device?
> 
> Does this fix a bug with current hotplug?
> 

Sure, this change was implemented because of the issue: 
pci_enable_device_flags() returns
early if the device is already pci_is_enabled(), so if a bridge was already 
enabled before
the hotplug event, but without MEM and/or IO being set, these bits will not be 
set even if
a new device wants them.

I've chosen the pci_enable_bridge() for this snippet because it recursively 
updates all
the parent bridges.

Serge

>>  if (!dev->is_busmaster)
>>  pci_set_master(dev);
>>  mutex_unlock(&dev->enable_mutex);
>> -- 
>> 2.20.1
>>


Re: [PATCH RFC v4 02/21] PCI: Fix race condition in pci_enable/disable_device()

2019-03-27 Thread Sergey Miroshnichenko
On 3/26/19 10:00 PM, Bjorn Helgaas wrote:
> [+cc Srinath, Marta, LKML]
> 
> On Mon, Mar 11, 2019 at 04:31:03PM +0300, Sergey Miroshnichenko wrote:
>>  CPU0  CPU1
>>
>>  pci_enable_device_mem()   pci_enable_device_mem()
>>pci_enable_bridge()   pci_enable_bridge()
>>  pci_is_enabled()
>>return false;
>>  atomic_inc_return(enable_cnt)
>>  Start actual enabling the bridge
>>  ...   pci_is_enabled()
>>  ... return true;
>>  ...   Start memory requests <-- FAIL
>>  ...
>>  Set the PCI_COMMAND_MEMORY bit <-- Must wait for this
>>
>> This patch protects the pci_enable/disable_device() and pci_enable_bridge()
>> with mutexes.
> 
> This is a subtle issue that we've tried to fix before, but we've never
> had a satisfactory solution, so I hope you've figured out the right
> fix.
> 
> I'll include some links to previous discussion.  This patch is very
> similar to [2], which we didn't actually apply.  We did apply the
> patch from [3] as 40f11adc7cd9 ("PCI: Avoid race while enabling
> upstream bridges"), but it caused the regressions reported in [4,5],
> so we reverted it with 0f50a49e3008 ("Revert "PCI: Avoid race while
> enabling upstream bridges"").
> 

Thanks for the links, I wasn't aware of these discussions and patches!

On PowerNV this issue is partially hidden by db2173198b95 
("powerpc/powernv/pci: Work
around races in PCI bridge enabling"), and on x86 BIOS pre-initializes all the 
bridges, so
it doesn't reproduce until hotplugging in a hotplugged bridge.

This patch is indeed similar to 40f11adc7cd9 ("PCI: Avoid race while enabling 
upstream
bridges"), but instead of a single static mutex it adds per-device mutexes and 
prevents
the dev->enable_cnt from incrementing too early. So it's not needed anymore to 
carefully
select a moment safe enough to enable the device.

Serge

> I think the underlying design problem is that we have a driver for
> device B calling pci_enable_device(), and it is changing the state of
> device A (an upstream bridge).  The model generally is that a driver
> should only touch the device it is bound to.
> 
> It's tricky to get the locking right when several children of device A
> all need to operate on A.
> 
> That's all to say I'll have to think carefully about this particular
> patch, so I'll go on to the others and come back to this one.
> 
> Bjorn
> 
> [1] 
> https://lore.kernel.org/linux-pci/1494256190-28993-1-git-send-email-srinath.man...@broadcom.com/T/#u
> [RFC PATCH] pci: Concurrency issue in NVMe Init through PCIe switch
> 
> [2] 
> https://lore.kernel.org/linux-pci/1496135297-19680-1-git-send-email-srinath.man...@broadcom.com/T/#u
> [RFC PATCH v2] pci: Concurrency issue in NVMe Init through PCIe switch
> 
> [3] 
> https://lore.kernel.org/linux-pci/1501858648-8-1-git-send-email-srinath.man...@broadcom.com/T/#u
> [RFC PATCH v3] pci: Concurrency issue during pci enable bridge
> 
> [4] 
> https://lore.kernel.org/linux-pci/150547971091.977464.16294045866179907260.stgit@buzz/T/#u
> [PATCH bisected regression in 4.14] PCI: fix race while enabling upstream 
> bridges concurrently
> 
> [5] 
> https://lore.kernel.org/linux-wireless/04c9b578-693c-1dc6-9f0f-904580231...@kernel.dk/T/#u
> iwlwifi firmware load broken in current -git
> 
> [6] 
> https://lore.kernel.org/linux-pci/744877924.5841545.1521630049567.javamail.zim...@kalray.eu/T/#u
> [RFC PATCH] nvme: avoid race-conditions when enabling devices
> 
>> Signed-off-by: Sergey Miroshnichenko 
>> ---
>>  drivers/pci/pci.c   | 26 ++
>>  drivers/pci/probe.c |  1 +
>>  include/linux/pci.h |  1 +
>>  3 files changed, 24 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index f006068be209..895201d4c9e6 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -1615,6 +1615,8 @@ static void pci_enable_bridge(struct pci_dev *dev)
>>  struct pci_dev *bridge;
>>  int retval;
>>  
>> +mutex_lock(&dev->enable_mutex);
>> +
>>  bridge = pci_upstream_bridge(dev);
>>  if (bridge)
>>  pci_enable_bridge(bridge);
>> @@ -1622,6 +1624,7 @@ static void pci_enable_bridge(struct pci_dev *dev)
>>  if (pci_is_enabled(dev)) {
>>  if (!dev->is_busmaster)
>>  pci_set_master(dev);
>> +mutex_unlock(&

[PATCH RFC v4 21/21] powerpc/pci: Fix crash with enabled movable BARs

2019-03-11 Thread Sergey Miroshnichenko
Check a resource for the UNSET flags.

Signed-off-by: Sergey Miroshnichenko 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index fa6af52b5219..353b36727f6a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2977,7 +2977,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
int index;
int64_t rc;
 
-   if (!res || !res->flags || res->start > res->end)
+   if (!res || !res->flags || res->start > res->end ||
+   (res->flags & IORESOURCE_UNSET))
return;
 
if (res->flags & IORESOURCE_IO) {
-- 
2.20.1



[PATCH RFC v4 19/21] PCI: Prioritize fixed BAR assigning over the movable ones

2019-03-11 Thread Sergey Miroshnichenko
The allocated bridge windows are big enough to house all the children
bridges and BARs, but the fixed resources must be assigned first, so the
movable ones later divide the rest of the window. That's the assignment
order:

 1. Bridge windows with fixed areas;
 2. Fixed BARs;
 3. The rest of BARs and bridge windows.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 69 -
 1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index f4737339d5ec..932a6c020d10 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -272,31 +272,54 @@ static void reassign_resources_sorted(struct list_head 
*realloc_head,
}
 }
 
-/**
- * assign_requested_resources_sorted() - satisfy resource requests
- *
- * @head : head of the list tracking requests for resources
- * @fail_head : head of the list tracking requests that could
- * not be allocated
- *
- * Satisfy resource requests of each element in the list. Add
- * requests that could not satisfied to the failed_list.
- */
-static void assign_requested_resources_sorted(struct list_head *head,
-struct list_head *fail_head)
+enum assign_step {
+   assign_fixed_bridge_windows,
+   assign_fixed_resources,
+   assign_float_resources,
+};
+
+static void _assign_requested_resources_sorted(struct list_head *head,
+  struct list_head *fail_head,
+  enum assign_step step)
 {
struct resource *res;
struct pci_dev_resource *dev_res;
int idx;
 
list_for_each_entry(dev_res, head, list) {
+   bool is_fixed;
+   bool is_fixed_bridge;
+   bool is_bridge;
+
if (pci_dev_is_ignored(dev_res->dev))
continue;
 
res = dev_res->res;
+   if (!resource_size(res))
+   continue;
+
idx = res - &dev_res->dev->resource[0];
-   if (resource_size(res) &&
-   pci_assign_resource(dev_res->dev, idx)) {
+   is_fixed = res->flags & IORESOURCE_PCI_FIXED;
+   is_bridge = dev_res->dev->subordinate && idx >= 
PCI_BRIDGE_RESOURCES;
+
+   if (is_bridge) {
+   struct pci_bus *child = dev_res->dev->subordinate;
+   int b_res_idx = pci_get_bridge_resource_idx(res);
+   struct resource *fixed_res = 
&child->fixed_range_hard[b_res_idx];
+
+   is_fixed_bridge = fixed_res->start < fixed_res->end;
+   } else {
+   is_fixed_bridge = false;
+   }
+
+   if (assign_fixed_bridge_windows == step && !is_fixed_bridge)
+   continue;
+   else if (assign_fixed_resources == step && (!is_fixed || 
is_bridge))
+   continue;
+   else if (assign_float_resources == step && (is_fixed || 
is_fixed_bridge))
+   continue;
+
+   if (pci_assign_resource(dev_res->dev, idx)) {
if (fail_head) {
/*
 * if the failed res is for ROM BAR, and it will
@@ -315,6 +338,24 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
}
 }
 
+/**
+ * assign_requested_resources_sorted() - satisfy resource requests
+ *
+ * @head : head of the list tracking requests for resources
+ * @fail_head : head of the list tracking requests that could
+ * not be allocated
+ *
+ * Satisfy resource requests of each element in the list. Add
+ * requests that could not satisfied to the failed_list.
+ */
+static void assign_requested_resources_sorted(struct list_head *head,
+ struct list_head *fail_head)
+{
+   _assign_requested_resources_sorted(head, fail_head, 
assign_fixed_bridge_windows);
+   _assign_requested_resources_sorted(head, fail_head, 
assign_fixed_resources);
+   _assign_requested_resources_sorted(head, fail_head, 
assign_float_resources);
+}
+
 static unsigned long pci_fail_res_type_mask(struct list_head *fail_head)
 {
struct pci_dev_resource *fail_res;
-- 
2.20.1



[PATCH RFC v4 18/21] PCI: Make sure bridge windows include their fixed BARs

2019-03-11 Thread Sergey Miroshnichenko
Consider previously calculated boundaries when allocating a bridge
window, setting the lowest allowed address and checking the result.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/bus.c   |  2 +-
 drivers/pci/setup-res.c | 31 +--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index a9784144d6f2..ce2d2aeedbd3 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -192,7 +192,7 @@ static int pci_bus_alloc_from_region(struct pci_bus *bus, 
struct resource *res,
 * this is an already-configured bridge window, its start
 * overrides "min".
 */
-   if (avail.start)
+   if (min_used < avail.start)
min_used = avail.start;
 
max = avail.end;
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 732d18f60f1b..04442339548d 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -248,9 +248,22 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
struct resource *res = dev->resource + resno;
resource_size_t min;
int ret;
+   resource_size_t start = (resource_size_t)-1;
+   resource_size_t end = 0;
 
min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM;
 
+   if (dev->subordinate && resno >= PCI_BRIDGE_RESOURCES) {
+   struct pci_bus *child_bus = dev->subordinate;
+   int b_resno = resno - PCI_BRIDGE_RESOURCES;
+   resource_size_t soft_start = 
child_bus->fixed_range_soft[b_resno].start;
+
+   start = child_bus->fixed_range_hard[b_resno].start;
+   end = child_bus->fixed_range_hard[b_resno].end;
+   if (start < end)
+   min = soft_start;
+   }
+
/*
 * First, try exact prefetching match.  Even if a 64-bit
 * prefetchable bridge window is below 4GB, we can't put a 32-bit
@@ -262,7 +275,7 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
 IORESOURCE_PREFETCH | IORESOURCE_MEM_64,
 pcibios_align_resource, dev);
if (ret == 0)
-   return 0;
+   goto check_fixed;
 
/*
 * If the prefetchable window is only 32 bits wide, we can put
@@ -274,7 +287,7 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
 IORESOURCE_PREFETCH,
 pcibios_align_resource, dev);
if (ret == 0)
-   return 0;
+   goto check_fixed;
}
 
/*
@@ -287,6 +300,20 @@ static int __pci_assign_resource(struct pci_bus *bus, 
struct pci_dev *dev,
ret = pci_bus_alloc_resource(bus, res, size, align, min, 0,
 pcibios_align_resource, dev);
 
+check_fixed:
+   if (ret == 0 && start < end) {
+   if (res->start > start || res->end < end) {
+   dev_err(>dev, "%s: fixed area 0x%llx-0x%llx for %s 
doesn't fit in the allocated %pR (0x%llx-0x%llx)",
+   __func__,
+   (unsigned long long)start, (unsigned long 
long)end,
+   dev_name(>dev),
+   res, (unsigned long long)res->start,
+   (unsigned long long)res->end);
+   release_resource(res);
+   return -1;
+   }
+   }
+
return ret;
 }
 
-- 
2.20.1



[PATCH RFC v4 20/21] PCI: pciehp: Add support for the movable BARs feature

2019-03-11 Thread Sergey Miroshnichenko
With movable BARs, adding a hotplugged device may affect all the PCIe
domain starting from the root, so use a pci_rescan_bus() function which
handles the rearrangement of existing BARs and bridge windows.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/hotplug/pciehp_pci.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c
index b9c1396db6fe..7c0871db5bae 100644
--- a/drivers/pci/hotplug/pciehp_pci.c
+++ b/drivers/pci/hotplug/pciehp_pci.c
@@ -56,12 +56,16 @@ int pciehp_configure_device(struct controller *ctrl)
goto out;
}
 
-   for_each_pci_bridge(dev, parent)
-   pci_hp_add_bridge(dev);
+   if (pci_movable_bars_enabled()) {
+   pci_rescan_bus(parent);
+   } else {
+   for_each_pci_bridge(dev, parent)
+   pci_hp_add_bridge(dev);
 
-   pci_assign_unassigned_bridge_resources(bridge);
-   pcie_bus_configure_settings(parent);
-   pci_bus_add_devices(parent);
+   pci_assign_unassigned_bridge_resources(bridge);
+   pcie_bus_configure_settings(parent);
+   pci_bus_add_devices(parent);
+   }
 
  out:
pci_unlock_rescan_remove();
-- 
2.20.1



[PATCH RFC v4 12/21] PCI: Don't allow hotplugged devices to steal resources

2019-03-11 Thread Sergey Miroshnichenko
When movable BARs are enabled, the PCI subsystem at first releases
all the bridge windows and then performs an attempt to assign new
requested resources and re-assign the existing ones.

If a hotplugged device gets its resources first, there could be no
space left to re-assign resources of already working devices, which
is unacceptable. If this happens, this patch marks one of the new
devices with the new introduced flag PCI_DEV_IGNORE and retries the
resource assignment.

This patch adds a new res_mask bitmask to the struct pci_dev for
storing the indices of assigned resources.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/bus.c   |   5 ++
 drivers/pci/pci.h   |  11 +
 drivers/pci/probe.c | 100 +++-
 drivers/pci/setup-bus.c |  15 ++
 include/linux/pci.h |   1 +
 5 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 5cb40b2518f9..a9784144d6f2 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -311,6 +311,11 @@ void pci_bus_add_device(struct pci_dev *dev)
 {
int retval;
 
+   if (pci_dev_is_ignored(dev)) {
+   pci_warn(dev, "%s: don't enable the ignored device\n", 
__func__);
+   return;
+   }
+
/*
 * Can not put in pci_device_add yet because resources
 * are not assigned yet for some devices.
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index e06e8692a7b1..56b905068ac5 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -366,6 +366,7 @@ static inline bool pci_dev_is_disconnected(const struct 
pci_dev *dev)
 
 /* pci_dev priv_flags */
 #define PCI_DEV_ADDED 0
+#define PCI_DEV_IGNORE 1
 
 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
 {
@@ -377,6 +378,16 @@ static inline bool pci_dev_is_added(const struct pci_dev 
*dev)
return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
 }
 
+static inline void pci_dev_ignore(struct pci_dev *dev, bool ignore)
+{
+   assign_bit(PCI_DEV_IGNORE, &dev->priv_flags, ignore);
+}
+
+static inline bool pci_dev_is_ignored(const struct pci_dev *dev)
+{
+   return test_bit(PCI_DEV_IGNORE, &dev->priv_flags);
+}
+
 #ifdef CONFIG_PCIEAER
 #include <linux/aer.h>
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 692752c71f71..62f4058a001f 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3248,6 +3248,23 @@ unsigned int pci_rescan_bus_bridge_resize(struct pci_dev 
*bridge)
return max;
 }
 
+static unsigned int pci_dev_res_mask(struct pci_dev *dev)
+{
+   unsigned int res_mask = 0;
+   int i;
+
+   for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) {
+   struct resource *r = &dev->resource[i];
+
+   if (!r->flags || (r->flags & IORESOURCE_UNSET) || !r->parent)
+   continue;
+
+   res_mask |= (1 << i);
+   }
+
+   return res_mask;
+}
+
 static void pci_bus_rescan_prepare(struct pci_bus *bus)
 {
struct pci_dev *dev;
@@ -3257,6 +3274,8 @@ static void pci_bus_rescan_prepare(struct pci_bus *bus)
list_for_each_entry(dev, &bus->devices, bus_list) {
struct pci_bus *child = dev->subordinate;
 
+   dev->res_mask = pci_dev_res_mask(dev);
+
if (child) {
pci_bus_rescan_prepare(child);
} else if (dev->driver &&
@@ -3318,6 +3337,84 @@ static void pci_setup_bridges(struct pci_bus *bus)
pci_setup_bridge(bus);
 }
 
+static struct pci_dev *pci_find_next_new_device(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   if (!bus)
+   return NULL;
+
+   list_for_each_entry(dev, &bus->devices, bus_list) {
+   struct pci_bus *child_bus = dev->subordinate;
+
+   if (!pci_dev_is_added(dev) && !pci_dev_is_ignored(dev))
+   return dev;
+
+   if (child_bus) {
+   struct pci_dev *next_new_dev;
+
+   next_new_dev = pci_find_next_new_device(child_bus);
+   if (next_new_dev)
+   return next_new_dev;
+   }
+   }
+
+   return NULL;
+}
+
+static bool pci_bus_validate_resources(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   bool ret = true;
+
+   if (!bus)
+   return false;
+
+   list_for_each_entry(dev, &bus->devices, bus_list) {
+   struct pci_bus *child = dev->subordinate;
+   unsigned int res_mask = pci_dev_res_mask(dev);
+
+   if (pci_dev_is_ignored(dev))
+   continue;
+
+   if (dev->res_mask & ~res_mask) {
+   pci_err(dev, "%s: Non-re-enabled resources found: 0x%x 
-> 0x%x\n",
+   __func__, dev->res_mask, res_mask);
+   ret = false;
+

[PATCH RFC v4 16/21] PCI: Calculate fixed areas of bridge windows based on fixed BARs

2019-03-11 Thread Sergey Miroshnichenko
For every (IO, MEM, MEM64) bridge window, count the fixed resources of
its children endpoints and children bridge windows:

| <- BAR -> || <- child bus fixed_range_hard -> |   | <- fixed BAR -> |
 | <-bus's fixed_range_hard-> |
| <-   bus's bridge window -> |

These ranges will be later used to arrange bridge windows in a way which
covers every immovable BAR as well as the movable ones during hotplug.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   | 14 +++
 drivers/pci/probe.c | 82 +
 drivers/pci/setup-bus.c | 17 +
 include/linux/pci.h |  6 +++
 4 files changed, 119 insertions(+)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 56b905068ac5..14e3ebe68010 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -364,6 +364,20 @@ static inline bool pci_dev_is_disconnected(const struct 
pci_dev *dev)
return dev->error_state == pci_channel_io_perm_failure;
 }
 
+static inline int pci_get_bridge_resource_idx(struct resource *r)
+{
+   int idx = 1;
+
+   if (r->flags & IORESOURCE_IO)
+   idx = 0;
+   else if (!(r->flags & IORESOURCE_PREFETCH))
+   idx = 1;
+   else if (r->flags & IORESOURCE_MEM_64)
+   idx = 2;
+
+   return idx;
+}
+
 /* pci_dev priv_flags */
 #define PCI_DEV_ADDED 0
 #define PCI_DEV_IGNORE 1
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 62f4058a001f..70b15654f253 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -551,6 +551,7 @@ void pci_read_bridge_bases(struct pci_bus *child)
 static struct pci_bus *pci_alloc_bus(struct pci_bus *parent)
 {
struct pci_bus *b;
+   int idx;
 
b = kzalloc(sizeof(*b), GFP_KERNEL);
if (!b)
@@ -567,6 +568,11 @@ static struct pci_bus *pci_alloc_bus(struct pci_bus 
*parent)
if (parent)
b->domain_nr = parent->domain_nr;
 #endif
+   for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) {
+   b->fixed_range_hard[idx].start = (resource_size_t)-1;
+   b->fixed_range_hard[idx].end = 0;
+   }
+
return b;
 }
 
@@ -3337,6 +3343,81 @@ static void pci_setup_bridges(struct pci_bus *bus)
pci_setup_bridge(bus);
 }
 
+static void pci_bus_update_fixed_range_hard(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   int idx;
+   resource_size_t start, end;
+
+   for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) {
+   bus->fixed_range_hard[idx].start = (resource_size_t)-1;
+   bus->fixed_range_hard[idx].end = 0;
+   }
+
+   list_for_each_entry(dev, &bus->devices, bus_list)
+   if (dev->subordinate)
+   pci_bus_update_fixed_range_hard(dev->subordinate);
+
+   list_for_each_entry(dev, &bus->devices, bus_list) {
+   int i;
+
+   for (i = 0; i < PCI_BRIDGE_RESOURCES; ++i) {
+   struct resource *r = &dev->resource[i];
+
+   if (!r->flags || (r->flags & IORESOURCE_UNSET) || 
!r->parent)
+   continue;
+
+   if (r->flags & IORESOURCE_PCI_FIXED) {
+   idx = pci_get_bridge_resource_idx(r);
+   start = bus->fixed_range_hard[idx].start;
+   end = bus->fixed_range_hard[idx].end;
+
+   if (start > r->start)
+   start = r->start;
+   if (end < r->end)
+   end = r->end;
+
+   if (bus->fixed_range_hard[idx].start != start ||
+   bus->fixed_range_hard[idx].end != end) {
+   dev_dbg(>dev, "%s: Found fixed 
0x%llx-0x%llx in %s, expand the fixed bridge window %d to 0x%llx-0x%llx\n",
+   __func__,
+   (unsigned long long)r->start,
+   (unsigned long long)r->end,
+   dev_name(>dev), idx,
+   (unsigned long long)start,
+   (unsigned long long)end);
+   bus->fixed_range_hard[idx].start = 
start;
+   bus->fixed_range_hard[idx].end = end;
+   }
+   }
+   }
+
+   if (dev->subordinate) {
+   struct pci_bus *child = dev->subordinate;
+
+

[PATCH RFC v4 17/21] PCI: Calculate boundaries for bridge windows

2019-03-11 Thread Sergey Miroshnichenko
If a bridge window contains fixed areas (there are PCIe devices with
immovable BARs located on this bus), this window must be allocated
within the bound memory area, limited by windows size and by address
range of fixed resources, calculated as follows:

   | <-- bus's fixed_range_hard   --> |
  | <--  fixed_range_hard.end - window size   --> |
   | <--  fixed_range_hard.start + window size   --> |
  | <--bus's fixed_range_soft--> |

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 56 +
 include/linux/pci.h |  4 ++-
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a1fd7f3c5ea8..f4737339d5ec 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1809,6 +1809,61 @@ static enum enable_type pci_realloc_detect(struct 
pci_bus *bus,
 }
 #endif
 
+static void pci_bus_update_fixed_range_soft(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+   struct pci_bus *parent = bus->parent;
+   int idx;
+
+   list_for_each_entry(dev, &bus->devices, bus_list)
+   if (dev->subordinate)
+   pci_bus_update_fixed_range_soft(dev->subordinate);
+
+   if (!parent || !bus->self)
+   return;
+
+   for (idx = 0; idx < ARRAY_SIZE(bus->fixed_range_hard); ++idx) {
+   struct resource *r;
+   resource_size_t soft_start, soft_end;
+   resource_size_t hard_start = bus->fixed_range_hard[idx].start;
+   resource_size_t hard_end = bus->fixed_range_hard[idx].end;
+
+   if (hard_start > hard_end)
+   continue;
+
+   r = bus->resource[idx];
+
+   soft_start = hard_end - resource_size(r) + 1;
+   soft_end = hard_start + resource_size(r) - 1;
+
+   if (soft_start > hard_start)
+   soft_start = hard_start;
+
+   if (soft_end < hard_end)
+   soft_end = hard_end;
+
+   list_for_each_entry(dev, &parent->devices, bus_list) {
+   struct pci_bus *sibling = dev->subordinate;
+   resource_size_t s_start, s_end;
+
+   if (!sibling || sibling == bus)
+   continue;
+
+   s_start = sibling->fixed_range_hard[idx].start;
+   s_end = sibling->fixed_range_hard[idx].end;
+
+   if (s_start > s_end)
+   continue;
+
+   if (s_end < hard_start && s_end > soft_start)
+   soft_start = s_end;
+   }
+
+   bus->fixed_range_soft[idx].start = soft_start;
+   bus->fixed_range_soft[idx].end = soft_end;
+   }
+}
+
 /*
  * first try will not touch pci bridge res
  * second and later try will clear small leaf bridge res
@@ -1847,6 +1902,7 @@ void pci_assign_unassigned_root_bus_resources(struct 
pci_bus *bus)
/* Depth first, calculate sizes and alignments of all
   subordinate buses. */
__pci_bus_size_bridges(bus, add_list);
+   pci_bus_update_fixed_range_soft(bus);
 
/* Depth last, allocate resources and update the hardware. */
__pci_bus_assign_resources(bus, add_list, &fail_head);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7a4d62d84bc1..75a56db73ad4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -574,9 +574,11 @@ struct pci_bus {
 
/*
 * If there are fixed resources in the bridge window, the hard range
-* contains the lowest and the highest addresses of them.
+* contains the lowest and the highest addresses of them, and this
+* bridge window must reside within the soft range.
 */
struct resource fixed_range_hard[PCI_BRIDGE_RESOURCE_NUM];
+   struct resource fixed_range_soft[PCI_BRIDGE_RESOURCE_NUM];
 
struct pci_ops  *ops;   /* Configuration access functions */
struct msi_controller *msi; /* MSI controller */
-- 
2.20.1



[PATCH RFC v4 00/21] PCI: Allow BAR movement during hotplug

2019-03-11 Thread Sergey Miroshnichenko
If the firmware or kernel has arranged memory for PCIe devices in a way
that doesn't provide enough space for BARs of a new hotplugged device, the
kernel can pause the drivers of the "obstructing" devices and move their
BARs, so new BARs can fit into the freed spaces.

When a driver is un-paused by the kernel after the PCIe rescan, it should
check if its BARs had moved, and ioremap() them if needed.

Drivers indicate their support of the feature by implementing the new
rescan_prepare() and rescan_done() hooks in the struct pci_driver. If a
driver doesn't yet support the feature, BARs of its devices will be marked
as immovable by the IORESOURCE_PCI_FIXED flag.

To re-arrange the BARs and bridge windows this patch releases all of them
after a rescan and re-assigns in the same way as during the initial PCIe
topology scan at system boot.

Tested on:
 - x86_64 with "pci=realloc,assign-busses,use_crs pcie_movable_bars=force"
 - POWER8 PowerNV+PHB3 ppc64le with [1] and [2] applied and the following:
   "pci=realloc pcie_movable_bars=force"

Not so many platforms and test cases were covered, so all who are
interested are highly welcome to test on your setups - the more exotic the
better!

This patchset is a part of our work on adding support for hotplugging
bridges full of NVME and GPU devices without special requirements such as
Hot-Plug Controller, reservation of bus numbers or memory regions by
firmware, etc. Future work will be devoted to implementing the movable bus
numbers.

[1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-March/186618.html
[2] https://lists.ozlabs.org/pipermail/skiboot/2019-March/013571.html

Changes since v3:
 - Rebased to the upstream, so the patches apply cleanly again.

Changes since v2:
 - Fixed double-assignment of bridge windows;
 - Fixed assignment of fixed prefetched resources;
 - Fixed releasing of fixed resources;
 - Fixed a debug message;
 - Removed auto-enabling the movable BARs for x86 - let's rely on the
   "pcie_movable_bars=force" option for now;
 - Reordered the patches - bugfixes first.

Changes since v1:
 - Add a "pcie_movable_bars={ off | force }" command line argument;
 - Handle the IORESOURCE_PCI_FIXED flag properly;
 - Don't move BARs of devices which don't support the feature;
 - Guarantee that new hotplugged devices will not steal memory from working
   devices by ignoring the failing new devices with the new PCI_DEV_IGNORE
   flag;
 - Add rescan_prepare()+rescan_done() to the struct pci_driver instead of
   using the reset_prepare()+reset_done() from struct pci_error_handlers;
 - Add a bugfix of a race condition;
 - Fixed hotplug in a non-pre-enabled (by BIOS/firmware) bridge;
 - Fix the compatibility of the feature with pm_runtime and D3-state;
 - Hotplug events from pciehp also can move BARs;
 - Add support of the feature to the NVME driver.

Sergey Miroshnichenko (21):
  PCI: Fix writing invalid BARs during pci_restore_state()
  PCI: Fix race condition in pci_enable/disable_device()
  PCI: Enable bridge's I/O and MEM access for hotplugged devices
  PCI: Define PCI-specific version of the release_child_resources()
  PCI: hotplug: Add a flag for the movable BARs feature
  PCI: Pause the devices with movable BARs during rescan
  PCI: Wake up bridges during rescan when movable BARs enabled
  nvme-pci: Handle movable BARs
  PCI: Mark immovable BARs with PCI_FIXED
  PCI: Fix assigning of fixed prefetchable resources
  PCI: Release and reassign the root bridge resources during rescan
  PCI: Don't allow hotplugged devices to steal resources
  PCI: Include fixed BARs into the bus size calculating
  PCI: Don't reserve memory for hotplug when enabled movable BARs
  PCI: Allow the failed resources to be reassigned later
  PCI: Calculate fixed areas of bridge windows based on fixed BARs
  PCI: Calculate boundaries for bridge windows
  PCI: Make sure bridge windows include their fixed BARs
  PCI: Prioritize fixed BAR assigning over the movable ones
  PCI: pciehp: Add support for the movable BARs feature
  powerpc/pci: Fix crash with enabled movable BARs

 .../admin-guide/kernel-parameters.txt |   7 +
 arch/powerpc/platforms/powernv/pci-ioda.c |   3 +-
 drivers/nvme/host/pci.c   |  29 +-
 drivers/pci/bus.c |   7 +-
 drivers/pci/hotplug/pciehp_pci.c  |  14 +-
 drivers/pci/pci.c |  60 +++-
 drivers/pci/pci.h |  26 ++
 drivers/pci/probe.c   | 271 +-
 drivers/pci/setup-bus.c   | 245 ++--
 drivers/pci/setup-res.c   |  43 ++-
 include/linux/pci.h   |  14 +
 11 files changed, 678 insertions(+), 41 deletions(-)

-- 
2.20.1



[PATCH RFC v4 15/21] PCI: Allow the failed resources to be reassigned later

2019-03-11 Thread Sergey Miroshnichenko
Don't lose the size of the requested EP's BAR if it can't be fit
in a current trial, so this can be retried.

But a failed bridge window must be dropped and recalculated in the
next trial.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c |  3 ++-
 drivers/pci/setup-res.c | 12 
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index f9d605cd1725..c1559a4a8564 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -309,7 +309,8 @@ static void assign_requested_resources_sorted(struct 
list_head *head,
0 /* don't care */,
0 /* don't care */);
}
-   reset_resource(res);
+   if (!pci_movable_bars_enabled())
+   reset_resource(res);
}
}
 }
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index d8ca40a97693..732d18f60f1b 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -298,6 +298,18 @@ static int _pci_assign_resource(struct pci_dev *dev, int 
resno,
 
bus = dev->bus;
while ((ret = __pci_assign_resource(bus, dev, resno, size, min_align))) 
{
+   if (pci_movable_bars_enabled()) {
+   if (resno >= PCI_BRIDGE_RESOURCES &&
+   resno <= PCI_BRIDGE_RESOURCE_END) {
+   struct resource *res = dev->resource + resno;
+
+   res->start = 0;
+   res->end = 0;
+   res->flags = 0;
+   }
+   break;
+   }
+
if (!bus->parent || !bus->self->transparent)
break;
bus = bus->parent;
-- 
2.20.1



[PATCH RFC v4 14/21] PCI: Don't reserve memory for hotplug when enabled movable BARs

2019-03-11 Thread Sergey Miroshnichenko
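pbus_size_mem() returns the precise amount of memory required to fit
all the requested BARs and windows of child bridges, so when movable BARs
are enabled there is no need to reserve additional space for hotplug
bridges in advance.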

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 9d93f2b32bf1..f9d605cd1725 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1229,7 +1229,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
list_head *realloc_head)
 
case PCI_HEADER_TYPE_BRIDGE:
pci_bridge_check_ranges(bus);
-   if (bus->self->is_hotplug_bridge) {
+   if (bus->self->is_hotplug_bridge && 
!pci_movable_bars_enabled()) {
additional_io_size  = pci_hotplug_io_size;
additional_mem_size = pci_hotplug_mem_size;
}
-- 
2.20.1



[PATCH RFC v4 11/21] PCI: Release and reassign the root bridge resources during rescan

2019-03-11 Thread Sergey Miroshnichenko
When the movable BARs feature is enabled, don't rely on the memory gaps
reserved by the BIOS/bootloader/firmware, but instead rearrange the BARs
and bridge windows starting from the root.

Endpoint devices' BARs, after being released, are re-sorted and written
back by pci_assign_unassigned_root_bus_resources().

The last step of writing the recalculated windows to the bridges is done
by the new pci_setup_bridges() function.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/pci.h   |  1 +
 drivers/pci/probe.c | 22 ++
 drivers/pci/setup-bus.c | 11 ++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 224d88634115..e06e8692a7b1 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -248,6 +248,7 @@ void __pci_bus_assign_resources(const struct pci_bus *bus,
struct list_head *realloc_head,
struct list_head *fail_head);
 bool pci_bus_clip_resource(struct pci_dev *dev, int idx);
+void pci_bus_release_root_bridge_resources(struct pci_bus *bus);
 
 void pci_reassigndev_resource_alignment(struct pci_dev *dev);
 void pci_disable_bridge_window(struct pci_dev *dev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 1cf6ec960236..692752c71f71 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -3299,6 +3299,25 @@ static void pci_bus_rescan_done(struct pci_bus *bus)
pm_runtime_put(>dev);
 }
 
+static void pci_setup_bridges(struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   list_for_each_entry(dev, &bus->devices, bus_list) {
+   struct pci_bus *child;
+
+   if (!pci_dev_is_added(dev) || pci_dev_is_ignored(dev))
+   continue;
+
+   child = dev->subordinate;
+   if (child)
+   pci_setup_bridges(child);
+   }
+
+   if (bus->self)
+   pci_setup_bridge(bus);
+}
+
 /**
  * pci_rescan_bus - Scan a PCI bus for devices
  * @bus: PCI bus to scan
@@ -3321,8 +3340,11 @@ unsigned int pci_rescan_bus(struct pci_bus *bus)
pci_bus_rescan_prepare(root);
 
max = pci_scan_child_bus(root);
+
+   pci_bus_release_root_bridge_resources(root);
pci_assign_unassigned_root_bus_resources(root);
 
+   pci_setup_bridges(root);
pci_bus_rescan_done(root);
} else {
max = pci_scan_child_bus(bus);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index be7d4e6d7b65..36a1907d9509 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1584,7 +1584,7 @@ static void pci_bridge_release_resources(struct pci_bus 
*bus,
pci_printk(KERN_DEBUG, dev, "resource %d %pR released\n",
PCI_BRIDGE_RESOURCES + idx, r);
/* keep the old size */
-   r->end = resource_size(r) - 1;
+   r->end = pci_movable_bars_enabled() ? 0 : (resource_size(r) - 
1);
r->start = 0;
r->flags = 0;
 
@@ -1637,6 +1637,15 @@ static void pci_bus_release_bridge_resources(struct 
pci_bus *bus,
pci_bridge_release_resources(bus, type);
 }
 
+void pci_bus_release_root_bridge_resources(struct pci_bus *root_bus)
+{
+   pci_bus_release_bridge_resources(root_bus, IORESOURCE_IO, 
whole_subtree);
+   pci_bus_release_bridge_resources(root_bus, IORESOURCE_MEM, 
whole_subtree);
+   pci_bus_release_bridge_resources(root_bus,
+IORESOURCE_MEM_64 | 
IORESOURCE_PREFETCH,
+whole_subtree);
+}
+
 static void pci_bus_dump_res(struct pci_bus *bus)
 {
struct resource *res;
-- 
2.20.1



[PATCH RFC v4 13/21] PCI: Include fixed BARs into the bus size calculating

2019-03-11 Thread Sergey Miroshnichenko
The only difference between the fixed and movable BARs is an offset
preservation during the release+reassign procedure on PCIe rescan.

When fixed BARs are included in the result of pbus_size_mem(), the
assignment of these BARs can be restricted to their direct parents only.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 551108f48df7..9d93f2b32bf1 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1007,12 +1007,20 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
struct resource *r = &dev->resource[i];
resource_size_t r_size;
 
-   if (r->parent || (r->flags & IORESOURCE_PCI_FIXED) ||
+   if (r->parent ||
((r->flags & mask) != type &&
 (r->flags & mask) != type2 &&
 (r->flags & mask) != type3))
continue;
r_size = resource_size(r);
+
+   if (r->flags & IORESOURCE_PCI_FIXED) {
+   if (pci_movable_bars_enabled())
+   size += r_size;
+
+   continue;
+   }
+
 #ifdef CONFIG_PCI_IOV
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
@@ -1351,6 +1359,8 @@ static void pdev_assign_fixed_resources(struct pci_dev 
*dev)
while (b && !r->parent) {
assign_fixed_resource_on_bus(b, r);
b = b->parent;
+   if (!r->parent && pci_movable_bars_enabled())
+   break;
}
}
 }
-- 
2.20.1



[PATCH RFC v4 10/21] PCI: Fix assigning of fixed prefetchable resources

2019-03-11 Thread Sergey Miroshnichenko
Allow fixed prefetchable resources to be matched to non-prefetchable
windows, as is done for movable resources.

Signed-off-by: Sergey Miroshnichenko 
---
 drivers/pci/setup-bus.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 3644feb13179..be7d4e6d7b65 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1301,15 +1301,20 @@ static void assign_fixed_resource_on_bus(struct pci_bus 
*b, struct resource *r)
 {
int i;
struct resource *parent_r;
-   unsigned long mask = IORESOURCE_IO | IORESOURCE_MEM |
-IORESOURCE_PREFETCH;
+   unsigned long mask = IORESOURCE_TYPE_BITS;
 
pci_bus_for_each_resource(b, parent_r, i) {
if (!parent_r)
continue;
 
-   if ((r->flags & mask) == (parent_r->flags & mask) &&
-   resource_contains(parent_r, r))
+   if ((r->flags & mask) != (parent_r->flags & mask))
+   continue;
+
+   if (parent_r->flags & IORESOURCE_PREFETCH &&
+   !(r->flags & IORESOURCE_PREFETCH))
+   continue;
+
+   if (resource_contains(parent_r, r))
request_resource(parent_r, r);
}
 }
-- 
2.20.1



  1   2   >