[PATCH v6 11/11] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2021-08-17 Thread Leonardo Bras
A previous change introduced the use of DDW as a bigger indirect DMA
mapping when the available DDW size is not enough to map the whole
partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, rename the identifiers and debug/info messages to
reflect that they can be used for both kinds of mapping.

This should cause no behavioural change, just adjust naming.

Signed-off-by: Leonardo Bras 
Reviewed-by: Frederic Barrat 
---
 arch/powerpc/platforms/pseries/iommu.c | 87 +-
 1 file changed, 45 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 0eccc29f5573..dab5c56ffd0e 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -349,7 +349,7 @@ struct dynamic_dma_window_prop {
__be32  window_shift;   /* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -369,11 +369,11 @@ struct ddw_create_response {
u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
@@ -713,7 +713,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus 
*bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 dn);
 
-   /* Find nearest ibm,dma-window, walking up the device tree */
+   /*
+* Find nearest ibm,dma-window (default DMA window), walking up the
+* device tree
+*/
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
if (dma_window != NULL)
@@ -869,37 +872,37 @@ static int remove_ddw(struct device_node *np, bool 
remove_prop, const char *win_
 
ret = of_remove_property(np, win);
if (ret)
-   pr_warn("%pOF: failed to remove direct window property: %d\n",
+   pr_warn("%pOF: failed to remove DMA window property: %d\n",
np, ret);
return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
 {
-   struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
+   struct dma_win *window;
+   const struct dynamic_dma_window_prop *dma64;
bool found = false;
 
-   spin_lock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
-   list_for_each_entry(window, &direct_window_list, list) {
+   list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
-   direct64 = window->prop;
-   *dma_addr = be64_to_cpu(direct64->dma_base);
-   *window_shift = be32_to_cpu(direct64->window_shift);
+   dma64 = window->prop;
+   *dma_addr = be64_to_cpu(dma64->dma_base);
+   *window_shift = be32_to_cpu(dma64->window_shift);
found = true;
break;
}
}
-   spin_unlock(&direct_window_list_lock);
+   spin_unlock(&dma_win_list_lock);
 
return found;
 }
 
-static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
-   const struct 
dynamic_dma_window_prop *dma64)
+static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
+ const struct dynamic_dma_window_prop 
*dma64)
 {
-   struct direct_window *window;
+   struct dma_win *window;
 
window = kzalloc(sizeof(*window), GFP_KERNEL);
if (!window)
@@ -915,7 +918,7 @@ static void find_existing_ddw_windows_named(const char 
*name)
 {
int len;
struct device_node *pdn;
-   struct direct_window *window;
+   struct dma_win *window;
const struct dynamic_dma_window_prop *dma64;
 
for_each_node_with_property(pdn, name) {
@@ -929,9 +932,9 @@ static void find_existing_ddw_windows_named(const char 
*name)
if (!window)
break;
 
-   spin_lock(&direct_window_list_lock);
-   list_add(&window->list, &direct_window_list);
-   spin_unlock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);

[PATCH v6 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-08-17 Thread Leonardo Bras
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds of VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to much larger page sizes
(16MB as implemented in qemu vs 4k from the default DMA window), causing a
significant increase in the maximum amount of memory that can be IOMMU
mapped at the same time.

Indirect mapping will only be used if direct mapping is not a
possibility.

For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.

Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.

Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
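
For illustration only (not part of the patch): a rough sketch of how the
property name and mapping mode can be chosen, assuming query, len and
page_shift are already set up as in enable_ddw():

	/* Sketch: decide between direct and indirect mapping for the DDW */
	if (query.largest_available_block >= (1ULL << (len - page_shift))) {
		/* enough TCEs to map the whole partition: direct mapping */
		direct_mapping = true;
		win_name = DIRECT64_PROPNAME;
	} else {
		/* new property, so a kexec'ed kernel keeps using iommu_alloc() */
		direct_mapping = false;
		win_name = DMA64_PROPNAME;
	}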

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 89 +-
 1 file changed, 74 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index e11c00b2dc1e..0eccc29f5573 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -375,6 +375,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -940,6 +941,7 @@ static int find_existing_ddw_windows(void)
return 0;
 
find_existing_ddw_windows_named(DIRECT64_PROPNAME);
+   find_existing_ddw_windows_named(DMA64_PROPNAME);
 
return 0;
 }
@@ -1226,14 +1228,17 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct ddw_create_response create;
int page_shift;
u64 win_addr;
+   const char *win_name;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
bool ddw_enabled = false;
struct failed_ddw_pdn *fpdn;
-   bool default_win_removed = false;
+   bool default_win_removed = false, direct_mapping = false;
bool pmem_present;
+   struct pci_dn *pci = PCI_DN(pdn);
+   struct iommu_table *tbl = pci->table_group->tables[0];
 
dn = of_find_node_by_type(NULL, "ibm,pmemory");
pmem_present = dn != NULL;
@@ -1242,6 +1247,7 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
	mutex_lock(&direct_window_init_mutex);
 
	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
+   direct_mapping = (len >= max_ram_len);
ddw_enabled = true;
goto out_unlock;
}
@@ -1322,8 +1328,8 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
  query.page_size);
goto out_failed;
}
-   /* verify the window * number of ptes will map the partition */
-   /* check largest block * page size > max memory hotplug addr */
+
+
/*
 * The "ibm,pmemory" can appear anywhere in the address space.
 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
@@ -1339,13 +1345,25 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
		dev_info(&dev->dev, "Skipping ibm,pmemory");
}
 
+   /* check if the available block * number of ptes will map everything */
if (query.largest_available_block < (1ULL << (len - page_shift))) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
1ULL << len,
query.largest_available_block,
1ULL << page_shift);
-   goto out_failed;
+
+   /* DDW + IOMMU 

[PATCH v6 09/11] powerpc/pseries/iommu: Find existing DDW with given property name

2021-08-17 Thread Leonardo Bras
At the moment pseries stores information about the created directly mapped
DDW window in the DIRECT64_PROPNAME property.

With the objective of implementing indirect DMA mapping with DDW, it's
necessary to have another property name to make sure kexec'ing into older
kernels does not break, as it would if we reused DIRECT64_PROPNAME.

In order to have this, find_existing_ddw_windows() needs to be able to
look for different property names.

Extract find_existing_ddw_windows() into find_existing_ddw_windows_named()
and call it with the current property name.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
Reviewed-by: Frederic Barrat 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 901f290999d0..e11c00b2dc1e 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -910,24 +910,21 @@ static struct direct_window *ddw_list_new_entry(struct 
device_node *pdn,
return window;
 }
 
-static int find_existing_ddw_windows(void)
+static void find_existing_ddw_windows_named(const char *name)
 {
int len;
struct device_node *pdn;
struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
-
-   if (!firmware_has_feature(FW_FEATURE_LPAR))
-   return 0;
+   const struct dynamic_dma_window_prop *dma64;
 
-   for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-   direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-   if (!direct64 || len < sizeof(*direct64)) {
-   remove_ddw(pdn, true, DIRECT64_PROPNAME);
+   for_each_node_with_property(pdn, name) {
+   dma64 = of_get_property(pdn, name, &len);
+   if (!dma64 || len < sizeof(*dma64)) {
+   remove_ddw(pdn, true, name);
continue;
}
 
-   window = ddw_list_new_entry(pdn, direct64);
+   window = ddw_list_new_entry(pdn, dma64);
if (!window)
break;
 
@@ -935,6 +932,14 @@ static int find_existing_ddw_windows(void)
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);
}
+}
+
+static int find_existing_ddw_windows(void)
+{
+   if (!firmware_has_feature(FW_FEATURE_LPAR))
+   return 0;
+
+   find_existing_ddw_windows_named(DIRECT64_PROPNAME);
 
return 0;
 }
-- 
2.32.0



[PATCH v6 08/11] powerpc/pseries/iommu: Update remove_dma_window() to accept property name

2021-08-17 Thread Leonardo Bras
Update remove_dma_window() so it can be used to remove DDW with a given
property name.

This enables the creation of new property names for DDW, so we can
have different uses for it, like indirect mapping.

Also, add return values to it so we can check if the property was found
while removing the active DDW. This allows skipping the remaining property
names once one is found, reducing the overhead of supporting multiple names.
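
For illustration only (not part of the patch): with remove_ddw() returning
-EINVAL when the property is absent, a caller that tries several property
names can stop at the first one that exists on the node. DMA64_PROPNAME is
only introduced later in the series and is used here just as an example:

	static const char * const names[] = { DIRECT64_PROPNAME, DMA64_PROPNAME };
	int i;

	for (i = 0; i < ARRAY_SIZE(names); i++)
		if (remove_ddw(np, true, names[i]) == 0)
			break;	/* property found and handled, skip the rest */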

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index a47f59a8f107..901f290999d0 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -844,31 +844,33 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
__remove_dma_window(np, ddw_avail, liobn);
 }
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static int remove_ddw(struct device_node *np, bool remove_prop, const char 
*win_name)
 {
struct property *win;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
int ret = 0;
 
+   win = of_find_property(np, win_name, NULL);
+   if (!win)
+   return -EINVAL;
+
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
				 &ddw_avail[0], DDW_APPLICABLE_SIZE);
if (ret)
-   return;
+   return 0;
 
-   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
-   if (!win)
-   return;
 
if (win->length >= sizeof(struct dynamic_dma_window_prop))
remove_dma_window(np, ddw_avail, win);
 
if (!remove_prop)
-   return;
+   return 0;
 
ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
+   return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
@@ -921,7 +923,7 @@ static int find_existing_ddw_windows(void)
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
direct64 = of_get_property(pdn, DIRECT64_PROPNAME, );
if (!direct64 || len < sizeof(*direct64)) {
-   remove_ddw(pdn, true);
+   remove_ddw(pdn, true, DIRECT64_PROPNAME);
continue;
}
 
@@ -1565,7 +1567,7 @@ static int iommu_reconfig_notifier(struct notifier_block 
*nb, unsigned long acti
 * we have to remove the property when releasing
 * the device node.
 */
-   remove_ddw(np, false);
+   remove_ddw(np, false, DIRECT64_PROPNAME);
if (pci && pci->table_group)
iommu_pseries_free_group(pci->table_group,
np->full_name);
-- 
2.32.0



[PATCH v6 07/11] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper

2021-08-17 Thread Leonardo Bras
Add a new helper, iommu_table_setparms_common(), and use it in
iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated
code.

Also, setting tbl->it_ops was happening outside iommu_table_setparms*(),
so move it to the new helper. Since we need the iommu_table_ops to be
declared before they are used, declare iommu_table_lpar_multi_ops and
iommu_table_pseries_ops before their respective iommu_table_setparms*().

Signed-off-by: Leonardo Bras 
Reviewed-by: Frederic Barrat 
---
 arch/powerpc/platforms/pseries/iommu.c | 72 ++
 1 file changed, 38 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 00392582fe10..a47f59a8f107 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -501,6 +501,24 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long 
start_pfn,
return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 }
 
+static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long 
busno,
+   unsigned long liobn, unsigned long 
win_addr,
+   unsigned long window_size, unsigned 
long page_shift,
+   void *base, struct iommu_table_ops 
*table_ops)
+{
+   tbl->it_busno = busno;
+   tbl->it_index = liobn;
+   tbl->it_offset = win_addr >> page_shift;
+   tbl->it_size = window_size >> page_shift;
+   tbl->it_page_shift = page_shift;
+   tbl->it_base = (unsigned long)base;
+   tbl->it_blocksize = 16;
+   tbl->it_type = TCE_PCI;
+   tbl->it_ops = table_ops;
+}
+
+struct iommu_table_ops iommu_table_pseries_ops;
+
 static void iommu_table_setparms(struct pci_controller *phb,
 struct device_node *dn,
 struct iommu_table *tbl)
@@ -509,8 +527,13 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
const unsigned long *basep;
const u32 *sizep;
 
-   node = phb->dn;
+   /* Test if we are going over 2GB of DMA space */
+   if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
+   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
+   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+   }
 
+   node = phb->dn;
basep = of_get_property(node, "linux,tce-base", NULL);
sizep = of_get_property(node, "linux,tce-size", NULL);
if (basep == NULL || sizep == NULL) {
@@ -519,33 +542,18 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
return;
}
 
-   tbl->it_base = (unsigned long)__va(*basep);
+   iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur,
+			       phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
+			       __va(*basep), &iommu_table_pseries_ops);
 
if (!is_kdump_kernel())
memset((void *)tbl->it_base, 0, *sizep);
 
-   tbl->it_busno = phb->bus->number;
-   tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-
-   /* Units of tce entries */
-   tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
-
-   /* Test if we are going over 2GB of DMA space */
-   if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
-   udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
-   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
-   }
-
phb->dma_window_base_cur += phb->dma_window_size;
-
-   /* Set the tce table size - measured in entries */
-   tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
-
-   tbl->it_index = 0;
-   tbl->it_blocksize = 16;
-   tbl->it_type = TCE_PCI;
 }
 
+struct iommu_table_ops iommu_table_lpar_multi_ops;
+
 /*
  * iommu_table_setparms_lpar
  *
@@ -557,17 +565,13 @@ static void iommu_table_setparms_lpar(struct 
pci_controller *phb,
  struct iommu_table_group *table_group,
  const __be32 *dma_window)
 {
-   unsigned long offset, size;
+   unsigned long offset, size, liobn;
 
-   of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
+   of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
+
+   iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size,
+			       IOMMU_PAGE_SHIFT_4K, NULL,
+			       &iommu_table_lpar_multi_ops);
 
-   tbl->it_busno = phb->bus->number;
-   tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-   tbl->it_base   = 0;
-   tbl->it_blocksize  = 16;
-   tbl

[PATCH v6 06/11] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2021-08-17 Thread Leonardo Bras
Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocating and filling the property, leaving it ready for
of_add_property(), which now occurs right after it.

This created an opportunity to reorganize the second part of enable_ddw():

Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().

With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().

This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks like the most expensive operation, only if
everything else succeeds.

Also, in the error path, remove_ddw() was replaced by a new helper
__remove_dma_window(), which only removes the new DDW with an RTAS call.
For this, a new helper clean_dma_window() was needed to clean up anything
that could be left if walk_system_ram_range() fails.
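
A minimal sketch of the reordered flow described above (illustrative only;
the exact label names are assumptions, not copied from the patch):

	win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
				    page_shift, len);
	if (!win64)
		goto out_failed;

	ret = of_add_property(pdn, win64);
	if (ret)
		goto out_free_prop;

	window = ddw_list_new_entry(pdn, win64->value);
	if (!window)
		goto out_del_prop;

	/* most expensive step runs last, only if everything else succeeded */
	ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
				    win64->value, tce_setrange_multi_pSeriesLP_walk);
	if (ret)
		goto out_free_window;	/* clean_dma_window() + __remove_dma_window() */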

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 129 -
 1 file changed, 84 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index b34b473bbdc1..00392582fe10 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -795,17 +795,10 @@ static int __init disable_ddw_setup(char *str)
 
 early_param("disable_ddw", disable_ddw_setup);
 
-static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
- struct property *win)
+static void clean_dma_window(struct device_node *np, struct 
dynamic_dma_window_prop *dwp)
 {
-   struct dynamic_dma_window_prop *dwp;
-   u64 liobn;
int ret;
 
-   dwp = win->value;
-   liobn = (u64)be32_to_cpu(dwp->liobn);
-
-   /* clear the whole window, note the arg is in kernel pages */
ret = tce_clearrange_multi_pSeriesLP(0,
1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
if (ret)
@@ -814,18 +807,39 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
else
pr_debug("%pOF successfully cleared tces in window.\n",
 np);
+}
+
+/*
+ * Call only if DMA window is clean.
+ */
+static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 
liobn)
+{
+   int ret;
 
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
-   pr_warn("%pOF: failed to remove direct window: rtas returned "
+   pr_warn("%pOF: failed to remove DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
-   pr_debug("%pOF: successfully removed direct window: rtas 
returned "
+   pr_debug("%pOF: successfully removed DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
 
+static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
+ struct property *win)
+{
+   struct dynamic_dma_window_prop *dwp;
+   u64 liobn;
+
+   dwp = win->value;
+   liobn = (u64)be32_to_cpu(dwp->liobn);
+
+   clean_dma_window(np, dwp);
+   __remove_dma_window(np, ddw_avail, liobn);
+}
+
 static void remove_ddw(struct device_node *np, bool remove_prop)
 {
struct property *win;
@@ -1153,6 +1167,35 @@ static int iommu_get_page_shift(u32 query_page_size)
return 0;
 }
 
+static struct property *ddw_property_create(const char *propname, u32 liobn, 
u64 dma_addr,
+   u32 page_shift, u32 window_shift)
+{
+   struct dynamic_dma_window_prop *ddwprop;
+   struct property *win64;
+
+   win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
+   if (!win64)
+   return NULL;
+
+   win64->name = kstrdup(propname, GFP_KERNEL);
+   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
+   win64->value = ddwprop;
+   win64->length = sizeof(*ddwprop);
+   if (!win64->name || !win64->value) {
+   kfree(win64->name);
+   kfree(win64->value);
+   kfree(win64);
+   return NULL;
+   }
+
+   ddwprop->liobn = cpu_to_be32(liobn);
+   ddwprop->dma_base = cpu_to_be64(dma_addr);
+   ddwprop->tce_shi

[PATCH v6 05/11] powerpc/pseries/iommu: Allow DDW windows starting at 0x00

2021-08-17 Thread Leonardo Bras
enable_ddw() currently returns the address of the DMA window, which is
considered invalid if it has the value 0x00.

Also, it only considers an address returned from find_existing_ddw() valid
if it's not 0x00.

Changing this behavior makes sense, given the users of enable_ddw() only
need to know whether direct mapping is possible. It also allows a DMA
window starting at 0x00 to be used.

This will be helpful for using a DDW with indirect mapping, as the window
address may be different from 0x00, but it will not map the whole
partition.
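
For illustration (not part of the patch), the caller side gets simpler once
enable_ddw() stores the DMA offset itself and only reports whether direct
mapping is possible, e.g. in iommu_bypass_supported_pSeriesLP():

	/* before: dma_offset returned and checked against 0 */
	pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
	if (pdev->dev.archdata.dma_offset)
		return true;

	/* after: enable_ddw() fills dev->dev.archdata.dma_offset internally */
	return enable_ddw(pdev, pdn);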

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
Reviewed-by: Frederic Barrat 
---
 arch/powerpc/platforms/pseries/iommu.c | 36 +-
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 712d1667144a..b34b473bbdc1 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -853,25 +853,26 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
 {
struct direct_window *window;
const struct dynamic_dma_window_prop *direct64;
-   u64 dma_addr = 0;
+   bool found = false;
 
	spin_lock(&direct_window_list_lock);
/* check if we already created a window and dupe that config if so */
	list_for_each_entry(window, &direct_window_list, list) {
if (window->device == pdn) {
direct64 = window->prop;
-   dma_addr = be64_to_cpu(direct64->dma_base);
+   *dma_addr = be64_to_cpu(direct64->dma_base);
*window_shift = be32_to_cpu(direct64->window_shift);
+   found = true;
break;
}
}
	spin_unlock(&direct_window_list_lock);
 
-   return dma_addr;
+   return found;
 }
 
 static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
@@ -1161,20 +1162,20 @@ static int iommu_get_page_shift(u32 query_page_size)
  * pdn: the parent pe node with the ibm,dma_window property
  * Future: also check if we can remap the base window for our base page size
  *
- * returns the dma offset for use by the direct mapped DMA code.
+ * returns true if can map all pages (direct mapping), false otherwise..
  */
-static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
int len = 0, ret;
int max_ram_len = order_base_2(ddw_memory_hotplug_max());
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
-   u64 dma_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
+   bool ddw_enabled = false;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
@@ -1186,9 +1187,10 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 
	mutex_lock(&direct_window_init_mutex);
 
-   dma_addr = find_existing_ddw(pdn, &len);
-   if (dma_addr != 0)
+   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
+   ddw_enabled = true;
goto out_unlock;
+   }
 
/*
 * If we already went through this for a previous function of
@@ -1342,7 +1344,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);
 
-   dma_addr = be64_to_cpu(ddwprop->dma_base);
+   dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base);
+   ddw_enabled = true;
goto out_unlock;
 
 out_free_window:
@@ -1374,10 +1377,10 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 * as RAM, then we failed to create a window to cover persistent
 * memory and need to set the DMA limit.
 */
-   if (pmem_present && dma_addr && (len == max_ram_len))
-   dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
+   if (pmem_present && ddw_enabled && (len == max_ram_len))
+   dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL 
<< len);
 
-   return dma_addr;
+   return ddw_enabled;
 }
 
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
@@ -1456,11 +1459,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
pci_dev *pdev, u64 dma_mask)
break;
}
 
-   if (pdn && PCI_DN(pdn)) {
-   pdev->dev.archdata.dma_offset = enable_ddw(pdev

[PATCH v6 04/11] powerpc/pseries/iommu: Add ddw_list_new_entry() helper

2021-08-17 Thread Leonardo Bras
There are two functions creating direct_window_list entries in a
similar way, so create ddw_list_new_entry() to avoid duplication and
simplify those functions.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
Reviewed-by: Frederic Barrat 
---
 arch/powerpc/platforms/pseries/iommu.c | 32 +-
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 33d82865d6e6..712d1667144a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -874,6 +874,21 @@ static u64 find_existing_ddw(struct device_node *pdn, int 
*window_shift)
return dma_addr;
 }
 
+static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
+   const struct 
dynamic_dma_window_prop *dma64)
+{
+   struct direct_window *window;
+
+   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   if (!window)
+   return NULL;
+
+   window->device = pdn;
+   window->prop = dma64;
+
+   return window;
+}
+
 static int find_existing_ddw_windows(void)
 {
int len;
@@ -886,18 +901,15 @@ static int find_existing_ddw_windows(void)
 
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
	direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-   if (!direct64)
-   continue;
-
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
-   if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
-   kfree(window);
+   if (!direct64 || len < sizeof(*direct64)) {
remove_ddw(pdn, true);
continue;
}
 
-   window->device = pdn;
-   window->prop = direct64;
+   window = ddw_list_new_entry(pdn, direct64);
+   if (!window)
+   break;
+
	spin_lock(&direct_window_list_lock);
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);
@@ -1307,7 +1319,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
dev_dbg(>dev, "created tce table LIOBN 0x%x for %pOF\n",
  create.liobn, dn);
 
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   window = ddw_list_new_entry(pdn, ddwprop);
if (!window)
goto out_clear_window;
 
@@ -1326,8 +1338,6 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_free_window;
}
 
-   window->device = pdn;
-   window->prop = ddwprop;
	spin_lock(&direct_window_list_lock);
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);
-- 
2.32.0



[PATCH v6 03/11] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper

2021-08-17 Thread Leonardo Bras
Create a helper to allow allocating a new iommu_table without the need
to reallocate the iommu_group.

This will be helpful for replacing the iommu_table for the new DMA window,
after we remove the old one with iommu_tce_table_put().
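
For illustration only (how the helper is expected to be used later in the
series; the surrounding code is a sketch, not part of this patch):

	struct iommu_table *newtbl = iommu_pseries_alloc_table(pci->phb->node);

	if (newtbl) {
		/* ... set up newtbl for the new DMA window ... */
		iommu_tce_table_put(tbl);	/* drop the old table */
		/* the iommu_group itself is kept; no new iommu_pseries_alloc_group() */
	}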

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
Reviewed-by: Frederic Barrat 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index b1b8d12bab39..33d82865d6e6 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,28 +53,31 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
-static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+static struct iommu_table *iommu_pseries_alloc_table(int node)
 {
-   struct iommu_table_group *table_group;
struct iommu_table *tbl;
 
-   table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
-  node);
-   if (!table_group)
-   return NULL;
-
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
if (!tbl)
-   goto free_group;
+   return NULL;
 
	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
	kref_init(&tbl->it_kref);
+   return tbl;
+}
 
-   table_group->tables[0] = tbl;
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+   struct iommu_table_group *table_group;
+
+   table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
+   if (!table_group)
+   return NULL;
 
-   return table_group;
+   table_group->tables[0] = iommu_pseries_alloc_table(node);
+   if (table_group->tables[0])
+   return table_group;
 
-free_group:
kfree(table_group);
return NULL;
 }
-- 
2.32.0



[PATCH v6 02/11] powerpc/kernel/iommu: Add new iommu_table_in_use() helper

2021-08-17 Thread Leonardo Bras
Having a function to check if the iommu table has any allocation helps
in deciding whether a tbl can be reset to use a new DMA window.

It should be enough to replace all instances of !bitmap_empty(tbl...).

iommu_table_in_use() skips reserved memory, so we don't need to worry about
releasing it before testing. This causes iommu_table_release_pages() to
become unnecessary, given it was only used to remove reserved memory for
testing.

Also, only store reserved memory values in tbl if they are valid for the
table, so there is no need to check them in the new helper.
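
For illustration (not part of this patch), the typical caller pattern the
helper enables, e.g. when deciding whether a DMA window can be replaced
(the label name is only an example):

	if (iommu_table_in_use(tbl)) {
		dev_warn(&dev->dev, "current IOMMU mappings present, not replacing window\n");
		goto out_failed;
	}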

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 61 
 2 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index deef7c94d7b6..bf3b84128525 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2af89a5e379f..ed98ad63633e 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -690,32 +690,24 @@ static void iommu_table_reserve_pages(struct iommu_table 
*tbl,
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
 
-   tbl->it_reserved_start = res_start;
-   tbl->it_reserved_end = res_end;
-
-   /* Check if res_start..res_end isn't empty and overlaps the table */
-   if (res_start && res_end &&
-   (tbl->it_offset + tbl->it_size < res_start ||
-res_end < tbl->it_offset))
-   return;
+   if (res_start < tbl->it_offset)
+   res_start = tbl->it_offset;
 
-   for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   set_bit(i - tbl->it_offset, tbl->it_map);
-}
+   if (res_end > (tbl->it_offset + tbl->it_size))
+   res_end = tbl->it_offset + tbl->it_size;
 
-static void iommu_table_release_pages(struct iommu_table *tbl)
-{
-   int i;
+   /* Check if res_start..res_end is a valid range in the table */
+   if (res_start >= res_end) {
+   tbl->it_reserved_start = tbl->it_offset;
+   tbl->it_reserved_end = tbl->it_offset;
+   return;
+   }
 
-   /*
-* In case we have reserved the first bit, we should not emit
-* the warning below.
-*/
-   if (tbl->it_offset == 0)
-   clear_bit(0, tbl->it_map);
+   tbl->it_reserved_start = res_start;
+   tbl->it_reserved_end = res_end;
 
for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   clear_bit(i - tbl->it_offset, tbl->it_map);
+   set_bit(i - tbl->it_offset, tbl->it_map);
 }
 
 /*
@@ -779,6 +771,22 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid,
return tbl;
 }
 
+bool iommu_table_in_use(struct iommu_table *tbl)
+{
+   unsigned long start = 0, end;
+
+   /* ignore reserved bit0 */
+   if (tbl->it_offset == 0)
+   start = 1;
+   end = tbl->it_reserved_start - tbl->it_offset;
+   if (find_next_bit(tbl->it_map, end, start) != end)
+   return true;
+
+   start = tbl->it_reserved_end - tbl->it_offset;
+   end = tbl->it_size;
+   return find_next_bit(tbl->it_map, end, start) != end;
+}
+
 static void iommu_table_free(struct kref *kref)
 {
struct iommu_table *tbl;
@@ -795,10 +803,8 @@ static void iommu_table_free(struct kref *kref)
 
iommu_debugfs_del(tbl);
 
-   iommu_table_release_pages(tbl);
-
/* verify that table contains no entries */
-   if (!bitmap_empty(tbl->it_map, tbl->it_size))
+   if (iommu_table_in_use(tbl))
pr_warn("%s: Unexpected TCEs\n", __func__);
 
/* free bitmap */
@@ -1099,14 +1105,9 @@ int iommu_take_ownership(struct iommu_table *tbl)
for (i = 0; i < tbl->nr_pools; i++)
	spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
 
-   iommu_table_release_pages(tbl);
-
-   if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+   if (iommu_table_in_use(tbl)) {
pr_err("iommu_tce: it_map is not empty");
ret = -EBUSY;
-   /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
-   iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
-   tbl->it_reserved_end);
} else {
memset(tbl->it_map, 0xff, sz);
}
-- 
2.32.0



[PATCH v6 01/11] powerpc/pseries/iommu: Replace hard-coded page shift

2021-08-17 Thread Leonardo Bras
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.

In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.

IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5, show
a 52-bit RPN and consider a 12-bit pageshift, so there should be no need
to use TCE_RPN_MASK, which masks out any bit above 40 in the RPN. Its
usage was removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().

Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers don't always have a
tbl struct, so adding a tceshift parameter seems the right thing to do.
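
A worked illustration (not from the patch) of a TCE built with the table's
own page shift instead of the hard-coded TCE_SHIFT/TCE_RPN_MASK:

	unsigned long tceshift = tbl->it_page_shift;	/* 12 for 4K, 16 for 64K */
	u64 rpn = __pa(uaddr) >> tceshift;		/* real page number */
	u64 tce = proto_tce | (rpn << tceshift);	/* no 40-bit RPN mask */

With a 64K IOMMU page (tceshift == 16) and a buffer at physical address
0x20010000, rpn is 0x2001 and the TCE value is (0x2001 << 16) | TCE_PCI_READ.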

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
Reviewed-by: Frederic Barrat 
---
 arch/powerpc/include/asm/tce.h |  8 --
 arch/powerpc/platforms/pseries/iommu.c | 39 +++---
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
index db5fc2f2262d..0c34d2756d92 100644
--- a/arch/powerpc/include/asm/tce.h
+++ b/arch/powerpc/include/asm/tce.h
@@ -19,15 +19,7 @@
 #define TCE_VB 0
 #define TCE_PCI1
 
-/* TCE page size is 4096 bytes (1 << 12) */
-
-#define TCE_SHIFT  12
-#define TCE_PAGE_SIZE  (1 << TCE_SHIFT)
-
 #define TCE_ENTRY_SIZE 8   /* each TCE is 64 bits */
-
-#define TCE_RPN_MASK   0xfful  /* 40-bit RPN (4K pages) */
-#define TCE_RPN_SHIFT  12
 #define TCE_VALID  0x800   /* TCE valid */
 #define TCE_ALLIO  0x400   /* TCE valid for all lpars */
 #define TCE_PCI_WRITE  0x2 /* write from PCI allowed */
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 0c55b991f665..b1b8d12bab39 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -107,6 +107,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long 
index,
u64 proto_tce;
__be64 *tcep;
u64 rpn;
+   const unsigned long tceshift = tbl->it_page_shift;
+   const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
 
proto_tce = TCE_PCI_READ; // Read allowed
 
@@ -117,10 +119,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, 
long index,
 
while (npages--) {
/* can't move this out since we might cross MEMBLOCK boundary */
-   rpn = __pa(uaddr) >> TCE_SHIFT;
-   *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << 
TCE_RPN_SHIFT);
+   rpn = __pa(uaddr) >> tceshift;
+   *tcep = cpu_to_be64(proto_tce | rpn << tceshift);
 
-   uaddr += TCE_PAGE_SIZE;
+   uaddr += pagesize;
tcep++;
}
return 0;
@@ -146,7 +148,7 @@ static unsigned long tce_get_pseries(struct iommu_table 
*tbl, long index)
return be64_to_cpu(*tcep);
 }
 
-static void tce_free_pSeriesLP(unsigned long liobn, long, long);
+static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
 static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
 
 static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
@@ -166,12 +168,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long 
tcenum, long tceshift,
proto_tce |= TCE_PCI_WRITE;
 
while (npages--) {
-   tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
+   tce = proto_tce | rpn << tceshift;
rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
 
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
-   tce_free_pSeriesLP(liobn, tcenum_start,
+   tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
   (npages_start - (npages + 1)));
break;
}
@@ -205,10 +207,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
long tcenum_start = tcenum, npages_start = npages;
int ret = 0;
unsigned long flags;
+   const unsigned long tceshift = tbl->it_page_shift;
 
if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
return tce_build_pSeriesLP(tbl->it_index, tcenum,
-  tbl->it_page_shift, npages, uaddr,
+  tceshift, npages, uaddr,
   direction, attrs);
}
 
@@ -225,13 +228,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
if (!tcep) {
loc

[PATCH v6 00/11] DDW + Indirect Mapping

2021-08-17 Thread Leonardo Bras
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds of VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

Using the DDW instead of the default DMA window may allow expanding the
amount of memory that can be DMA-mapped, given that the number of pages
(TCEs) may stay the same (or increase) and the default DMA window offers
only 4k pages while DDW may offer larger pages (4k, 64k, 16M ...).
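
As a rough illustration (numbers assumed, not measured): with 2M TCEs kept
constant, a 4k-page window maps 2M * 4 KiB = 8 GiB, a 64k-page DDW maps
2M * 64 KiB = 128 GiB, and a 16M-page DDW maps 2M * 16 MiB = 32 TiB.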

Patch #1 replaces hard-coded 4K page size with a variable containing the
correct page size for the window.

Patch #2 introduces iommu_table_in_use(), and replaces manual bit-field
checking where it's used. It will be used to abort enable_ddw() if
there is any current iommu allocation and we are trying single-window
indirect mapping.

Patch #3 introduces iommu_pseries_alloc_table() that will be helpful
when indirect mapping needs to replace the iommu_table.

Patch #4 adds helpers for adding DDWs in the list.

Patch #5 refactors enable_ddw() so it returns whether direct mapping is
possible, instead of the DMA offset. It helps the next patches on
indirect DMA mapping and also allows DMA windows starting at 0x00.

Patch #6 brings a new helper to simplify enable_ddw(), allowing
some reorganization for introducing indirect-mapping DDW.

Patch #7 adds a new helper, iommu_table_setparms_common(), and uses it in
the other *setparms*() functions to fill the iommu_table. It will also be
used for creating a new iommu_table for indirect mapping.

Patch #8 updates remove_dma_window() to accept different property names,
so we can introduce a new property for indirect mapping.

Patch #9 extracts find_existing_ddw_windows() into
find_existing_ddw_windows_named(), and calls it with the property name.
This will be useful when the property for indirect mapping is created,
so we can search the device-tree for both properties.

Patch #10:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance. Also, update the iommu_table and re-generate the pools.
It introduces a new property name for DDW with indirect DMA mapping.

Patch #11:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.

All patches were tested on an LPAR with a virtio-net interface that
allows the default DMA window and DDW to coexist.

Changes since v5:
- Reviews from Frederic Barrat
- 02/11 : memset bitmap only if tbl not in use
- 06/11 : remove_ddw() is not used in enable_ddw() error path anymore 
  New helpers were created for that.
- 10/11 : There was a typo, but got replaced due to 06/11 fix.
v5 Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=253799=%2A=both

Changes since v4:
- Solve conflicts with new upstream versions
- Avoid unnecessary code moving by doing variable declaration before definition
- Rename _iommu_table_setparms to iommu_table_setparms_common and change base
  parameter from unsigned long to void* in order to avoid unnecessary casting.
- Fix breaking case for existing direct-mapping.
- Fix IORESOURCE_MEM bound issue
- Move new tbl to pci->table_group->tables[1] instead of replacing [0]
v4 Link: 
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=241597=%2A=both

Changes since v3:
- Fixed inverted free order at ddw_property_create()
- Updated goto tag naming
v3 Link: 
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=240287=%2A=both

Changes since v2:
- Some patches got removed from the series and sent by themselves,
- New tbl created for DDW + indirect mapping reserves MMIO32 space,
- Improved reserved area algorithm,
- Improved commit messages,
- Removed define for default DMA window prop name,
- Avoided some unnecessary renaming,
- Removed some unnecessary empty lines,
- Changed some code moving to forward declarations.
v2 Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=201210=%2A=both


Leonardo Bras (11):
  powerpc/pseries/iommu: Replace hard-coded page shift
  powerpc/kernel/iommu: Add new iommu_table_in_use() helper
  powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
  powerpc/pseries/iommu: Add ddw_list_new_entry() helper
  powerpc/pseries/iommu: Allow DDW windows starting at 0x00
  powerpc/pseries/iommu: Add ddw_property_create() and refactor
enable_ddw()
  powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new
helper
  powerpc/pseries/iommu: Update remove_dma_window() to accept property
name
  powerpc/pseries/iommu: Find existing DDW with given property name
  powerpc/pseries/iommu: Make use of DDW for indirect mapping
 

[PATCH v5 11/11] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2021-07-16 Thread Leonardo Bras
A previous change introduced the use of DDW as a bigger indirect DMA
mapping when the available DDW size is not enough to map the whole
partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, rename the identifiers and debug/info messages to
reflect that they can be used for both kinds of mapping.

This should cause no behavioural change, just adjust naming.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 91 +-
 1 file changed, 47 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index a67e71c49aeb..52548dfb8b45 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -349,7 +349,7 @@ struct dynamic_dma_window_prop {
__be32  window_shift;   /* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -369,11 +369,11 @@ struct ddw_create_response {
u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
@@ -713,7 +713,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus 
*bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 dn);
 
-   /* Find nearest ibm,dma-window, walking up the device tree */
+   /*
+* Find nearest ibm,dma-window (default DMA window), walking up the
+* device tree
+*/
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
if (dma_window != NULL)
@@ -822,11 +825,11 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
 
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
-   pr_warn("%pOF: failed to remove direct window: rtas returned "
+   pr_warn("%pOF: failed to remove DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
-   pr_debug("%pOF: successfully removed direct window: rtas 
returned "
+   pr_debug("%pOF: successfully removed DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -854,37 +857,37 @@ static int remove_ddw(struct device_node *np, bool 
remove_prop, const char *win_
 
ret = of_remove_property(np, win);
if (ret)
-   pr_warn("%pOF: failed to remove direct window property: %d\n",
+   pr_warn("%pOF: failed to remove DMA window property: %d\n",
np, ret);
return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
 {
-   struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
+   struct dma_win *window;
+   const struct dynamic_dma_window_prop *dma64;
bool found = false;
 
-   spin_lock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
-   list_for_each_entry(window, &direct_window_list, list) {
+   list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
-   direct64 = window->prop;
-   *dma_addr = be64_to_cpu(direct64->dma_base);
-   *window_shift = be32_to_cpu(direct64->window_shift);
+   dma64 = window->prop;
+   *dma_addr = be64_to_cpu(dma64->dma_base);
+   *window_shift = be32_to_cpu(dma64->window_shift);
found = true;
break;
}
}
-   spin_unlock(&direct_window_list_lock);
+   spin_unlock(&dma_win_list_lock);
 
return found;
 }
 
-static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
-   const struct 
dynamic_dma_window_prop *dma64)
+static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
+

[PATCH v5 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-07-16 Thread Leonardo Bras
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds of VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

By using DDW, indirect mapping can get more TCEs than available for the
default DMA window, and also get access to much larger page sizes
(16MB as implemented in qemu vs 4k from the default DMA window), causing a
significant increase in the maximum amount of memory that can be IOMMU
mapped at the same time.

Indirect mapping will only be used if direct mapping is not a
possibility.

For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.

Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.

Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 87 +-
 1 file changed, 72 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 22d251e15b61..a67e71c49aeb 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -375,6 +375,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -925,6 +926,7 @@ static int find_existing_ddw_windows(void)
return 0;
 
find_existing_ddw_windows_named(DIRECT64_PROPNAME);
+   find_existing_ddw_windows_named(DMA64_PROPNAME);
 
return 0;
 }
@@ -1211,14 +1213,17 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct ddw_create_response create;
int page_shift;
u64 win_addr;
+   const char *win_name;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
bool ddw_enabled = false;
struct failed_ddw_pdn *fpdn;
-   bool default_win_removed = false;
+   bool default_win_removed = false, direct_mapping = false;
bool pmem_present;
+   struct pci_dn *pci = PCI_DN(pdn);
+   struct iommu_table *tbl = pci->table_group->tables[0];
 
dn = of_find_node_by_type(NULL, "ibm,pmemory");
pmem_present = dn != NULL;
@@ -1227,6 +1232,7 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
	mutex_lock(&direct_window_init_mutex);
 
	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
+   direct_mapping = (len >= max_ram_len);
ddw_enabled = true;
goto out_unlock;
}
@@ -1307,8 +1313,7 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
  query.page_size);
goto out_failed;
}
-   /* verify the window * number of ptes will map the partition */
-   /* check largest block * page size > max memory hotplug addr */
+
/*
 * The "ibm,pmemory" can appear anywhere in the address space.
 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
@@ -1324,13 +1329,25 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
		dev_info(&dev->dev, "Skipping ibm,pmemory");
}
 
+   /* check if the available block * number of ptes will map everything */
if (query.largest_available_block < (1ULL << (len - page_shift))) {
		dev_dbg(&dev->dev,
			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
1ULL << len,
query.largest_available_block,
1ULL << page_shift);
-   goto out_failed;
+
+   /* DDW + IOMMU on single window may fail if there i

[PATCH v5 09/11] powerpc/pseries/iommu: Find existing DDW with given property name

2021-07-16 Thread Leonardo Bras
At the moment pseries stores information about the created directly mapped
DDW window in the DIRECT64_PROPNAME property.

With the objective of implementing indirect DMA mapping with DDW, it's
necessary to have another property name to make sure kexec'ing into older
kernels does not break, as it would if we reused DIRECT64_PROPNAME.

In order to have this, find_existing_ddw_windows() needs to be able to
look for different property names.

Extract find_existing_ddw_windows() into find_existing_ddw_windows_named()
and call it with the current property name.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 17c6f4706e76..22d251e15b61 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -895,24 +895,21 @@ static struct direct_window *ddw_list_new_entry(struct 
device_node *pdn,
return window;
 }
 
-static int find_existing_ddw_windows(void)
+static void find_existing_ddw_windows_named(const char *name)
 {
int len;
struct device_node *pdn;
struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
-
-   if (!firmware_has_feature(FW_FEATURE_LPAR))
-   return 0;
+   const struct dynamic_dma_window_prop *dma64;
 
-   for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-   direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-   if (!direct64 || len < sizeof(*direct64)) {
-   remove_ddw(pdn, true, DIRECT64_PROPNAME);
+   for_each_node_with_property(pdn, name) {
+   dma64 = of_get_property(pdn, name, &len);
+   if (!dma64 || len < sizeof(*dma64)) {
+   remove_ddw(pdn, true, name);
continue;
}
 
-   window = ddw_list_new_entry(pdn, direct64);
+   window = ddw_list_new_entry(pdn, dma64);
if (!window)
break;
 
@@ -920,6 +917,14 @@ static int find_existing_ddw_windows(void)
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);
}
+}
+
+static int find_existing_ddw_windows(void)
+{
+   if (!firmware_has_feature(FW_FEATURE_LPAR))
+   return 0;
+
+   find_existing_ddw_windows_named(DIRECT64_PROPNAME);
 
return 0;
 }
-- 
2.32.0



[PATCH v5 08/11] powerpc/pseries/iommu: Update remove_dma_window() to accept property name

2021-07-16 Thread Leonardo Bras
Update remove_dma_window() so it can be used to remove DDW with a given
property name.

This enables the creation of new property names for DDW, so we can
have different uses for it, like indirect mapping.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 108c3dcca686..17c6f4706e76 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -830,31 +830,32 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static int remove_ddw(struct device_node *np, bool remove_prop, const char 
*win_name)
 {
struct property *win;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
int ret = 0;
 
+   win = of_find_property(np, win_name, NULL);
+   if (!win)
+   return -EINVAL;
+
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
 &ddw_avail[0], DDW_APPLICABLE_SIZE);
if (ret)
-   return;
-
-   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
-   if (!win)
-   return;
+   return 0;
 
if (win->length >= sizeof(struct dynamic_dma_window_prop))
remove_dma_window(np, ddw_avail, win);
 
if (!remove_prop)
-   return;
+   return 0;
 
ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
+   return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
@@ -907,7 +908,7 @@ static int find_existing_ddw_windows(void)
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
direct64 = of_get_property(pdn, DIRECT64_PROPNAME, );
if (!direct64 || len < sizeof(*direct64)) {
-   remove_ddw(pdn, true);
+   remove_ddw(pdn, true, DIRECT64_PROPNAME);
continue;
}
 
@@ -1382,7 +1383,7 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
kfree(win64);
 
 out_remove_win:
-   remove_ddw(pdn, true);
+   remove_ddw(pdn, true, DIRECT64_PROPNAME);
 
 out_failed:
if (default_win_removed)
@@ -1547,7 +1548,7 @@ static int iommu_reconfig_notifier(struct notifier_block 
*nb, unsigned long acti
 * we have to remove the property when releasing
 * the device node.
 */
-   remove_ddw(np, false);
+   remove_ddw(np, false, DIRECT64_PROPNAME);
if (pci && pci->table_group)
iommu_pseries_free_group(pci->table_group,
np->full_name);
-- 
2.32.0



[PATCH v5 07/11] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper

2021-07-16 Thread Leonardo Bras
Add a new helper iommu_table_setparms_common(), and use it in
iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated
code.

Also, setting tbl->it_ops was happening outside iommu_table_setparms*(),
so move it to the new helper. Since we need the iommu_table_ops to be
declared before being used, declare iommu_table_lpar_multi_ops and
iommu_table_pseries_ops before their respective iommu_table_setparms*().
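
As an illustration of where this helps later in the series (a sketch, not
part of this patch), a table for a freshly created DDW could be filled
with a single call:

	iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn,
				    win_addr, 1UL << len, page_shift,
				    NULL, &iommu_table_lpar_multi_ops);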

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 72 ++
 1 file changed, 38 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 7ca79a04fa52..108c3dcca686 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -501,6 +501,24 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long 
start_pfn,
return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 }
 
+static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long 
busno,
+   unsigned long liobn, unsigned long 
win_addr,
+   unsigned long window_size, unsigned 
long page_shift,
+   void *base, struct iommu_table_ops 
*table_ops)
+{
+   tbl->it_busno = busno;
+   tbl->it_index = liobn;
+   tbl->it_offset = win_addr >> page_shift;
+   tbl->it_size = window_size >> page_shift;
+   tbl->it_page_shift = page_shift;
+   tbl->it_base = (unsigned long)base;
+   tbl->it_blocksize = 16;
+   tbl->it_type = TCE_PCI;
+   tbl->it_ops = table_ops;
+}
+
+struct iommu_table_ops iommu_table_pseries_ops;
+
 static void iommu_table_setparms(struct pci_controller *phb,
 struct device_node *dn,
 struct iommu_table *tbl)
@@ -509,8 +527,13 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
const unsigned long *basep;
const u32 *sizep;
 
-   node = phb->dn;
+   /* Test if we are going over 2GB of DMA space */
+   if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
+   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
+   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+   }
 
+   node = phb->dn;
basep = of_get_property(node, "linux,tce-base", NULL);
sizep = of_get_property(node, "linux,tce-size", NULL);
if (basep == NULL || sizep == NULL) {
@@ -519,33 +542,18 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
return;
}
 
-   tbl->it_base = (unsigned long)__va(*basep);
+   iommu_table_setparms_common(tbl, phb->bus->number, 0, 
phb->dma_window_base_cur,
+   phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
+   __va(*basep), &iommu_table_pseries_ops);
 
if (!is_kdump_kernel())
memset((void *)tbl->it_base, 0, *sizep);
 
-   tbl->it_busno = phb->bus->number;
-   tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-
-   /* Units of tce entries */
-   tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
-
-   /* Test if we are going over 2GB of DMA space */
-   if (phb->dma_window_base_cur + phb->dma_window_size > 0x8000ul) {
-   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
-   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
-   }
-
phb->dma_window_base_cur += phb->dma_window_size;
-
-   /* Set the tce table size - measured in entries */
-   tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
-
-   tbl->it_index = 0;
-   tbl->it_blocksize = 16;
-   tbl->it_type = TCE_PCI;
 }
 
+struct iommu_table_ops iommu_table_lpar_multi_ops;
+
 /*
  * iommu_table_setparms_lpar
  *
@@ -557,17 +565,13 @@ static void iommu_table_setparms_lpar(struct 
pci_controller *phb,
  struct iommu_table_group *table_group,
  const __be32 *dma_window)
 {
-   unsigned long offset, size;
+   unsigned long offset, size, liobn;
 
-   of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
+   of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
+
+   iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, 
IOMMU_PAGE_SHIFT_4K, NULL,
+   &iommu_table_lpar_multi_ops);
 
-   tbl->it_busno = phb->bus->number;
-   tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-   tbl->it_base   = 0;
-   tbl->it_blocksize  = 16;
-   tbl->it_type = 

[PATCH v5 06/11] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2021-07-16 Thread Leonardo Bras
The code used to create a DDW property, which was previously scattered in
enable_ddw(), is now gathered in ddw_property_create(). It deals with
allocating and filling the property, leaving it ready for
of_add_property(), which now occurs right in sequence.

This created an opportunity to reorganize the second part of enable_ddw():

Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().

With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().

This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
over all memory, which looks like the most expensive operation, only if
everything else succeeds.
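
As a rough sketch (names taken from the description above, error checks
and labels omitted), the tail of enable_ddw() then reads:

	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
	...
	win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn,
				    win_addr, page_shift, len);
	ret = of_add_property(pdn, win64);
	...
	window = ddw_list_new_entry(pdn, win64->value);
	...
	/* the expensive walk over all memory happens only past this point */
	ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
				    win64->value, tce_setrange_multi_pSeriesLP_walk);
	...
	list_add(&window->list, &direct_window_list);
	/* anything failing after of_add_property() goes through
	 * of_remove_property(pdn, win64) on the way out */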

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 93 --
 1 file changed, 57 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index b34b473bbdc1..7ca79a04fa52 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1153,6 +1153,35 @@ static int iommu_get_page_shift(u32 query_page_size)
return 0;
 }
 
+static struct property *ddw_property_create(const char *propname, u32 liobn, 
u64 dma_addr,
+   u32 page_shift, u32 window_shift)
+{
+   struct dynamic_dma_window_prop *ddwprop;
+   struct property *win64;
+
+   win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
+   if (!win64)
+   return NULL;
+
+   win64->name = kstrdup(propname, GFP_KERNEL);
+   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
+   win64->value = ddwprop;
+   win64->length = sizeof(*ddwprop);
+   if (!win64->name || !win64->value) {
+   kfree(win64->name);
+   kfree(win64->value);
+   kfree(win64);
+   return NULL;
+   }
+
+   ddwprop->liobn = cpu_to_be32(liobn);
+   ddwprop->dma_base = cpu_to_be64(dma_addr);
+   ddwprop->tce_shift = cpu_to_be32(page_shift);
+   ddwprop->window_shift = cpu_to_be32(window_shift);
+
+   return win64;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1171,12 +1200,12 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
+   u64 win_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
bool ddw_enabled = false;
-   struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
bool pmem_present;
@@ -1293,72 +1322,64 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
1ULL << page_shift);
goto out_failed;
}
-   win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
-   if (!win64) {
-   dev_info(&dev->dev,
-   "couldn't allocate property for 64bit dma window\n");
-   goto out_failed;
-   }
-   win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
-   win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
-   win64->length = sizeof(*ddwprop);
-   if (!win64->name || !win64->value) {
-   dev_info(&dev->dev,
-   "couldn't allocate property name and value\n");
-   goto out_free_prop;
-   }
 
ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
if (ret != 0)
-   goto out_free_prop;
-
-   ddwprop->liobn = cpu_to_be32(create.liobn);
-   ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
-   create.addr_lo);
-   ddwprop->tce_shift = cpu_to_be32(page_shift);
-   ddwprop->window_shift = cpu_to_be32(len);
+   goto out_failed;
 
dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
  create.liobn, dn);
 
-   window = ddw_list_new_entry(pdn, ddwprop);
+   win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
+   win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
+   page_shift, len);
+   if (!win64) {
+   dev_info(&dev->dev,
+"couldn't allocate property, property

[PATCH v5 05/11] powerpc/pseries/iommu: Allow DDW windows starting at 0x00

2021-07-16 Thread Leonardo Bras
enable_ddw() currently returns the address of the DMA window, which is
considered invalid if it has the value 0x00.

Also, it only considers an address returned from find_existing_ddw valid
if it's not 0x00.

Changing this behavior makes sense, given the users of enable_ddw() only
need to know if direct mapping is possible. It can also allow a DMA window
starting at 0x00 to be used.

This will be helpful for using a DDW with indirect mapping, as the window
address will be different than 0x00, but it will not map the whole
partition.
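
For reference, a rough sketch of how the caller side ends up looking once
only the boolean matters (the full hunk is truncated below):

	/* iommu_bypass_supported_pSeriesLP() */
	if (pdn && PCI_DN(pdn))
		return enable_ddw(pdev, pdn);

since enable_ddw() now stores the offset in dev->dev.archdata.dma_offset
itself and only reports whether direct mapping is possible.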

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 36 +-
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 712d1667144a..b34b473bbdc1 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -853,25 +853,26 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
 {
struct direct_window *window;
const struct dynamic_dma_window_prop *direct64;
-   u64 dma_addr = 0;
+   bool found = false;
 
spin_lock(&direct_window_list_lock);
/* check if we already created a window and dupe that config if so */
list_for_each_entry(window, &direct_window_list, list) {
if (window->device == pdn) {
direct64 = window->prop;
-   dma_addr = be64_to_cpu(direct64->dma_base);
+   *dma_addr = be64_to_cpu(direct64->dma_base);
*window_shift = be32_to_cpu(direct64->window_shift);
+   found = true;
break;
}
}
spin_unlock(&direct_window_list_lock);
 
-   return dma_addr;
+   return found;
 }
 
 static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
@@ -1161,20 +1162,20 @@ static int iommu_get_page_shift(u32 query_page_size)
  * pdn: the parent pe node with the ibm,dma_window property
  * Future: also check if we can remap the base window for our base page size
  *
- * returns the dma offset for use by the direct mapped DMA code.
+ * returns true if can map all pages (direct mapping), false otherwise..
  */
-static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
int len = 0, ret;
int max_ram_len = order_base_2(ddw_memory_hotplug_max());
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
-   u64 dma_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
+   bool ddw_enabled = false;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
@@ -1186,9 +1187,10 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 
mutex_lock(&direct_window_init_mutex);
 
-   dma_addr = find_existing_ddw(pdn, &len);
-   if (dma_addr != 0)
+   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
+   ddw_enabled = true;
goto out_unlock;
+   }
 
/*
 * If we already went through this for a previous function of
@@ -1342,7 +1344,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
 
-   dma_addr = be64_to_cpu(ddwprop->dma_base);
+   dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base);
+   ddw_enabled = true;
goto out_unlock;
 
 out_free_window:
@@ -1374,10 +1377,10 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 * as RAM, then we failed to create a window to cover persistent
 * memory and need to set the DMA limit.
 */
-   if (pmem_present && dma_addr && (len == max_ram_len))
-   dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
+   if (pmem_present && ddw_enabled && (len == max_ram_len))
+   dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL 
<< len);
 
-   return dma_addr;
+   return ddw_enabled;
 }
 
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
@@ -1456,11 +1459,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
pci_dev *pdev, u64 dma_mask)
break;
}
 
-   if (pdn && PCI_DN(pdn)) {
-   pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
-   if (pdev->dev.

[PATCH v5 04/11] powerpc/pseries/iommu: Add ddw_list_new_entry() helper

2021-07-16 Thread Leonardo Bras
There are two functions creating direct_window_list entries in a
similar way, so create a ddw_list_new_entry() helper to avoid duplication
and simplify those functions.
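
Both call sites then collapse to the same pattern (sketch):

	window = ddw_list_new_entry(pdn, dma64prop);
	if (!window)
		goto out;	/* or break, in find_existing_ddw_windows() */
	spin_lock(&direct_window_list_lock);
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);

where dma64prop stands for the dynamic_dma_window_prop the caller already
holds (direct64 or ddwprop, respectively).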

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 32 +-
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 33d82865d6e6..712d1667144a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -874,6 +874,21 @@ static u64 find_existing_ddw(struct device_node *pdn, int 
*window_shift)
return dma_addr;
 }
 
+static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
+   const struct 
dynamic_dma_window_prop *dma64)
+{
+   struct direct_window *window;
+
+   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   if (!window)
+   return NULL;
+
+   window->device = pdn;
+   window->prop = dma64;
+
+   return window;
+}
+
 static int find_existing_ddw_windows(void)
 {
int len;
@@ -886,18 +901,15 @@ static int find_existing_ddw_windows(void)
 
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-   if (!direct64)
-   continue;
-
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
-   if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
-   kfree(window);
+   if (!direct64 || len < sizeof(*direct64)) {
remove_ddw(pdn, true);
continue;
}
 
-   window->device = pdn;
-   window->prop = direct64;
+   window = ddw_list_new_entry(pdn, direct64);
+   if (!window)
+   break;
+
spin_lock(&direct_window_list_lock);
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
@@ -1307,7 +1319,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
  create.liobn, dn);
 
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   window = ddw_list_new_entry(pdn, ddwprop);
if (!window)
goto out_clear_window;
 
@@ -1326,8 +1338,6 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_free_window;
}
 
-   window->device = pdn;
-   window->prop = ddwprop;
spin_lock(&direct_window_list_lock);
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
-- 
2.32.0



[PATCH v5 03/11] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper

2021-07-16 Thread Leonardo Bras
Create a helper that allows allocating a new iommu_table without the need
to reallocate the iommu_group.

This will be helpful for replacing the iommu_table for the new DMA window,
after we remove the old one with iommu_tce_table_put().
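
A sketch of the intended later use (an assumption based on this series'
description, not part of this patch):

	struct iommu_table *newtbl;

	newtbl = iommu_pseries_alloc_table(pci->phb->node);
	if (!newtbl)
		goto out_failed;
	/* filled via the setparms helper and published as a second table,
	 * e.g. pci->table_group->tables[1], keeping the existing group */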

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index b1b8d12bab39..33d82865d6e6 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,28 +53,31 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
-static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+static struct iommu_table *iommu_pseries_alloc_table(int node)
 {
-   struct iommu_table_group *table_group;
struct iommu_table *tbl;
 
-   table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
-  node);
-   if (!table_group)
-   return NULL;
-
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
if (!tbl)
-   goto free_group;
+   return NULL;
 
INIT_LIST_HEAD_RCU(&tbl->it_group_list);
kref_init(&tbl->it_kref);
+   return tbl;
+}
 
-   table_group->tables[0] = tbl;
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+   struct iommu_table_group *table_group;
+
+   table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
+   if (!table_group)
+   return NULL;
 
-   return table_group;
+   table_group->tables[0] = iommu_pseries_alloc_table(node);
+   if (table_group->tables[0])
+   return table_group;
 
-free_group:
kfree(table_group);
return NULL;
 }
-- 
2.32.0



[PATCH v5 02/11] powerpc/kernel/iommu: Add new iommu_table_in_use() helper

2021-07-16 Thread Leonardo Bras
Having a function to check if the iommu table has any allocation helps
decide whether a tbl can be reset to use a new DMA window.

It should be enough to replace all instances of !bitmap_empty(tbl...).

iommu_table_in_use() skips reserved memory, so we don't need to worry about
releasing it before testing. This causes iommu_table_release_pages() to
become unnecessary, given it is only used to remove reserved memory for
testing.

Also, only allow storing reserved memory values in tbl if they are valid
in the table, so there is no need to check it in the new helper.
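
The replacement pattern, taking iommu_table_free() as an example (the same
hunk appears in the diff below):

	/* before: reserved pages had to be released first */
	if (!bitmap_empty(tbl->it_map, tbl->it_size))
		pr_warn("%s: Unexpected TCEs\n", __func__);

	/* after: the helper skips the reserved range by itself */
	if (iommu_table_in_use(tbl))
		pr_warn("%s: Unexpected TCEs\n", __func__);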

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 65 
 2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index deef7c94d7b6..bf3b84128525 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2af89a5e379f..b10bf58ae467 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -690,32 +690,24 @@ static void iommu_table_reserve_pages(struct iommu_table 
*tbl,
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
 
-   tbl->it_reserved_start = res_start;
-   tbl->it_reserved_end = res_end;
-
-   /* Check if res_start..res_end isn't empty and overlaps the table */
-   if (res_start && res_end &&
-   (tbl->it_offset + tbl->it_size < res_start ||
-res_end < tbl->it_offset))
-   return;
+   if (res_start < tbl->it_offset)
+   res_start = tbl->it_offset;
 
-   for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   set_bit(i - tbl->it_offset, tbl->it_map);
-}
+   if (res_end > (tbl->it_offset + tbl->it_size))
+   res_end = tbl->it_offset + tbl->it_size;
 
-static void iommu_table_release_pages(struct iommu_table *tbl)
-{
-   int i;
+   /* Check if res_start..res_end is a valid range in the table */
+   if (res_start >= res_end) {
+   tbl->it_reserved_start = tbl->it_offset;
+   tbl->it_reserved_end = tbl->it_offset;
+   return;
+   }
 
-   /*
-* In case we have reserved the first bit, we should not emit
-* the warning below.
-*/
-   if (tbl->it_offset == 0)
-   clear_bit(0, tbl->it_map);
+   tbl->it_reserved_start = res_start;
+   tbl->it_reserved_end = res_end;
 
for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   clear_bit(i - tbl->it_offset, tbl->it_map);
+   set_bit(i - tbl->it_offset, tbl->it_map);
 }
 
 /*
@@ -779,6 +771,22 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid,
return tbl;
 }
 
+bool iommu_table_in_use(struct iommu_table *tbl)
+{
+   unsigned long start = 0, end;
+
+   /* ignore reserved bit0 */
+   if (tbl->it_offset == 0)
+   start = 1;
+   end = tbl->it_reserved_start - tbl->it_offset;
+   if (find_next_bit(tbl->it_map, end, start) != end)
+   return true;
+
+   start = tbl->it_reserved_end - tbl->it_offset;
+   end = tbl->it_size;
+   return find_next_bit(tbl->it_map, end, start) != end;
+}
+
 static void iommu_table_free(struct kref *kref)
 {
struct iommu_table *tbl;
@@ -795,10 +803,8 @@ static void iommu_table_free(struct kref *kref)
 
iommu_debugfs_del(tbl);
 
-   iommu_table_release_pages(tbl);
-
/* verify that table contains no entries */
-   if (!bitmap_empty(tbl->it_map, tbl->it_size))
+   if (iommu_table_in_use(tbl))
pr_warn("%s: Unexpected TCEs\n", __func__);
 
/* free bitmap */
@@ -1099,18 +1105,13 @@ int iommu_take_ownership(struct iommu_table *tbl)
for (i = 0; i < tbl->nr_pools; i++)
spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
 
-   iommu_table_release_pages(tbl);
-
-   if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+   if (iommu_table_in_use(tbl)) {
pr_err("iommu_tce: it_map is not empty");
ret = -EBUSY;
-   /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
-   iommu_table_reserve_pages(tbl, tbl->it_reserved_start

[PATCH v5 01/11] powerpc/pseries/iommu: Replace hard-coded page shift

2021-07-16 Thread Leonardo Bras
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.

In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.

IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5, shows
an RPN of 52 bits and considers a 12-bit pageshift, so there should be
no need to use TCE_RPN_MASK, which masks out any bit after 40 in the rpn.
Its usage is removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().

Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since its callers do not always
have a tbl struct, so adding a tceshift parameter seems the right thing to do.
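
A quick worked example (illustrative only): with 64K IOMMU pages,
tbl->it_page_shift == 16, so tce_build_pSeries() effectively does

	rpn = __pa(uaddr) >> 16;			/* instead of >> TCE_SHIFT (12) */
	*tcep = cpu_to_be64(proto_tce | rpn << 16);	/* no TCE_RPN_MASK needed */
	uaddr += IOMMU_PAGE_SIZE(tbl);			/* 0x10000 instead of TCE_PAGE_SIZE */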

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/tce.h |  8 --
 arch/powerpc/platforms/pseries/iommu.c | 39 +++---
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
index db5fc2f2262d..0c34d2756d92 100644
--- a/arch/powerpc/include/asm/tce.h
+++ b/arch/powerpc/include/asm/tce.h
@@ -19,15 +19,7 @@
 #define TCE_VB 0
 #define TCE_PCI1
 
-/* TCE page size is 4096 bytes (1 << 12) */
-
-#define TCE_SHIFT  12
-#define TCE_PAGE_SIZE  (1 << TCE_SHIFT)
-
 #define TCE_ENTRY_SIZE 8   /* each TCE is 64 bits */
-
-#define TCE_RPN_MASK   0xfful  /* 40-bit RPN (4K pages) */
-#define TCE_RPN_SHIFT  12
 #define TCE_VALID  0x800   /* TCE valid */
 #define TCE_ALLIO  0x400   /* TCE valid for all lpars */
 #define TCE_PCI_WRITE  0x2 /* write from PCI allowed */
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 0c55b991f665..b1b8d12bab39 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -107,6 +107,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long 
index,
u64 proto_tce;
__be64 *tcep;
u64 rpn;
+   const unsigned long tceshift = tbl->it_page_shift;
+   const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
 
proto_tce = TCE_PCI_READ; // Read allowed
 
@@ -117,10 +119,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, 
long index,
 
while (npages--) {
/* can't move this out since we might cross MEMBLOCK boundary */
-   rpn = __pa(uaddr) >> TCE_SHIFT;
-   *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << 
TCE_RPN_SHIFT);
+   rpn = __pa(uaddr) >> tceshift;
+   *tcep = cpu_to_be64(proto_tce | rpn << tceshift);
 
-   uaddr += TCE_PAGE_SIZE;
+   uaddr += pagesize;
tcep++;
}
return 0;
@@ -146,7 +148,7 @@ static unsigned long tce_get_pseries(struct iommu_table 
*tbl, long index)
return be64_to_cpu(*tcep);
 }
 
-static void tce_free_pSeriesLP(unsigned long liobn, long, long);
+static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
 static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
 
 static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
@@ -166,12 +168,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long 
tcenum, long tceshift,
proto_tce |= TCE_PCI_WRITE;
 
while (npages--) {
-   tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
+   tce = proto_tce | rpn << tceshift;
rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
 
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
-   tce_free_pSeriesLP(liobn, tcenum_start,
+   tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
   (npages_start - (npages + 1)));
break;
}
@@ -205,10 +207,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
long tcenum_start = tcenum, npages_start = npages;
int ret = 0;
unsigned long flags;
+   const unsigned long tceshift = tbl->it_page_shift;
 
if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
return tce_build_pSeriesLP(tbl->it_index, tcenum,
-  tbl->it_page_shift, npages, uaddr,
+  tceshift, npages, uaddr,
   direction, attrs);
}
 
@@ -225,13 +228,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
if (!tcep) {
local_irq_restore(fla

[PATCH v5 00/11] DDW + Indirect Mapping

2021-07-16 Thread Leonardo Bras
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

Using the DDW instead of the default DMA window may allow expanding the
amount of memory that can be DMA-mapped, given the number of pages (TCEs)
may stay the same (or increase) and the default DMA window offers only
4k pages while DDW may offer larger pages (4k, 64k, 16M ...).

Patch #1 replaces hard-coded 4K page size with a variable containing the
correct page size for the window.

Patch #2 introduces iommu_table_in_use(), and replaces manual bit-field
checking where it's used. It will be used for aborting enable_ddw() if
there is any current iommu allocation and we are trying single-window
indirect mapping.

Patch #3 introduces iommu_pseries_alloc_table() that will be helpful
when indirect mapping needs to replace the iommu_table.

Patch #4 adds helpers for adding DDWs in the list.

Patch #5 refactors enable_ddw() so it returns if direct mapping is
possible, instead of DMA offset. It helps for next patches on
indirect DMA mapping and also allows DMA windows starting at 0x00.

Patch #6 brings a new helper to simplify enable_ddw(), allowing
some reorganization for introducing the indirect-mapping DDW.

Patch #7 adds new helper _iommu_table_setparms() and use it in other
*setparams*() to fill iommu_table. It will also be used for creating a
new iommu_table for indirect mapping.

Patch #8 updates remove_dma_window() to accept different property names,
so we can introduce a new property for indirect mapping.

Patch #9 extracts find_existing_ddw_windows() into
find_existing_ddw_windows_named(), and calls it with its property name.
This will be useful when the property for indirect mapping is created,
so we can search the device-tree for both properties.

Patch #10:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance. Also, update the iommu_table and re-generate the pools.
It introduces a new property name for DDW with indirect DMA mapping.

Patch #11:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.

All patches were tested on an LPAR with a virtio-net interface that
allows the default DMA window and DDW to coexist.

Changes since v4:
- Solve conflicts with new upstream versions
- Avoid unnecessary code moving by doing variable declaration before definition
- Rename _iommu_table_setparms to iommu_table_setparms_common and change base
  parameter from unsigned long to void* in order to avoid unnecessary casting.
- Fix breaking case for existing direct-mapping.
- Fix IORESOURCE_MEM bound issue
- Move new tbl to pci->table_group->tables[1] instead of replacing [0]
v4 Link: 
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=241597=%2A=both

Changes since v3:
- Fixed inverted free order at ddw_property_create()
- Updated goto tag naming
v3 Link: 
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=240287=%2A=both

Changes since v2:
- Some patches got removed from the series and sent by themselves,
- New tbl created for DDW + indirect mapping reserves MMIO32 space,
- Improved reserved area algorithm,
- Improved commit messages,
- Removed define for default DMA window prop name,
- Avoided some unnecessary renaming,
- Removed some unnecessary empty lines,
- Changed some code moving to forward declarations.
v2 Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=201210=%2A=both


Leonardo Bras (11):
  powerpc/pseries/iommu: Replace hard-coded page shift
  powerpc/kernel/iommu: Add new iommu_table_in_use() helper
  powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
  powerpc/pseries/iommu: Add ddw_list_new_entry() helper
  powerpc/pseries/iommu: Allow DDW windows starting at 0x00
  powerpc/pseries/iommu: Add ddw_property_create() and refactor
enable_ddw()
  powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new
helper
  powerpc/pseries/iommu: Update remove_dma_window() to accept property
name
  powerpc/pseries/iommu: Find existing DDW with given property name
  powerpc/pseries/iommu: Make use of DDW for indirect mapping
  powerpc/pseries/iommu: Rename "direct window" to "dma window"

 arch/powerpc/include/asm/iommu.h   |   1 +
 arch/powerpc/include/asm/tce.h |   8 -
 arch/powerpc/kernel/iommu.c|  65 ++--
 arch/powerpc/platforms/pseries/iommu.c | 481 +++--
 4 files changed, 330 insertions(+), 225 deletions(-)

-- 
2.32.0



[PATCH v4 11/11] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2021-04-30 Thread Leonardo Bras
A previous change introduced the usage of DDW as a bigger indirect DMA
mapping when the DDW available size does not map the whole partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.

This should cause no behavioural change, just adjust naming.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 93 +-
 1 file changed, 48 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 572879af0211..ce7b841fb10f 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -355,7 +355,7 @@ struct dynamic_dma_window_prop {
__be32  window_shift;   /* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -375,11 +375,11 @@ struct ddw_create_response {
u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
@@ -712,7 +712,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus 
*bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 dn);
 
-   /* Find nearest ibm,dma-window, walking up the device tree */
+   /*
+* Find nearest ibm,dma-window (default DMA window), walking up the
+* device tree
+*/
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
if (dma_window != NULL)
@@ -816,11 +819,11 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
 
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
-   pr_warn("%pOF: failed to remove direct window: rtas returned "
+   pr_warn("%pOF: failed to remove DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
-   pr_debug("%pOF: successfully removed direct window: rtas 
returned "
+   pr_debug("%pOF: successfully removed DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -848,37 +851,37 @@ static int remove_ddw(struct device_node *np, bool 
remove_prop, const char *win_
 
ret = of_remove_property(np, win);
if (ret)
-   pr_warn("%pOF: failed to remove direct window property: %d\n",
+   pr_warn("%pOF: failed to remove DMA window property: %d\n",
np, ret);
return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
 {
-   struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
+   struct dma_win *window;
+   const struct dynamic_dma_window_prop *dma64;
bool found = false;
 
-   spin_lock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
-   list_for_each_entry(window, &direct_window_list, list) {
+   list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
-   direct64 = window->prop;
-   *dma_addr = be64_to_cpu(direct64->dma_base);
-   *window_shift = be32_to_cpu(direct64->window_shift);
+   dma64 = window->prop;
+   *dma_addr = be64_to_cpu(dma64->dma_base);
+   *window_shift = be32_to_cpu(dma64->window_shift);
found = true;
break;
}
}
-   spin_unlock(&direct_window_list_lock);
+   spin_unlock(&dma_win_list_lock);
 
return found;
 }
 
-static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
-   const struct 
dynamic_dma_window_prop *dma64)
+static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
+

[PATCH v4 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-04-30 Thread Leonardo Bras
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

By using DDW, indirect mapping can get more TCEs than are available for the
default DMA window, and also get access to much larger pagesizes
(16MB as implemented in qemu vs 4k from the default DMA window), causing a
significant increase in the maximum amount of memory that can be IOMMU
mapped at the same time.

Indirect mapping will only be used if direct mapping is not a
possibility.

For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.

Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.

Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.
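
For clarity, the two property names involved are (both defines appear in
the diff below):

	#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"	/* direct mapping */
	#define DMA64_PROPNAME "linux,dma64-ddr-window-info"		/* indirect mapping */

A kexec'ed kernel that only knows DIRECT64_PROPNAME will not match the new
property, so it cannot wrongly assume direct mapping, which is the breakage
described above.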

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 87 +-
 1 file changed, 72 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index de54ddd9decd..572879af0211 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,7 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
+static phys_addr_t ddw_memory_hotplug_max(void);
 #ifdef CONFIG_IOMMU_API
 static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned 
long *tce,
enum dma_data_direction *direction, bool 
realmode);
@@ -380,6 +381,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -918,6 +920,7 @@ static int find_existing_ddw_windows(void)
return 0;
 
find_existing_ddw_windows_named(DIRECT64_PROPNAME);
+   find_existing_ddw_windows_named(DMA64_PROPNAME);
 
return 0;
 }
@@ -1207,10 +1210,13 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
+   const char *win_name;
struct property *win64 = NULL;
struct failed_ddw_pdn *fpdn;
-   bool default_win_removed = false;
+   bool default_win_removed = false, direct_mapping = false;
bool pmem_present;
+   struct pci_dn *pci = PCI_DN(pdn);
+   struct iommu_table *tbl = pci->table_group->tables[0];
 
dn = of_find_node_by_type(NULL, "ibm,pmemory");
pmem_present = dn != NULL;
@@ -1218,8 +1224,12 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 
mutex_lock(&direct_window_init_mutex);
 
-   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len))
-   goto out_unlock;
+   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
+   direct_mapping = (len >= max_ram_len);
+
+   mutex_unlock(&direct_window_init_mutex);
+   return direct_mapping;
+   }
 
/*
 * If we already went through this for a previous function of
@@ -1298,7 +1308,6 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
/* verify the window * number of ptes will map the partition */
-   /* check largest block * page size > max memory hotplug addr */
/*
 * The "ibm,pmemory" can appear anywhere in the address space.
 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
@@ -1320,6 +1329,17 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
1ULL << len,
query.largest_available_block,
1ULL << page_shift);
+
+   len = order_base_2(query.largest_available_block << page_shift);
+   win_name = 

[PATCH v4 09/11] powerpc/pseries/iommu: Find existing DDW with given property name

2021-04-30 Thread Leonardo Bras
At the moment pseries stores information about the created directly-mapped
DDW window in the DIRECT64_PROPNAME property.

With the objective of implementing indirect DMA mapping with DDW, it's
necessary to have another property name to make sure kexec'ing into older
kernels does not break, as it would if we reused DIRECT64_PROPNAME.

In order to have this, find_existing_ddw_windows() needs to be able to
look for different property names.

Extract the body of find_existing_ddw_windows() into
find_existing_ddw_windows_named() and call it with the current property name.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index f8922fcf34b6..de54ddd9decd 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -888,24 +888,21 @@ static struct direct_window *ddw_list_new_entry(struct 
device_node *pdn,
return window;
 }
 
-static int find_existing_ddw_windows(void)
+static void find_existing_ddw_windows_named(const char *name)
 {
int len;
struct device_node *pdn;
struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
-
-   if (!firmware_has_feature(FW_FEATURE_LPAR))
-   return 0;
+   const struct dynamic_dma_window_prop *dma64;
 
-   for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-   direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-   if (!direct64 || len < sizeof(*direct64)) {
-   remove_ddw(pdn, true, DIRECT64_PROPNAME);
+   for_each_node_with_property(pdn, name) {
+   dma64 = of_get_property(pdn, name, &len);
+   if (!dma64 || len < sizeof(*dma64)) {
+   remove_ddw(pdn, true, name);
continue;
}
 
-   window = ddw_list_new_entry(pdn, direct64);
+   window = ddw_list_new_entry(pdn, dma64);
if (!window)
break;
 
@@ -913,6 +910,14 @@ static int find_existing_ddw_windows(void)
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
}
+}
+
+static int find_existing_ddw_windows(void)
+{
+   if (!firmware_has_feature(FW_FEATURE_LPAR))
+   return 0;
+
+   find_existing_ddw_windows_named(DIRECT64_PROPNAME);
 
return 0;
 }
-- 
2.30.2



[PATCH v4 08/11] powerpc/pseries/iommu: Update remove_dma_window() to accept property name

2021-04-30 Thread Leonardo Bras
Update remove_dma_window() so it can be used to remove DDW with a given
property name.

This enables the creation of new property names for DDW, so we can
have different usages for it, such as indirect mapping.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 89cb6e9e9f31..f8922fcf34b6 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -823,31 +823,32 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static int remove_ddw(struct device_node *np, bool remove_prop, const char 
*win_name)
 {
struct property *win;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
int ret = 0;
 
+   win = of_find_property(np, win_name, NULL);
+   if (!win)
+   return -EINVAL;
+
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
 &ddw_avail[0], DDW_APPLICABLE_SIZE);
if (ret)
-   return;
-
-   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
-   if (!win)
-   return;
+   return 0;
 
if (win->length >= sizeof(struct dynamic_dma_window_prop))
remove_dma_window(np, ddw_avail, win);
 
if (!remove_prop)
-   return;
+   return 0;
 
ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
+   return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
@@ -900,7 +901,7 @@ static int find_existing_ddw_windows(void)
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
direct64 = of_get_property(pdn, DIRECT64_PROPNAME, );
if (!direct64 || len < sizeof(*direct64)) {
-   remove_ddw(pdn, true);
+   remove_ddw(pdn, true, DIRECT64_PROPNAME);
continue;
}
 
@@ -1372,7 +1373,7 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
win64 = NULL;
 
 out_remove_win:
-   remove_ddw(pdn, true);
+   remove_ddw(pdn, true, DIRECT64_PROPNAME);
 
 out_failed:
if (default_win_removed)
@@ -1536,7 +1537,7 @@ static int iommu_reconfig_notifier(struct notifier_block 
*nb, unsigned long acti
 * we have to remove the property when releasing
 * the device node.
 */
-   remove_ddw(np, false);
+   remove_ddw(np, false, DIRECT64_PROPNAME);
if (pci && pci->table_group)
iommu_pseries_free_group(pci->table_group,
np->full_name);
-- 
2.30.2



[PATCH v4 07/11] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper

2021-04-30 Thread Leonardo Bras
Add a new helper _iommu_table_setparms(), and use it in
iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated
code.

Also, setting tbl->it_ops was happening outside iommu_table_setparms*(),
so move it to the new helper. Since we need the iommu_table_ops to be
declared before being used, move iommu_table_lpar_multi_ops and
iommu_table_pseries_ops to before their respective iommu_table_setparms*().

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 100 -
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 5a70ecd579b8..89cb6e9e9f31 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,11 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
+#ifdef CONFIG_IOMMU_API
+static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned 
long *tce,
+   enum dma_data_direction *direction, bool 
realmode);
+#endif
+
 static struct iommu_table *iommu_pseries_alloc_table(int node)
 {
struct iommu_table *tbl;
@@ -501,6 +506,28 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long 
start_pfn,
return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 }
 
+static inline void _iommu_table_setparms(struct iommu_table *tbl, unsigned 
long busno,
+unsigned long liobn, unsigned long 
win_addr,
+unsigned long window_size, unsigned 
long page_shift,
+unsigned long base, struct 
iommu_table_ops *table_ops)
+{
+   tbl->it_busno = busno;
+   tbl->it_index = liobn;
+   tbl->it_offset = win_addr >> page_shift;
+   tbl->it_size = window_size >> page_shift;
+   tbl->it_page_shift = page_shift;
+   tbl->it_base = base;
+   tbl->it_blocksize = 16;
+   tbl->it_type = TCE_PCI;
+   tbl->it_ops = table_ops;
+}
+
+struct iommu_table_ops iommu_table_pseries_ops = {
+   .set = tce_build_pSeries,
+   .clear = tce_free_pSeries,
+   .get = tce_get_pseries
+};
+
 static void iommu_table_setparms(struct pci_controller *phb,
 struct device_node *dn,
 struct iommu_table *tbl)
@@ -509,8 +536,13 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
const unsigned long *basep;
const u32 *sizep;
 
-   node = phb->dn;
+   /* Test if we are going over 2GB of DMA space */
+   if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
+   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
+   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+   }
 
+   node = phb->dn;
basep = of_get_property(node, "linux,tce-base", NULL);
sizep = of_get_property(node, "linux,tce-size", NULL);
if (basep == NULL || sizep == NULL) {
@@ -519,33 +551,25 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
return;
}
 
-   tbl->it_base = (unsigned long)__va(*basep);
+   _iommu_table_setparms(tbl, phb->bus->number, 0, 
phb->dma_window_base_cur,
+ phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
+ (unsigned long)__va(*basep), 
&iommu_table_pseries_ops);
 
if (!is_kdump_kernel())
memset((void *)tbl->it_base, 0, *sizep);
 
-   tbl->it_busno = phb->bus->number;
-   tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-
-   /* Units of tce entries */
-   tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
-
-   /* Test if we are going over 2GB of DMA space */
-   if (phb->dma_window_base_cur + phb->dma_window_size > 0x8000ul) {
-   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
-   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
-   }
-
phb->dma_window_base_cur += phb->dma_window_size;
-
-   /* Set the tce table size - measured in entries */
-   tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
-
-   tbl->it_index = 0;
-   tbl->it_blocksize = 16;
-   tbl->it_type = TCE_PCI;
 }
 
+struct iommu_table_ops iommu_table_lpar_multi_ops = {
+   .set = tce_buildmulti_pSeriesLP,
+#ifdef CONFIG_IOMMU_API
+   .xchg_no_kill = tce_exchange_pseries,
+#endif
+   .clear = tce_freemulti_pSeriesLP,
+   .get = tce_get_pSeriesLP
+};
+
 /*
  * iommu_table_setparms_lpar
  *
@@ -557,28 +581,17 @@ static void iommu_table_setparms_lpar(struct 
pci_controller *phb,
  struct iommu_tab

[PATCH v4 06/11] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2021-04-30 Thread Leonardo Bras
The code used to create a DDW property, which was previously scattered in
enable_ddw(), is now gathered in ddw_property_create(). It deals with
allocating and filling the property, leaving it ready for
of_add_property(), which now occurs right in sequence.

This created an opportunity to reorganize the second part of enable_ddw():

Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().

With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().

This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
over all memory, which looks like the most expensive operation, only if
everything else succeeds.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 93 --
 1 file changed, 57 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 955cf095416c..5a70ecd579b8 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1122,6 +1122,35 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+static struct property *ddw_property_create(const char *propname, u32 liobn, 
u64 dma_addr,
+   u32 page_shift, u32 window_shift)
+{
+   struct dynamic_dma_window_prop *ddwprop;
+   struct property *win64;
+
+   win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
+   if (!win64)
+   return NULL;
+
+   win64->name = kstrdup(propname, GFP_KERNEL);
+   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
+   win64->value = ddwprop;
+   win64->length = sizeof(*ddwprop);
+   if (!win64->name || !win64->value) {
+   kfree(win64->name);
+   kfree(win64->value);
+   kfree(win64);
+   return NULL;
+   }
+
+   ddwprop->liobn = cpu_to_be32(liobn);
+   ddwprop->dma_base = cpu_to_be64(dma_addr);
+   ddwprop->tce_shift = cpu_to_be32(page_shift);
+   ddwprop->window_shift = cpu_to_be32(window_shift);
+
+   return win64;
+}
+
 /* Return largest page shift based on "IO Page Sizes" output of 
ibm,query-pe-dma-window. */
 static int iommu_get_page_shift(u32 query_page_size)
 {
@@ -1167,11 +1196,11 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
+   u64 win_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64 = NULL;
-   struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
bool pmem_present;
@@ -1286,65 +1315,54 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
1ULL << page_shift);
goto out_failed;
}
-   win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
-   if (!win64) {
-   dev_info(&dev->dev,
-   "couldn't allocate property for 64bit dma window\n");
-   goto out_failed;
-   }
-   win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
-   win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
-   win64->length = sizeof(*ddwprop);
-   if (!win64->name || !win64->value) {
-   dev_info(&dev->dev,
-   "couldn't allocate property name and value\n");
-   goto out_free_prop;
-   }
 
ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
if (ret != 0)
-   goto out_free_prop;
-
-   ddwprop->liobn = cpu_to_be32(create.liobn);
-   ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
-   create.addr_lo);
-   ddwprop->tce_shift = cpu_to_be32(page_shift);
-   ddwprop->window_shift = cpu_to_be32(len);
+   goto out_failed;
 
dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
  create.liobn, dn);
 
-   window = ddw_list_new_entry(pdn, ddwprop);
+   win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
+   win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
+   page_shift, len);
+   if (!win64) {
+   dev_info(&dev->dev,
+"couldn't allocate property, property name, or 
value\n");

[PATCH v4 05/11] powerpc/pseries/iommu: Allow DDW windows starting at 0x00

2021-04-30 Thread Leonardo Bras
enable_ddw() currently returns the address of the DMA window, which is
considered invalid if it has the value 0x00.

Also, it only considers an address returned from find_existing_ddw valid
if it's not 0x00.

Changing this behavior makes sense, given the users of enable_ddw() only
need to know if direct mapping is possible. It can also allow a DMA window
starting at 0x00 to be used.

This will be helpful for using a DDW with indirect mapping, as the window
address will be different than 0x00, but it will not map the whole
partition.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 35 --
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 6f14894d2d04..955cf095416c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -849,25 +849,26 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
 {
struct direct_window *window;
const struct dynamic_dma_window_prop *direct64;
-   u64 dma_addr = 0;
+   bool found = false;
 
spin_lock(&direct_window_list_lock);
/* check if we already created a window and dupe that config if so */
list_for_each_entry(window, &direct_window_list, list) {
if (window->device == pdn) {
direct64 = window->prop;
-   dma_addr = be64_to_cpu(direct64->dma_base);
+   *dma_addr = be64_to_cpu(direct64->dma_base);
*window_shift = be32_to_cpu(direct64->window_shift);
+   found = true;
break;
}
}
spin_unlock(&direct_window_list_lock);
 
-   return dma_addr;
+   return found;
 }
 
 static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
@@ -1157,20 +1158,19 @@ static int iommu_get_page_shift(u32 query_page_size)
  * pdn: the parent pe node with the ibm,dma_window property
  * Future: also check if we can remap the base window for our base page size
  *
- * returns the dma offset for use by the direct mapped DMA code.
+ * returns true if can map all pages (direct mapping), false otherwise..
  */
-static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
int len = 0, ret;
int max_ram_len = order_base_2(ddw_memory_hotplug_max());
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
-   u64 dma_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
-   struct property *win64;
+   struct property *win64 = NULL;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
@@ -1182,8 +1182,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 
mutex_lock(&direct_window_init_mutex);
 
-   dma_addr = find_existing_ddw(pdn, &len);
-   if (dma_addr != 0)
+   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len))
goto out_unlock;
 
/*
@@ -1338,7 +1337,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
 
-   dma_addr = be64_to_cpu(ddwprop->dma_base);
+   dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base);
goto out_unlock;
 
 out_free_window:
@@ -1351,6 +1350,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
kfree(win64->name);
kfree(win64->value);
kfree(win64);
+   win64 = NULL;
 
 out_failed:
if (default_win_removed)
@@ -1370,10 +1370,10 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 * as RAM, then we failed to create a window to cover persistent
 * memory and need to set the DMA limit.
 */
-   if (pmem_present && dma_addr && (len == max_ram_len))
-   dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
+   if (pmem_present && win64 && (len == max_ram_len))
+   dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL 
<< len);
 
-   return dma_addr;
+   return win64;
 }
 
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
@@ -1452,11 +1452,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
pci_dev *pdev, u64 dma_mask)
break;
}
 
-   if (pdn && PCI_DN(pdn)) 

[PATCH v4 04/11] powerpc/pseries/iommu: Add ddw_list_new_entry() helper

2021-04-30 Thread Leonardo Bras
There are two functions creating direct_window_list entries in a
similar way, so create a ddw_list_new_entry() helper to avoid duplication
and simplify those functions.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 32 +-
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index d02359ca1f9f..6f14894d2d04 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -870,6 +870,21 @@ static u64 find_existing_ddw(struct device_node *pdn, int 
*window_shift)
return dma_addr;
 }
 
+static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
+   const struct 
dynamic_dma_window_prop *dma64)
+{
+   struct direct_window *window;
+
+   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   if (!window)
+   return NULL;
+
+   window->device = pdn;
+   window->prop = dma64;
+
+   return window;
+}
+
 static int find_existing_ddw_windows(void)
 {
int len;
@@ -882,18 +897,15 @@ static int find_existing_ddw_windows(void)
 
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-   if (!direct64)
-   continue;
-
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
-   if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
-   kfree(window);
+   if (!direct64 || len < sizeof(*direct64)) {
remove_ddw(pdn, true);
continue;
}
 
-   window->device = pdn;
-   window->prop = direct64;
+   window = ddw_list_new_entry(pdn, direct64);
+   if (!window)
+   break;
+
spin_lock(&direct_window_list_lock);
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
@@ -1303,7 +1315,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
dev_dbg(>dev, "created tce table LIOBN 0x%x for %pOF\n",
  create.liobn, dn);
 
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   window = ddw_list_new_entry(pdn, ddwprop);
if (!window)
goto out_clear_window;
 
@@ -1322,8 +1334,6 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_free_window;
}
 
-   window->device = pdn;
-   window->prop = ddwprop;
spin_lock(&direct_window_list_lock);
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
-- 
2.30.2



[PATCH v4 03/11] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper

2021-04-30 Thread Leonardo Bras
Create a helper that allocates a new iommu_table without the need to
reallocate the iommu_group.

This will be helpful for replacing the iommu_table for the new DMA window,
after we remove the old one with iommu_tce_table_put().
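
For illustration, a minimal sketch (function name hypothetical) of the
intended later use: swap only the iommu_table of an existing group, keeping
the iommu_group allocation untouched:

static void replace_group_table_sketch(struct iommu_table_group *table_group,
				       int nid)
{
	struct iommu_table *newtbl = iommu_pseries_alloc_table(nid);

	if (!newtbl)
		return;

	/* Drop the old table; the group (and its iommu_group) is kept. */
	iommu_tce_table_put(table_group->tables[0]);
	table_group->tables[0] = newtbl;
}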

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 796ab356341c..d02359ca1f9f 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,28 +53,31 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
-static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+static struct iommu_table *iommu_pseries_alloc_table(int node)
 {
-   struct iommu_table_group *table_group;
struct iommu_table *tbl;
 
-   table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
-  node);
-   if (!table_group)
-   return NULL;
-
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
if (!tbl)
-   goto free_group;
+   return NULL;
 
INIT_LIST_HEAD_RCU(&tbl->it_group_list);
kref_init(&tbl->it_kref);
+   return tbl;
+}
 
-   table_group->tables[0] = tbl;
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+   struct iommu_table_group *table_group;
+
+   table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
+   if (!table_group)
+   return NULL;
 
-   return table_group;
+   table_group->tables[0] = iommu_pseries_alloc_table(node);
+   if (table_group->tables[0])
+   return table_group;
 
-free_group:
kfree(table_group);
return NULL;
 }
-- 
2.30.2



[PATCH v4 02/11] powerpc/kernel/iommu: Add new iommu_table_in_use() helper

2021-04-30 Thread Leonardo Bras
Having a function to check if the iommu table has any allocation helps
to decide whether a tbl can be reset for using a new DMA window.

It should be enough to replace all instances of !bitmap_empty(tbl...).

iommu_table_in_use() skips reserved memory, so we don't need to worry about
releasing it before testing. This causes iommu_table_release_pages() to
become unnecessary, given it is only used to remove reserved memory for
testing.

Also, only allow storing reserved memory values in tbl if they are valid
in the table, so there is no need to check it in the new helper.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 65 
 2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index deef7c94d7b6..bf3b84128525 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ad82dda81640..5e168bd91401 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -691,32 +691,24 @@ static void iommu_table_reserve_pages(struct iommu_table 
*tbl,
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
 
-   tbl->it_reserved_start = res_start;
-   tbl->it_reserved_end = res_end;
-
-   /* Check if res_start..res_end isn't empty and overlaps the table */
-   if (res_start && res_end &&
-   (tbl->it_offset + tbl->it_size < res_start ||
-res_end < tbl->it_offset))
-   return;
+   if (res_start < tbl->it_offset)
+   res_start = tbl->it_offset;
 
-   for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   set_bit(i - tbl->it_offset, tbl->it_map);
-}
+   if (res_end > (tbl->it_offset + tbl->it_size))
+   res_end = tbl->it_offset + tbl->it_size;
 
-static void iommu_table_release_pages(struct iommu_table *tbl)
-{
-   int i;
+   /* Check if res_start..res_end is a valid range in the table */
+   if (res_start >= res_end) {
+   tbl->it_reserved_start = tbl->it_offset;
+   tbl->it_reserved_end = tbl->it_offset;
+   return;
+   }
 
-   /*
-* In case we have reserved the first bit, we should not emit
-* the warning below.
-*/
-   if (tbl->it_offset == 0)
-   clear_bit(0, tbl->it_map);
+   tbl->it_reserved_start = res_start;
+   tbl->it_reserved_end = res_end;
 
for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   clear_bit(i - tbl->it_offset, tbl->it_map);
+   set_bit(i - tbl->it_offset, tbl->it_map);
 }
 
 /*
@@ -781,6 +773,22 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid,
return tbl;
 }
 
+bool iommu_table_in_use(struct iommu_table *tbl)
+{
+   unsigned long start = 0, end;
+
+   /* ignore reserved bit0 */
+   if (tbl->it_offset == 0)
+   start = 1;
+   end = tbl->it_reserved_start - tbl->it_offset;
+   if (find_next_bit(tbl->it_map, end, start) != end)
+   return true;
+
+   start = tbl->it_reserved_end - tbl->it_offset;
+   end = tbl->it_size;
+   return find_next_bit(tbl->it_map, end, start) != end;
+}
+
 static void iommu_table_free(struct kref *kref)
 {
unsigned long bitmap_sz;
@@ -799,10 +807,8 @@ static void iommu_table_free(struct kref *kref)
 
iommu_debugfs_del(tbl);
 
-   iommu_table_release_pages(tbl);
-
/* verify that table contains no entries */
-   if (!bitmap_empty(tbl->it_map, tbl->it_size))
+   if (iommu_table_in_use(tbl))
pr_warn("%s: Unexpected TCEs\n", __func__);
 
/* calculate bitmap size in bytes */
@@ -1108,18 +1114,13 @@ int iommu_take_ownership(struct iommu_table *tbl)
for (i = 0; i < tbl->nr_pools; i++)
spin_lock(&tbl->pools[i].lock);
 
-   iommu_table_release_pages(tbl);
-
-   if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+   if (iommu_table_in_use(tbl)) {
pr_err("iommu_tce: it_map is not empty");
ret = -EBUSY;
-   /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
-   iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
-   tbl->it_re

[PATCH v4 01/11] powerpc/pseries/iommu: Replace hard-coded page shift

2021-04-30 Thread Leonardo Bras
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.

In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.

IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
a 52-bit RPN and consider a 12-bit pageshift, so there should be no need
to use TCE_RPN_MASK, which masks out any bit after 40 in rpn. Its usage
is removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().

Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers do not always have
a tbl struct, so adding a tceshift parameter seems the right thing to do.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/tce.h |  8 --
 arch/powerpc/platforms/pseries/iommu.c | 39 +++---
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
index db5fc2f2262d..0c34d2756d92 100644
--- a/arch/powerpc/include/asm/tce.h
+++ b/arch/powerpc/include/asm/tce.h
@@ -19,15 +19,7 @@
 #define TCE_VB 0
 #define TCE_PCI1
 
-/* TCE page size is 4096 bytes (1 << 12) */
-
-#define TCE_SHIFT  12
-#define TCE_PAGE_SIZE  (1 << TCE_SHIFT)
-
 #define TCE_ENTRY_SIZE 8   /* each TCE is 64 bits */
-
-#define TCE_RPN_MASK   0xfful  /* 40-bit RPN (4K pages) */
-#define TCE_RPN_SHIFT  12
 #define TCE_VALID  0x800   /* TCE valid */
 #define TCE_ALLIO  0x400   /* TCE valid for all lpars */
 #define TCE_PCI_WRITE  0x2 /* write from PCI allowed */
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 67c9953a6503..796ab356341c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -107,6 +107,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long 
index,
u64 proto_tce;
__be64 *tcep;
u64 rpn;
+   const unsigned long tceshift = tbl->it_page_shift;
+   const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
 
proto_tce = TCE_PCI_READ; // Read allowed
 
@@ -117,10 +119,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, 
long index,
 
while (npages--) {
/* can't move this out since we might cross MEMBLOCK boundary */
-   rpn = __pa(uaddr) >> TCE_SHIFT;
-   *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << 
TCE_RPN_SHIFT);
+   rpn = __pa(uaddr) >> tceshift;
+   *tcep = cpu_to_be64(proto_tce | rpn << tceshift);
 
-   uaddr += TCE_PAGE_SIZE;
+   uaddr += pagesize;
tcep++;
}
return 0;
@@ -146,7 +148,7 @@ static unsigned long tce_get_pseries(struct iommu_table 
*tbl, long index)
return be64_to_cpu(*tcep);
 }
 
-static void tce_free_pSeriesLP(unsigned long liobn, long, long);
+static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
 static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
 
 static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
@@ -166,12 +168,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long 
tcenum, long tceshift,
proto_tce |= TCE_PCI_WRITE;
 
while (npages--) {
-   tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
+   tce = proto_tce | rpn << tceshift;
rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
 
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
-   tce_free_pSeriesLP(liobn, tcenum_start,
+   tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
   (npages_start - (npages + 1)));
break;
}
@@ -205,10 +207,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
long tcenum_start = tcenum, npages_start = npages;
int ret = 0;
unsigned long flags;
+   const unsigned long tceshift = tbl->it_page_shift;
 
if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
return tce_build_pSeriesLP(tbl->it_index, tcenum,
-  tbl->it_page_shift, npages, uaddr,
+  tceshift, npages, uaddr,
   direction, attrs);
}
 
@@ -225,13 +228,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
if (!tcep) {
local_irq_restore(fla

[PATCH v4 00/11] DDW + Indirect Mapping

2021-04-30 Thread Leonardo Bras
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

Using the DDW instead of the default DMA window may allow expanding the
amount of memory that can be DMA-mapped, given that the number of pages
(TCEs) may stay the same (or increase) and the default DMA window offers
only 4k pages while DDW may offer larger pages (4k, 64k, 16M ...).
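
As a rough worked example (the numbers are illustrative, not taken from this
series): with 2^19 preallocated TCEs, a 4k-page default window maps
2^19 * 4KB = 2GB, while a DDW with the same TCE count maps 32GB with 64k
pages and 8TB with 16M pages.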

Patch #1 replaces hard-coded 4K page size with a variable containing the
correct page size for the window.

Patch #2 introduces iommu_table_in_use(), and replace manual bit-field
checking where it's used. It will be used for aborting enable_ddw() if
there is any current iommu allocation and we are trying single window
indirect mapping.

Patch #3 introduces iommu_pseries_alloc_table() that will be helpful
when indirect mapping needs to replace the iommu_table.

Patch #4 adds helpers for adding DDWs in the list.

Patch #5 refactors enable_ddw() so it returns if direct mapping is
possible, instead of DMA offset. It helps for next patches on
indirect DMA mapping and also allows DMA windows starting at 0x00.

Patch #6 bring new helper to simplify enable_ddw(), allowing
some reorganization for introducing indirect mapping DDW.

Patch #7 adds new helper _iommu_table_setparms() and use it in other
*setparams*() to fill iommu_table. It will also be used for creating a
new iommu_table for indirect mapping.

Patch #8 updates remove_dma_window() to accept different property names,
so we can introduce a new property for indirect mapping.

Patch #9 extracts find_existing_ddw_windows() into
find_existing_ddw_windows_named(), and calls it with its property name.
This will be useful when the property for indirect mapping is created,
so we can search the device-tree for both properties.

Patch #10:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance. Also, update the iommu_table and re-generate the pools.
It introduces a new property name for DDW with indirect DMA mapping.

Patch #11:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.

All patches were tested in an LPAR with a virtio-net interface that
allows the default DMA window and DDW to coexist.

Changes since v3:
- Fixed inverted free order at ddw_property_create()
- Updated goto tag naming

Changes since v2:
- Some patches got removed from the series and sent by themselves,
- New tbl created for DDW + indirect mapping reserves MMIO32 space,
- Improved reserved area algorithm,
- Improved commit messages,
- Removed define for default DMA window prop name,
- Avoided some unnecessary renaming,
- Removed some unnecessary empty lines,
- Changed some code moving to forward declarations.
v2
Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=201210=%2A=both
 

Leonardo Bras (11):
  powerpc/pseries/iommu: Replace hard-coded page shift
  powerpc/kernel/iommu: Add new iommu_table_in_use() helper
  powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
  powerpc/pseries/iommu: Add ddw_list_new_entry() helper
  powerpc/pseries/iommu: Allow DDW windows starting at 0x00
  powerpc/pseries/iommu: Add ddw_property_create() and refactor
enable_ddw()
  powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new
helper
  powerpc/pseries/iommu: Update remove_dma_window() to accept property
name
  powerpc/pseries/iommu: Find existing DDW with given property name
  powerpc/pseries/iommu: Make use of DDW for indirect mapping
  powerpc/pseries/iommu: Rename "direct window" to "dma window"

 arch/powerpc/include/asm/iommu.h   |   1 +
 arch/powerpc/include/asm/tce.h |   8 -
 arch/powerpc/kernel/iommu.c|  65 ++--
 arch/powerpc/platforms/pseries/iommu.c | 504 +++--
 4 files changed, 338 insertions(+), 240 deletions(-)

-- 
2.30.2



Re: [PATCH v3 06/11] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2021-04-30 Thread Leonardo Bras
On Fri, 2021-04-23 at 19:04 +1000, Alexey Kardashevskiy wrote:
> 
> > +   win64->name = kstrdup(propname, GFP_KERNEL);
> > +   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
> > +   win64->value = ddwprop;
> > +   win64->length = sizeof(*ddwprop);
> > +   if (!win64->name || !win64->value) {
> > +   kfree(win64);
> > +   kfree(win64->name);
> > +   kfree(win64->value);
> 
> 
> Wrong order.
> 

Right! Sorry about that. 
Changed for next version!

> > 
> > 
> > +out_del_win:
> 
> 
> (I would not bother but since I am commenting on the patch)
> 
> nit: the new name is not that much better than the old 
> "out_clear_window:" ("out_remove_win" would be a bit better) and it does 
> make reviewing a little bit harder. Thanks,

Replaced by out_remove_win
Thanks!






Re: [PATCH v3 01/11] powerpc/pseries/iommu: Replace hard-coded page shift

2021-04-30 Thread Leonardo Bras
Thanks Alexey!

On Fri, 2021-04-23 at 17:27 +1000, Alexey Kardashevskiy wrote:
> 
> On 22/04/2021 17:07, Leonardo Bras wrote:
> > Some functions assume IOMMU page size can only be 4K (pageshift == 12).
> > Update them to accept any page size passed, so we can use 64K pages.
> > 
> > In the process, some defines like TCE_SHIFT were made obsolete, and then
> > removed.
> > 
> > IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show
> > a RPN of 52-bit, and considers a 12-bit pageshift, so there should be
> > no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn.
> > It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
> > tce_buildmulti_pSeriesLP().
> 
> 
> After rereading the patch, I wonder why we had this TCE_RPN_MASK at all 
> but what is certain is that this has nothing to do with IODA3 as these 
> TCEs are guest phys addresses in pseries and IODA3 is bare metal. Except...
> 
> 
> > Most places had a tbl struct, so using tbl->it_page_shift was simple.
> > tce_free_pSeriesLP() was a special case, since callers not always have a
> > tbl struct, so adding a tceshift parameter seems the right thing to do.
> > 
> > Signed-off-by: Leonardo Bras 
> > Reviewed-by: Alexey Kardashevskiy 
> > ---
> >   arch/powerpc/include/asm/tce.h |  8 --
> >   arch/powerpc/platforms/pseries/iommu.c | 39 +++---
> >   2 files changed, 23 insertions(+), 24 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
> > index db5fc2f2262d..0c34d2756d92 100644
> > --- a/arch/powerpc/include/asm/tce.h
> > +++ b/arch/powerpc/include/asm/tce.h
> > @@ -19,15 +19,7 @@
> >   #define TCE_VB0
> >   #define TCE_PCI   1
> >   
> > 
> > -/* TCE page size is 4096 bytes (1 << 12) */
> > -
> > -#define TCE_SHIFT  12
> > -#define TCE_PAGE_SIZE  (1 << TCE_SHIFT)
> > -
> >   #define TCE_ENTRY_SIZE8   /* each TCE is 64 bits 
> > */
> > -
> > -#define TCE_RPN_MASK   0xfful  /* 40-bit RPN (4K 
> > pages) */
> > -#define TCE_RPN_SHIFT  12
> >   #define TCE_VALID 0x800   /* TCE valid */
> >   #define TCE_ALLIO 0x400   /* TCE valid for all lpars */
> >   #define TCE_PCI_WRITE 0x2 /* write from PCI 
> > allowed */
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > b/arch/powerpc/platforms/pseries/iommu.c
> > index 67c9953a6503..796ab356341c 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -107,6 +107,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, 
> > long index,
> >     u64 proto_tce;
> >     __be64 *tcep;
> >     u64 rpn;
> > +   const unsigned long tceshift = tbl->it_page_shift;
> > +   const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
> 
> (nit: only used once)
> 
> >   
> > 
> >     proto_tce = TCE_PCI_READ; // Read allowed
> >   
> > 
> > @@ -117,10 +119,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, 
> > long index,
> 
> 
> ... this pseries which is not pseriesLP, i.e. no LPAR == bare metal 
> pseries such as ancient power5 or cellbe (I guess) and for those 
> TCE_RPN_MASK may actually make sense, keep it.
> 
> The rest of the patch looks good. Thanks,
> 
> 
> >   
> > 
> >     while (npages--) {
> >     /* can't move this out since we might cross MEMBLOCK boundary */
> > -   rpn = __pa(uaddr) >> TCE_SHIFT;
> > -   *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << 
> > TCE_RPN_SHIFT);
> > +   rpn = __pa(uaddr) >> tceshift;
> > +   *tcep = cpu_to_be64(proto_tce | rpn << tceshift);
> >   
> > 
> > -   uaddr += TCE_PAGE_SIZE;
> > +   uaddr += pagesize;
> >     tcep++;
> >     }
> >     return 0;
> > @@ -146,7 +148,7 @@ static unsigned long tce_get_pseries(struct iommu_table 
> > *tbl, long index)
> >     return be64_to_cpu(*tcep);
> >   }
> >   
> > 
> > -static void tce_free_pSeriesLP(unsigned long liobn, long, long);
> > +static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
> >   static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
> >   
> > 
> >   static int tce_build_pSeriesLP(unsigned long liobn, long t

Re: [PATCH v2 0/3] powerpc/mm/hash: Time improvements for memory hot(un)plug

2021-04-30 Thread Leonardo Bras
CC: David Gibson

http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=241574=%2A=both



[PATCH v2 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-04-30 Thread Leonardo Bras
During memory hotunplug, after each LMB is removed, the HPT may be
resized-down if it would map at most 4 times the current amount of memory
(2 shifts, due to the introduced hysteresis).

It usually is not an issue, but it can take a lot of time if HPT
resizing-down fails. This happens because resize-down failures
usually repeat at each LMB removal, until there are no more bolted-entry
conflicts, which can take a while to happen.

This can be solved by doing a single HPT resize at the end of memory
hotunplug, after all requested entries are removed.

To make this happen, it's necessary to temporarily disable all HPT
resize-downs before hotunplug, re-enable them after hotunplug ends,
and then resize-down HPT to the current memory size.

As an example, hotunplugging 256GB from a 385GB guest took 621s without
this patch, and 100s after applied.
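
To make the call pattern concrete, a simplified sketch of the hot-unplug flow
introduced below (loop condition and error handling simplified; the real
caller is dlpar_memory_remove_by_count()):

static int remove_lmbs_batched_sketch(void)
{
	struct drmem_lmb *lmb;
	int rc = 0;

	if (!radix_enabled())
		hash_batch_shrink_begin();	/* block HPT resize-downs */

	for_each_drmem_lmb(lmb) {		/* per-LMB removal, no resizes */
		rc = dlpar_remove_lmb(lmb);
		if (rc)
			break;
	}

	if (!radix_enabled())
		hash_batch_shrink_end();	/* single resize-down at the end */

	return rc;
}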

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/hash.h |  2 +
 arch/powerpc/mm/book3s64/hash_utils.c | 45 +--
 .../platforms/pseries/hotplug-memory.c| 26 +++
 3 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index fad4af8b8543..6cd66e7e98c9 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -256,6 +256,8 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end,
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
 void hash_batch_expand_prepare(unsigned long newsize);
+void hash_batch_shrink_begin(void);
+void hash_batch_shrink_end(void);
 
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 3fa395b3fe57..73ecd0f61acd 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -795,6 +795,9 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+
+static DEFINE_MUTEX(hpt_resize_down_lock);
+
 static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
 {
unsigned target_hpt_shift;
@@ -805,7 +808,7 @@ static int resize_hpt_for_hotplug(unsigned long 
new_mem_size, bool shrinking)
target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
 
if (shrinking) {
-
+   int ret;
/*
 * To avoid lots of HPT resizes if memory size is fluctuating
 * across a boundary, we deliberately have some hysterisis
@@ -818,10 +821,20 @@ static int resize_hpt_for_hotplug(unsigned long 
new_mem_size, bool shrinking)
if (target_hpt_shift >= ppc64_pft_size - 1)
return 0;
 
-   } else if (target_hpt_shift <= ppc64_pft_size) {
-   return 0;
+   /* When batch removing entries, only resizes HPT at the end. */
+
+   if (!mutex_trylock(&hpt_resize_down_lock))
+   return 0;
+
+   ret = mmu_hash_ops.resize_hpt(target_hpt_shift);
+
+   mutex_unlock(&hpt_resize_down_lock);
+   return ret;
}
 
+   if (target_hpt_shift <= ppc64_pft_size)
+   return 0;
+
return mmu_hash_ops.resize_hpt(target_hpt_shift);
 }
 
@@ -879,6 +892,32 @@ void hash_batch_expand_prepare(unsigned long newsize)
break;
}
 }
+
+void hash_batch_shrink_begin(void)
+{
+   /* Disable HPT resize-down during hot-unplug */
+   mutex_lock(&hpt_resize_down_lock);
+}
+
+void hash_batch_shrink_end(void)
+{
+   const u64 starting_size = ppc64_pft_size;
+   unsigned long newsize;
+
+   newsize = memblock_phys_mem_size();
+   /* Resize to smallest SHIFT possible */
+   while (resize_hpt_for_hotplug(newsize, true) == -ENOSPC) {
+   newsize *= 2;
+   pr_warn("Hash collision while resizing HPT\n");
+
+   /* Do not try to resize to the starting size, or bigger value */
+   if (htab_shift_for_mem_size(newsize) >= starting_size)
+   break;
+   }
+
+   /* Re-enables HPT resize-down after hot-unplug */
+   mutex_unlock(&hpt_resize_down_lock);
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static void __init hash_init_partition_table(phys_addr_t hash_table,
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 48b2cfe4ce69..44bc50d72353 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -426,6 +426,9 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove)
return -EINVAL;
}
 
+   if (!radix_enabled())
+   hash_batch_shrink_begin();
+
for_each_drmem_lmb(lmb) {
rc = dlpar_remove_lmb(lmb);
if (rc)
@@ -471,

[PATCH v2 2/3] powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug

2021-04-30 Thread Leonardo Bras
Every time a memory hotplug happens, and the memory limit crosses a 2^n
value, it may be necessary to perform HPT resizing-up, which can take
some time (over 100ms in my tests).

It usually is not an issue, but it can take some time if a lot of memory
is added to a guest with little starting memory:
adding 256GB to a 2GB guest, for example, will require 8 HPT resizes.

Perform an HPT resize before memory hotplug, updating HPT to its
final size (considering a successful hotplug), taking the number of
HPT resizes to at most one per memory hotplug action.
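
A rough sanity check on that count (illustrative; the exact number depends on
the target-shift formula and hysteresis): the HPT target shift grows by one
each time memory crosses a power-of-two boundary, so growing from 2GB to
258GB crosses the 4, 8, 16, 32, 64, 128 and 256GB marks, i.e. roughly 7-8
incremental resize-ups, whereas resizing to the final size up front needs a
single call.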

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/hash.h |  2 ++
 arch/powerpc/mm/book3s64/hash_utils.c | 20 +++
 .../platforms/pseries/hotplug-memory.c|  9 +
 3 files changed, 31 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index d959b0195ad9..fad4af8b8543 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -255,6 +255,8 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end,
 int nid, pgprot_t prot);
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
+void hash_batch_expand_prepare(unsigned long newsize);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 608e4ed397a9..3fa395b3fe57 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -859,6 +859,26 @@ int hash__remove_section_mapping(unsigned long start, 
unsigned long end)
 
return rc;
 }
+
+void hash_batch_expand_prepare(unsigned long newsize)
+{
+   const u64 starting_size = ppc64_pft_size;
+
+   /*
+* Resizing-up HPT should never fail, but there are some cases system 
starts with higher
+* SHIFT than required, and we go through the funny case of resizing 
HPT down while
+* adding memory
+*/
+
+   while (resize_hpt_for_hotplug(newsize, false) == -ENOSPC) {
+   newsize *= 2;
+   pr_warn("Hash collision while resizing HPT\n");
+
+   /* Do not try to resize to the starting size, or bigger value */
+   if (htab_shift_for_mem_size(newsize) >= starting_size)
+   break;
+   }
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static void __init hash_init_partition_table(phys_addr_t hash_table,
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 8377f1f7c78e..48b2cfe4ce69 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -671,6 +672,10 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
if (lmbs_available < lmbs_to_add)
return -EINVAL;
 
+   if (!radix_enabled())
+   hash_batch_expand_prepare(memblock_phys_mem_size() +
+lmbs_to_add * 
drmem_lmb_size());
+
for_each_drmem_lmb(lmb) {
if (lmb->flags & DRCONF_MEM_ASSIGNED)
continue;
@@ -788,6 +793,10 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
drc_index)
if (lmbs_available < lmbs_to_add)
return -EINVAL;
 
+   if (!radix_enabled())
+   hash_batch_expand_prepare(memblock_phys_mem_size() +
+ lmbs_to_add * drmem_lmb_size());
+
for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
if (lmb->flags & DRCONF_MEM_ASSIGNED)
continue;
-- 
2.30.2



[PATCH v2 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-04-30 Thread Leonardo Bras
Because hypervisors may need to create HPTs without knowing the guest
page size, the smallest used page-size (4k) may be chosen, resulting in
an HPT that is possibly bigger than needed.

On a guest with bigger page-sizes, the number of entries for the HPT may
be too high, causing the guest to ask for an HPT resize-down on the first
hotplug.

This becomes a problem when the HPT resize-down fails, and causes the
HPT resize to be attempted on every LMB added, until the HPT size is
compatible with the guest memory size, causing a major slowdown.

So, avoiding HPT resizing-down on hot-add significantly improves memory
hotplug times.

As an example, hotplugging 256GB on a 129GB guest took 710s without this
patch, and 21s after applied.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 36 ---
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 581b20a2feaf..608e4ed397a9 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -795,7 +795,7 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-static int resize_hpt_for_hotplug(unsigned long new_mem_size)
+static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
 {
unsigned target_hpt_shift;
 
@@ -804,19 +804,25 @@ static int resize_hpt_for_hotplug(unsigned long 
new_mem_size)
 
target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
 
-   /*
-* To avoid lots of HPT resizes if memory size is fluctuating
-* across a boundary, we deliberately have some hysterisis
-* here: we immediately increase the HPT size if the target
-* shift exceeds the current shift, but we won't attempt to
-* reduce unless the target shift is at least 2 below the
-* current shift
-*/
-   if (target_hpt_shift > ppc64_pft_size ||
-   target_hpt_shift < ppc64_pft_size - 1)
-   return mmu_hash_ops.resize_hpt(target_hpt_shift);
+   if (shrinking) {
 
-   return 0;
+   /*
+* To avoid lots of HPT resizes if memory size is fluctuating
+* across a boundary, we deliberately have some hysterisis
+* here: we immediately increase the HPT size if the target
+* shift exceeds the current shift, but we won't attempt to
+* reduce unless the target shift is at least 2 below the
+* current shift
+*/
+
+   if (target_hpt_shift >= ppc64_pft_size - 1)
+   return 0;
+
+   } else if (target_hpt_shift <= ppc64_pft_size) {
+   return 0;
+   }
+
+   return mmu_hash_ops.resize_hpt(target_hpt_shift);
 }
 
 int hash__create_section_mapping(unsigned long start, unsigned long end,
@@ -829,7 +835,7 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end,
return -1;
}
 
-   resize_hpt_for_hotplug(memblock_phys_mem_size());
+   resize_hpt_for_hotplug(memblock_phys_mem_size(), false);
 
rc = htab_bolt_mapping(start, end, __pa(start),
   pgprot_val(prot), mmu_linear_psize,
@@ -848,7 +854,7 @@ int hash__remove_section_mapping(unsigned long start, 
unsigned long end)
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
 mmu_kernel_ssize);
 
-   if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+   if (resize_hpt_for_hotplug(memblock_phys_mem_size(), true) == -ENOSPC)
pr_warn("Hash collision while resizing HPT\n");
 
return rc;
-- 
2.30.2



[PATCH v2 0/3] powerpc/mm/hash: Time improvements for memory hot(un)plug

2021-04-30 Thread Leonardo Bras
This patchset intends to reduce time needed for processing memory
hotplug/hotunplug in hash guests.

The first one, makes sure guests with pagesize over 4k don't need to
go through HPT resize-downs after memory hotplug.

The second and third patches make hotplug / hotunplug perform a single
HPT resize per operation, instead of one for each shift change, or one
for each LMB in case of resize-down error.

Why isn't the same mechanism used for both memory hotplug and hotunplug?
They both have different requirements:

Memory hotplug (usually) causes HPT resize-ups, which are fine happening
at the start of hotplug, but resize-ups should never be disabled, as
other mechanisms may try to increase memory, hitting issues with an HPT
that is too small.

Memory hotunplug causes HPT resize-downs, which can be disabled (the HPT
will just remain larger for a while), but need to happen at the end of a
hotunplug operation. If we want to batch them, we need to disable
resize-downs and perform a single one at the end.

Tests done with this patchset in the same machine / guest config:
Starting memory: 129GB, DIMM: 256GB
Before patchset: hotplug = 710s, hotunplug = 621s.
After patchset: hotplug  = 21s, hotunplug = 100s.

Any feedback will be appreciated!

Changes since v1:
- Atomic used to disable resize was replaced by a mutex
- Removed wrappers, testing for !radix directly in hot(un)plug routine
- Added bounds to HPT resize loop
- Removed batching from dlpar_memory_*_by_index, as it adds a single LMB 

Best regards,
Leonardo Bras (3):
  powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug
  powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug
  powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

 arch/powerpc/include/asm/book3s/64/hash.h |  4 +
 arch/powerpc/mm/book3s64/hash_utils.c | 95 ---
 .../platforms/pseries/hotplug-memory.c| 35 +++
 3 files changed, 119 insertions(+), 15 deletions(-)

-- 
2.30.2



Re: [PATCH v3 00/11] DDW + Indirect Mapping

2021-04-22 Thread Leonardo Bras
Changes since v2:
- Some patches got removed from the series and sent by themselves,
- New tbl created for DDW + indirect mapping reserves MMIO32 space,
- Improved reserved area algorithm,
- Improved commit messages,
- Removed define for default DMA window prop name,
- Avoided some unnecessary renaming,
- Removed some unnecessary empty lines,
- Changed some code moving to forward declarations.
v2
Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=201210=%2A=both
 

On Thu, 2021-04-22 at 04:07 -0300, Leonardo Bras wrote:
> So far it's assumed possible to map the guest RAM 1:1 to the bus, which
> works with a small number of devices. SRIOV changes it as the user can
> configure hundreds VFs and since phyp preallocates TCEs and does not
> allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
> per a PE to limit waste of physical pages.
> 
> As of today, if the assumed direct mapping is not possible, DDW creation
> is skipped and the default DMA window "ibm,dma-window" is used instead.
> 
> Using the DDW instead of the default DMA window may allow to expand the
> amount of memory that can be DMA-mapped, given the number of pages (TCEs)
> may stay the same (or increase) and the default DMA window offers only
> 4k-pages while DDW may offer larger pages (4k, 64k, 16M ...).
> 
> Patch #1 replaces hard-coded 4K page size with a variable containing the
> correct page size for the window.
> 
> Patch #2 introduces iommu_table_in_use(), and replace manual bit-field
> checking where it's used. It will be used for aborting enable_ddw() if
> there is any current iommu allocation and we are trying single window
> indirect mapping.
> 
> Patch #3 introduces iommu_pseries_alloc_table() that will be helpful
> when indirect mapping needs to replace the iommu_table.
> 
> Patch #4 adds helpers for adding DDWs in the list.
> 
> Patch #5 refactors enable_ddw() so it returns if direct mapping is
> possible, instead of DMA offset. It helps for next patches on
> indirect DMA mapping and also allows DMA windows starting at 0x00.
> 
> Patch #6 bring new helper to simplify enable_ddw(), allowing
> some reorganization for introducing indirect mapping DDW.
> 
> Patch #7 adds new helper _iommu_table_setparms() and use it in other
> *setparams*() to fill iommu_table. It will also be used for creating a
> new iommu_table for indirect mapping.
> 
> Patch #8 updates remove_dma_window() to accept different property names,
> so we can introduce a new property for indirect mapping.
> 
> Patch #9 extracts find_existing_ddw_windows() into
> find_existing_ddw_windows_named(), and calls it by it's property name.
> This will be useful when the property for indirect mapping is created,
> so we can search the device-tree for both properties.
> 
> Patch #10:
> Instead of destroying the created DDW if it doesn't map the whole
> partition, make use of it instead of the default DMA window as it improves
> performance. Also, update the iommu_table and re-generate the pools.
> It introduces a new property name for DDW with indirect DMA mapping.
> 
> Patch #11:
> Does some renaming of 'direct window' to 'dma window', given the DDW
> created can now be also used in indirect mapping if direct mapping is not
> available.
> 
> All patches were tested into an LPAR with an virtio-net interface that
> allows default DMA window and DDW to coexist.
> 
> Leonardo Bras (11):
>   powerpc/pseries/iommu: Replace hard-coded page shift
>   powerpc/kernel/iommu: Add new iommu_table_in_use() helper
>   powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
>   powerpc/pseries/iommu: Add ddw_list_new_entry() helper
>   powerpc/pseries/iommu: Allow DDW windows starting at 0x00
>   powerpc/pseries/iommu: Add ddw_property_create() and refactor
> enable_ddw()
>   powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new
> helper
>   powerpc/pseries/iommu: Update remove_dma_window() to accept property
> name
>   powerpc/pseries/iommu: Find existing DDW with given property name
>   powerpc/pseries/iommu: Make use of DDW for indirect mapping
>   powerpc/pseries/iommu: Rename "direct window" to "dma window"
> 
>  arch/powerpc/include/asm/iommu.h   |   1 +
>  arch/powerpc/include/asm/tce.h |   8 -
>  arch/powerpc/kernel/iommu.c|  65 ++--
>  arch/powerpc/platforms/pseries/iommu.c | 504 +++--
>  4 files changed, 338 insertions(+), 240 deletions(-)
> 




[PATCH v3 11/11] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2021-04-22 Thread Leonardo Bras
A previous change introduced the usage of DDW as a bigger indirect DMA
mapping when the DDW available size does not map the whole partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.

This should cause no behavioural change, just adjust naming.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 93 +-
 1 file changed, 48 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 50909cbc73f6..f5d0a6f012da 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -355,7 +355,7 @@ struct dynamic_dma_window_prop {
__be32  window_shift;   /* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -375,11 +375,11 @@ struct ddw_create_response {
u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
@@ -712,7 +712,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus 
*bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 dn);
 
-   /* Find nearest ibm,dma-window, walking up the device tree */
+   /*
+* Find nearest ibm,dma-window (default DMA window), walking up the
+* device tree
+*/
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
if (dma_window != NULL)
@@ -816,11 +819,11 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
 
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
-   pr_warn("%pOF: failed to remove direct window: rtas returned "
+   pr_warn("%pOF: failed to remove DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
-   pr_debug("%pOF: successfully removed direct window: rtas 
returned "
+   pr_debug("%pOF: successfully removed DMA window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -848,37 +851,37 @@ static int remove_ddw(struct device_node *np, bool 
remove_prop, const char *win_
 
ret = of_remove_property(np, win);
if (ret)
-   pr_warn("%pOF: failed to remove direct window property: %d\n",
+   pr_warn("%pOF: failed to remove DMA window property: %d\n",
np, ret);
return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
 {
-   struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
+   struct dma_win *window;
+   const struct dynamic_dma_window_prop *dma64;
bool found = false;
 
-   spin_lock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
-   list_for_each_entry(window, &direct_window_list, list) {
+   list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
-   direct64 = window->prop;
-   *dma_addr = be64_to_cpu(direct64->dma_base);
-   *window_shift = be32_to_cpu(direct64->window_shift);
+   dma64 = window->prop;
+   *dma_addr = be64_to_cpu(dma64->dma_base);
+   *window_shift = be32_to_cpu(dma64->window_shift);
found = true;
break;
}
}
-   spin_unlock(&direct_window_list_lock);
+   spin_unlock(&dma_win_list_lock);
 
return found;
 }
 
-static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
-   const struct 
dynamic_dma_window_prop *dma64)
+static struct dma_win *ddw_list_new_entry(struct device_node *pdn,
+

[PATCH v3 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-04-22 Thread Leonardo Bras
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes this, as the user can
configure hundreds of VFs, and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

By using DDW, indirect mapping can get more TCEs than are available for
the default DMA window, and also get access to much larger pagesizes
(16MB as implemented in qemu vs 4k from the default DMA window), causing a
significant increase in the maximum amount of memory that can be IOMMU
mapped at the same time.

Indirect mapping will only be used if direct mapping is not a
possibility.

For indirect mapping, it's necessary to re-create the iommu_table with
the new DMA window parameters, so iommu_alloc() can use it.

Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.

Even though there won't be both direct and indirect mappings at the
same time, we can't reuse the DIRECT64_PROPNAME property name, or else
an older kexec()ed kernel can assume direct mapping, and skip
iommu_alloc(), causing undesirable behavior.
So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info"
was created to represent a DDW that does not allow direct mapping.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 87 +-
 1 file changed, 72 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 3367233a5535..50909cbc73f6 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,7 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
+static phys_addr_t ddw_memory_hotplug_max(void);
 #ifdef CONFIG_IOMMU_API
 static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned 
long *tce,
enum dma_data_direction *direction, bool 
realmode);
@@ -380,6 +381,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
 #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -918,6 +920,7 @@ static int find_existing_ddw_windows(void)
return 0;
 
find_existing_ddw_windows_named(DIRECT64_PROPNAME);
+   find_existing_ddw_windows_named(DMA64_PROPNAME);
 
return 0;
 }
@@ -1207,10 +1210,13 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
+   const char *win_name;
struct property *win64 = NULL;
struct failed_ddw_pdn *fpdn;
-   bool default_win_removed = false;
+   bool default_win_removed = false, direct_mapping = false;
bool pmem_present;
+   struct pci_dn *pci = PCI_DN(pdn);
+   struct iommu_table *tbl = pci->table_group->tables[0];
 
dn = of_find_node_by_type(NULL, "ibm,pmemory");
pmem_present = dn != NULL;
@@ -1218,8 +1224,12 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 
mutex_lock(&direct_window_init_mutex);
 
-   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len))
-   goto out_unlock;
+   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
+   direct_mapping = (len >= max_ram_len);
+
+   mutex_unlock(&direct_window_init_mutex);
+   return direct_mapping;
+   }
 
/*
 * If we already went through this for a previous function of
@@ -1298,7 +1308,6 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
/* verify the window * number of ptes will map the partition */
-   /* check largest block * page size > max memory hotplug addr */
/*
 * The "ibm,pmemory" can appear anywhere in the address space.
 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
@@ -1320,6 +1329,17 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
1ULL << len,
query.largest_available_block,
1ULL << page_shift);
+
+   len = order_base_2(query.largest_available_block << page_shift);
+   win_name = 

[PATCH v3 09/11] powerpc/pseries/iommu: Find existing DDW with given property name

2021-04-22 Thread Leonardo Bras
At the moment pseries stores information about the created directly-mapped
DDW window in DIRECT64_PROPNAME.

With the objective of implementing indirect DMA mapping with DDW, it's
necessary to have another property name to make sure kexec'ing into older
kernels does not break, as it would if we reused DIRECT64_PROPNAME.

In order to have this, find_existing_ddw_windows() needs to be able to
look for different property names.

Extract find_existing_ddw_windows() into find_existing_ddw_windows_named()
and call it with the current property name.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 075c6e08f012..3367233a5535 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -888,24 +888,21 @@ static struct direct_window *ddw_list_new_entry(struct 
device_node *pdn,
return window;
 }
 
-static int find_existing_ddw_windows(void)
+static void find_existing_ddw_windows_named(const char *name)
 {
int len;
struct device_node *pdn;
struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
-
-   if (!firmware_has_feature(FW_FEATURE_LPAR))
-   return 0;
+   const struct dynamic_dma_window_prop *dma64;
 
-   for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-   direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-   if (!direct64 || len < sizeof(*direct64)) {
-   remove_ddw(pdn, true, DIRECT64_PROPNAME);
+   for_each_node_with_property(pdn, name) {
+   dma64 = of_get_property(pdn, name, &len);
+   if (!dma64 || len < sizeof(*dma64)) {
+   remove_ddw(pdn, true, name);
continue;
}
 
-   window = ddw_list_new_entry(pdn, direct64);
+   window = ddw_list_new_entry(pdn, dma64);
if (!window)
break;
 
@@ -913,6 +910,14 @@ static int find_existing_ddw_windows(void)
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
}
+}
+
+static int find_existing_ddw_windows(void)
+{
+   if (!firmware_has_feature(FW_FEATURE_LPAR))
+   return 0;
+
+   find_existing_ddw_windows_named(DIRECT64_PROPNAME);
 
return 0;
 }
-- 
2.30.2



[PATCH v3 08/11] powerpc/pseries/iommu: Update remove_dma_window() to accept property name

2021-04-22 Thread Leonardo Bras
Update remove_dma_window() so it can be used to remove DDW with a given
property name.

This enables the creation of new property names for DDW, so we can
have different usage for it, like indirect mapping.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 0147ccaf0be4..075c6e08f012 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -823,31 +823,32 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static int remove_ddw(struct device_node *np, bool remove_prop, const char 
*win_name)
 {
struct property *win;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
int ret = 0;
 
+   win = of_find_property(np, win_name, NULL);
+   if (!win)
+   return -EINVAL;
+
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
 &ddw_avail[0], DDW_APPLICABLE_SIZE);
if (ret)
-   return;
-
-   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
-   if (!win)
-   return;
+   return 0;
 
if (win->length >= sizeof(struct dynamic_dma_window_prop))
remove_dma_window(np, ddw_avail, win);
 
if (!remove_prop)
-   return;
+   return 0;
 
ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
+   return 0;
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
@@ -900,7 +901,7 @@ static int find_existing_ddw_windows(void)
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
if (!direct64 || len < sizeof(*direct64)) {
-   remove_ddw(pdn, true);
+   remove_ddw(pdn, true, DIRECT64_PROPNAME);
continue;
}
 
@@ -1372,7 +1373,7 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
win64 = NULL;
 
 out_del_win:
-   remove_ddw(pdn, true);
+   remove_ddw(pdn, true, DIRECT64_PROPNAME);
 
 out_failed:
if (default_win_removed)
@@ -1536,7 +1537,7 @@ static int iommu_reconfig_notifier(struct notifier_block 
*nb, unsigned long acti
 * we have to remove the property when releasing
 * the device node.
 */
-   remove_ddw(np, false);
+   remove_ddw(np, false, DIRECT64_PROPNAME);
if (pci && pci->table_group)
iommu_pseries_free_group(pci->table_group,
np->full_name);
-- 
2.30.2



[PATCH v3 07/11] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper

2021-04-22 Thread Leonardo Bras
Add a new helper _iommu_table_setparms(), and use it in
iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated
code.

Also, setting tbl->it_ops was happening outside iommu_table_setparms*(),
so move it to the new helper. Since we need the iommu_table_ops to be
declared before used, move iommu_table_lpar_multi_ops and
iommu_table_pseries_ops to before their respective iommu_table_setparms*().

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 100 -
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 48c029386d94..0147ccaf0be4 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,11 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
+#ifdef CONFIG_IOMMU_API
+static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned 
long *tce,
+   enum dma_data_direction *direction, bool 
realmode);
+#endif
+
 static struct iommu_table *iommu_pseries_alloc_table(int node)
 {
struct iommu_table *tbl;
@@ -501,6 +506,28 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long 
start_pfn,
return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 }
 
+static inline void _iommu_table_setparms(struct iommu_table *tbl, unsigned 
long busno,
+unsigned long liobn, unsigned long 
win_addr,
+unsigned long window_size, unsigned 
long page_shift,
+unsigned long base, struct 
iommu_table_ops *table_ops)
+{
+   tbl->it_busno = busno;
+   tbl->it_index = liobn;
+   tbl->it_offset = win_addr >> page_shift;
+   tbl->it_size = window_size >> page_shift;
+   tbl->it_page_shift = page_shift;
+   tbl->it_base = base;
+   tbl->it_blocksize = 16;
+   tbl->it_type = TCE_PCI;
+   tbl->it_ops = table_ops;
+}
+
+struct iommu_table_ops iommu_table_pseries_ops = {
+   .set = tce_build_pSeries,
+   .clear = tce_free_pSeries,
+   .get = tce_get_pseries
+};
+
 static void iommu_table_setparms(struct pci_controller *phb,
 struct device_node *dn,
 struct iommu_table *tbl)
@@ -509,8 +536,13 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
const unsigned long *basep;
const u32 *sizep;
 
-   node = phb->dn;
+   /* Test if we are going over 2GB of DMA space */
+   if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
+   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
+   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+   }
 
+   node = phb->dn;
basep = of_get_property(node, "linux,tce-base", NULL);
sizep = of_get_property(node, "linux,tce-size", NULL);
if (basep == NULL || sizep == NULL) {
@@ -519,33 +551,25 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
return;
}
 
-   tbl->it_base = (unsigned long)__va(*basep);
+   _iommu_table_setparms(tbl, phb->bus->number, 0, 
phb->dma_window_base_cur,
+ phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
+ (unsigned long)__va(*basep), 
&iommu_table_pseries_ops);
 
if (!is_kdump_kernel())
memset((void *)tbl->it_base, 0, *sizep);
 
-   tbl->it_busno = phb->bus->number;
-   tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-
-   /* Units of tce entries */
-   tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
-
-   /* Test if we are going over 2GB of DMA space */
-   if (phb->dma_window_base_cur + phb->dma_window_size > 0x8000ul) {
-   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
-   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
-   }
-
phb->dma_window_base_cur += phb->dma_window_size;
-
-   /* Set the tce table size - measured in entries */
-   tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
-
-   tbl->it_index = 0;
-   tbl->it_blocksize = 16;
-   tbl->it_type = TCE_PCI;
 }
 
+struct iommu_table_ops iommu_table_lpar_multi_ops = {
+   .set = tce_buildmulti_pSeriesLP,
+#ifdef CONFIG_IOMMU_API
+   .xchg_no_kill = tce_exchange_pseries,
+#endif
+   .clear = tce_freemulti_pSeriesLP,
+   .get = tce_get_pSeriesLP
+};
+
 /*
  * iommu_table_setparms_lpar
  *
@@ -557,28 +581,17 @@ static void iommu_table_setparms_lpar(struct 
pci_controller *phb,
  struct iommu_tab

[PATCH v3 06/11] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2021-04-22 Thread Leonardo Bras
Code to create a ddw property, which was previously scattered through
enable_ddw(), is now gathered in ddw_property_create(), which handles
allocating and filling the property, leaving it ready for
of_add_property(), which now happens right after.

This created an opportunity to reorganize the second part of enable_ddw():

Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().

With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().

This change requires of_remove_property() in case anything fails after
of_add_property(), but tce_setrange_multi_pSeriesLP_walk over all memory,
which looks like the most expensive operation, now runs only if
everything else succeeds.
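
A simplified sketch of the reorganized tail of enable_ddw() (label names and
exact arguments are approximate; see the diff for the real thing):

        ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
        if (ret)
                goto out_failed;

        win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
                                    page_shift, len);
        if (!win64)
                goto out_failed;

        if (of_add_property(pdn, win64))
                goto out_free_prop;

        window = ddw_list_new_entry(pdn, win64->value);
        if (!window)
                goto out_del_prop;              /* of_remove_property() runs here */

        /* the expensive full-memory walk happens only after everything above worked */
        ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
                                    win64->value, tce_setrange_multi_pSeriesLP_walk);
        if (ret)
                goto out_free_window;

        list_add(&window->list, &direct_window_list);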

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 93 --
 1 file changed, 57 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 955cf095416c..48c029386d94 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1122,6 +1122,35 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+static struct property *ddw_property_create(const char *propname, u32 liobn, 
u64 dma_addr,
+   u32 page_shift, u32 window_shift)
+{
+   struct dynamic_dma_window_prop *ddwprop;
+   struct property *win64;
+
+   win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
+   if (!win64)
+   return NULL;
+
+   win64->name = kstrdup(propname, GFP_KERNEL);
+   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
+   win64->value = ddwprop;
+   win64->length = sizeof(*ddwprop);
+   if (!win64->name || !win64->value) {
+   kfree(win64->name);
+   kfree(win64->value);
+   kfree(win64);
+   return NULL;
+   }
+
+   ddwprop->liobn = cpu_to_be32(liobn);
+   ddwprop->dma_base = cpu_to_be64(dma_addr);
+   ddwprop->tce_shift = cpu_to_be32(page_shift);
+   ddwprop->window_shift = cpu_to_be32(window_shift);
+
+   return win64;
+}
+
 /* Return largest page shift based on "IO Page Sizes" output of 
ibm,query-pe-dma-window. */
 static int iommu_get_page_shift(u32 query_page_size)
 {
@@ -1167,11 +1196,11 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
+   u64 win_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64 = NULL;
-   struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
bool pmem_present;
@@ -1286,65 +1315,54 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
1ULL << page_shift);
goto out_failed;
}
-   win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
-   if (!win64) {
-   dev_info(&dev->dev,
-   "couldn't allocate property for 64bit dma window\n");
-   goto out_failed;
-   }
-   win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
-   win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
-   win64->length = sizeof(*ddwprop);
-   if (!win64->name || !win64->value) {
-   dev_info(&dev->dev,
-   "couldn't allocate property name and value\n");
-   goto out_free_prop;
-   }
 
	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
if (ret != 0)
-   goto out_free_prop;
-
-   ddwprop->liobn = cpu_to_be32(create.liobn);
-   ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
-   create.addr_lo);
-   ddwprop->tce_shift = cpu_to_be32(page_shift);
-   ddwprop->window_shift = cpu_to_be32(len);
+   goto out_failed;
 
	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
  create.liobn, dn);
 
-   window = ddw_list_new_entry(pdn, ddwprop);
+   win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
+   win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
+   page_shift, len);
+   if (!win64) {
+   dev_info(&dev->dev,
+"couldn't allocate property, property name, or 
value\n&

[PATCH v3 05/11] powerpc/pseries/iommu: Allow DDW windows starting at 0x00

2021-04-22 Thread Leonardo Bras
enable_ddw() currently returns the address of the DMA window, which is
considered invalid if it has the value 0x00.

Also, an address returned from find_existing_ddw() is only considered
valid if it's not 0x00.

Changing this behavior makes sense, given the users of enable_ddw() only
need to know if direct mapping is possible. It can also allow a DMA window
starting at 0x00 to be used.

This will be helpful for using a DDW with indirect mapping, as the window
address will be different from 0x00, but it will not map the whole
partition.
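
A hedged sketch of what this enables at the call site (the function below is
the existing caller; the exact final form is in a later hunk of this patch):

        static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
        {
                ...
                /* true means the whole partition is mapped, so direct DMA is possible;
                 * the DMA offset itself is now stored in pdev->dev.archdata.dma_offset */
                return enable_ddw(pdev, pdn);
        }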

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 35 --
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 6f14894d2d04..955cf095416c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -849,25 +849,26 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int 
*window_shift)
 {
struct direct_window *window;
const struct dynamic_dma_window_prop *direct64;
-   u64 dma_addr = 0;
+   bool found = false;
 
	spin_lock(&direct_window_list_lock);
	/* check if we already created a window and dupe that config if so */
	list_for_each_entry(window, &direct_window_list, list) {
if (window->device == pdn) {
direct64 = window->prop;
-   dma_addr = be64_to_cpu(direct64->dma_base);
+   *dma_addr = be64_to_cpu(direct64->dma_base);
*window_shift = be32_to_cpu(direct64->window_shift);
+   found = true;
break;
}
}
	spin_unlock(&direct_window_list_lock);
 
-   return dma_addr;
+   return found;
 }
 
 static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
@@ -1157,20 +1158,19 @@ static int iommu_get_page_shift(u32 query_page_size)
  * pdn: the parent pe node with the ibm,dma_window property
  * Future: also check if we can remap the base window for our base page size
  *
- * returns the dma offset for use by the direct mapped DMA code.
+ * returns true if can map all pages (direct mapping), false otherwise..
  */
-static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
int len = 0, ret;
int max_ram_len = order_base_2(ddw_memory_hotplug_max());
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
-   u64 dma_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
-   struct property *win64;
+   struct property *win64 = NULL;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
@@ -1182,8 +1182,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 
	mutex_lock(&direct_window_init_mutex);
 
-   dma_addr = find_existing_ddw(pdn, &len);
-   if (dma_addr != 0)
+   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len))
goto out_unlock;
 
/*
@@ -1338,7 +1337,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);
 
-   dma_addr = be64_to_cpu(ddwprop->dma_base);
+   dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base);
goto out_unlock;
 
 out_free_window:
@@ -1351,6 +1350,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
kfree(win64->name);
kfree(win64->value);
kfree(win64);
+   win64 = NULL;
 
 out_failed:
if (default_win_removed)
@@ -1370,10 +1370,10 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 * as RAM, then we failed to create a window to cover persistent
 * memory and need to set the DMA limit.
 */
-   if (pmem_present && dma_addr && (len == max_ram_len))
-   dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
+   if (pmem_present && win64 && (len == max_ram_len))
+   dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL 
<< len);
 
-   return dma_addr;
+   return win64;
 }
 
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
@@ -1452,11 +1452,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
pci_dev *pdev, u64 dma_mask)
break;
}
 
-   if (pdn && PCI_DN(pdn)) 

[PATCH v3 04/11] powerpc/pseries/iommu: Add ddw_list_new_entry() helper

2021-04-22 Thread Leonardo Bras
There are two functions creating direct_window_list entries in a
similar way, so create a ddw_list_new_entry() to avoid duplication and
simplify those functions.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 32 +-
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index d02359ca1f9f..6f14894d2d04 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -870,6 +870,21 @@ static u64 find_existing_ddw(struct device_node *pdn, int 
*window_shift)
return dma_addr;
 }
 
+static struct direct_window *ddw_list_new_entry(struct device_node *pdn,
+   const struct 
dynamic_dma_window_prop *dma64)
+{
+   struct direct_window *window;
+
+   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   if (!window)
+   return NULL;
+
+   window->device = pdn;
+   window->prop = dma64;
+
+   return window;
+}
+
 static int find_existing_ddw_windows(void)
 {
int len;
@@ -882,18 +897,15 @@ static int find_existing_ddw_windows(void)
 
for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
	direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
-   if (!direct64)
-   continue;
-
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
-   if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
-   kfree(window);
+   if (!direct64 || len < sizeof(*direct64)) {
remove_ddw(pdn, true);
continue;
}
 
-   window->device = pdn;
-   window->prop = direct64;
+   window = ddw_list_new_entry(pdn, direct64);
+   if (!window)
+   break;
+
	spin_lock(&direct_window_list_lock);
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);
@@ -1303,7 +1315,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
  create.liobn, dn);
 
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   window = ddw_list_new_entry(pdn, ddwprop);
if (!window)
goto out_clear_window;
 
@@ -1322,8 +1334,6 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_free_window;
}
 
-   window->device = pdn;
-   window->prop = ddwprop;
	spin_lock(&direct_window_list_lock);
	list_add(&window->list, &direct_window_list);
	spin_unlock(&direct_window_list_lock);
-- 
2.30.2



[PATCH v3 03/11] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper

2021-04-22 Thread Leonardo Bras
Creates a helper to allow allocating a new iommu_table without the need
to reallocate the iommu_group.

This will be helpful for replacing the iommu_table for the new DMA window,
after we remove the old one with iommu_tce_table_put().
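
A rough sketch of the intended later use, assumed from the description above
(not part of this patch):

        struct iommu_table *newtbl = iommu_pseries_alloc_table(pci->phb->node);

        if (newtbl) {
                /* drop the table that backed the old (default) window ... */
                iommu_tce_table_put(pci->table_group->tables[0]);
                /* ... and install a fresh one for the new DMA window */
                pci->table_group->tables[0] = newtbl;
        }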

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 796ab356341c..d02359ca1f9f 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,28 +53,31 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
-static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+static struct iommu_table *iommu_pseries_alloc_table(int node)
 {
-   struct iommu_table_group *table_group;
struct iommu_table *tbl;
 
-   table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
-  node);
-   if (!table_group)
-   return NULL;
-
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
if (!tbl)
-   goto free_group;
+   return NULL;
 
	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
	kref_init(&tbl->it_kref);
+   return tbl;
+}
 
-   table_group->tables[0] = tbl;
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+   struct iommu_table_group *table_group;
+
+   table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
+   if (!table_group)
+   return NULL;
 
-   return table_group;
+   table_group->tables[0] = iommu_pseries_alloc_table(node);
+   if (table_group->tables[0])
+   return table_group;
 
-free_group:
kfree(table_group);
return NULL;
 }
-- 
2.30.2



[PATCH v3 02/11] powerpc/kernel/iommu: Add new iommu_table_in_use() helper

2021-04-22 Thread Leonardo Bras
Having a function to check if the iommu table has any allocation helps
decide whether a tbl can be reset for using a new DMA window.

It should be enough to replace all instances of !bitmap_empty(tbl...).

iommu_table_in_use() skips reserved memory, so we don't need to worry about
releasing it before testing. This causes iommu_table_release_pages() to
become unnecessary, given it is only used to remove reserved memory for
testing.

Also, only allow storing reserved memory values in tbl if they are valid
in the table, so there is no need to check it in the new helper.
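
A minimal sketch of the intended caller pattern (surrounding names assumed):

        if (iommu_table_in_use(tbl)) {
                /* live mappings exist; don't reset the table for a new window */
                ret = -EBUSY;
        } else {
                /* safe to reuse or replace tbl for the new DMA window */
        }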

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 65 
 2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index deef7c94d7b6..bf3b84128525 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ad82dda81640..5e168bd91401 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -691,32 +691,24 @@ static void iommu_table_reserve_pages(struct iommu_table 
*tbl,
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
 
-   tbl->it_reserved_start = res_start;
-   tbl->it_reserved_end = res_end;
-
-   /* Check if res_start..res_end isn't empty and overlaps the table */
-   if (res_start && res_end &&
-   (tbl->it_offset + tbl->it_size < res_start ||
-res_end < tbl->it_offset))
-   return;
+   if (res_start < tbl->it_offset)
+   res_start = tbl->it_offset;
 
-   for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   set_bit(i - tbl->it_offset, tbl->it_map);
-}
+   if (res_end > (tbl->it_offset + tbl->it_size))
+   res_end = tbl->it_offset + tbl->it_size;
 
-static void iommu_table_release_pages(struct iommu_table *tbl)
-{
-   int i;
+   /* Check if res_start..res_end is a valid range in the table */
+   if (res_start >= res_end) {
+   tbl->it_reserved_start = tbl->it_offset;
+   tbl->it_reserved_end = tbl->it_offset;
+   return;
+   }
 
-   /*
-* In case we have reserved the first bit, we should not emit
-* the warning below.
-*/
-   if (tbl->it_offset == 0)
-   clear_bit(0, tbl->it_map);
+   tbl->it_reserved_start = res_start;
+   tbl->it_reserved_end = res_end;
 
for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   clear_bit(i - tbl->it_offset, tbl->it_map);
+   set_bit(i - tbl->it_offset, tbl->it_map);
 }
 
 /*
@@ -781,6 +773,22 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid,
return tbl;
 }
 
+bool iommu_table_in_use(struct iommu_table *tbl)
+{
+   unsigned long start = 0, end;
+
+   /* ignore reserved bit0 */
+   if (tbl->it_offset == 0)
+   start = 1;
+   end = tbl->it_reserved_start - tbl->it_offset;
+   if (find_next_bit(tbl->it_map, end, start) != end)
+   return true;
+
+   start = tbl->it_reserved_end - tbl->it_offset;
+   end = tbl->it_size;
+   return find_next_bit(tbl->it_map, end, start) != end;
+}
+
 static void iommu_table_free(struct kref *kref)
 {
unsigned long bitmap_sz;
@@ -799,10 +807,8 @@ static void iommu_table_free(struct kref *kref)
 
iommu_debugfs_del(tbl);
 
-   iommu_table_release_pages(tbl);
-
/* verify that table contains no entries */
-   if (!bitmap_empty(tbl->it_map, tbl->it_size))
+   if (iommu_table_in_use(tbl))
pr_warn("%s: Unexpected TCEs\n", __func__);
 
/* calculate bitmap size in bytes */
@@ -1108,18 +1114,13 @@ int iommu_take_ownership(struct iommu_table *tbl)
for (i = 0; i < tbl->nr_pools; i++)
	spin_lock(&tbl->pools[i].lock);
 
-   iommu_table_release_pages(tbl);
-
-   if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+   if (iommu_table_in_use(tbl)) {
pr_err("iommu_tce: it_map is not empty");
ret = -EBUSY;
-   /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
-   iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
-   tbl->it_re

[PATCH v3 01/11] powerpc/pseries/iommu: Replace hard-coded page shift

2021-04-22 Thread Leonardo Bras
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.

In the process, some defines like TCE_SHIFT were made obsolete, and then
removed.

IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5, shows
a 52-bit RPN and considers a 12-bit pageshift, so there should be no need
to use TCE_RPN_MASK, which masks out any bit after 40 in the rpn. Its
usage was removed from tce_build_pSeries(), tce_build_pSeriesLP(), and
tce_buildmulti_pSeriesLP().

Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers do not always have
a tbl struct, so adding a tceshift parameter seems the right thing to do.
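
As a worked example (values purely illustrative): with a 64K IOMMU page,
tbl->it_page_shift == 16, so the build path now computes

        rpn = __pa(uaddr) >> 16;                    /* e.g. 0x20010000 >> 16 == 0x2001 */
        *tcep = cpu_to_be64(proto_tce | rpn << 16);

whereas the old code always shifted by TCE_SHIFT (12) and masked the rpn to
40 bits, which is only correct for 4K pages.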

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/tce.h |  8 --
 arch/powerpc/platforms/pseries/iommu.c | 39 +++---
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
index db5fc2f2262d..0c34d2756d92 100644
--- a/arch/powerpc/include/asm/tce.h
+++ b/arch/powerpc/include/asm/tce.h
@@ -19,15 +19,7 @@
 #define TCE_VB 0
 #define TCE_PCI1
 
-/* TCE page size is 4096 bytes (1 << 12) */
-
-#define TCE_SHIFT  12
-#define TCE_PAGE_SIZE  (1 << TCE_SHIFT)
-
 #define TCE_ENTRY_SIZE 8   /* each TCE is 64 bits */
-
-#define TCE_RPN_MASK   0xfful  /* 40-bit RPN (4K pages) */
-#define TCE_RPN_SHIFT  12
 #define TCE_VALID  0x800   /* TCE valid */
 #define TCE_ALLIO  0x400   /* TCE valid for all lpars */
 #define TCE_PCI_WRITE  0x2 /* write from PCI allowed */
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 67c9953a6503..796ab356341c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -107,6 +107,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long 
index,
u64 proto_tce;
__be64 *tcep;
u64 rpn;
+   const unsigned long tceshift = tbl->it_page_shift;
+   const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
 
proto_tce = TCE_PCI_READ; // Read allowed
 
@@ -117,10 +119,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, 
long index,
 
while (npages--) {
/* can't move this out since we might cross MEMBLOCK boundary */
-   rpn = __pa(uaddr) >> TCE_SHIFT;
-   *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << 
TCE_RPN_SHIFT);
+   rpn = __pa(uaddr) >> tceshift;
+   *tcep = cpu_to_be64(proto_tce | rpn << tceshift);
 
-   uaddr += TCE_PAGE_SIZE;
+   uaddr += pagesize;
tcep++;
}
return 0;
@@ -146,7 +148,7 @@ static unsigned long tce_get_pseries(struct iommu_table 
*tbl, long index)
return be64_to_cpu(*tcep);
 }
 
-static void tce_free_pSeriesLP(unsigned long liobn, long, long);
+static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
 static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
 
 static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
@@ -166,12 +168,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long 
tcenum, long tceshift,
proto_tce |= TCE_PCI_WRITE;
 
while (npages--) {
-   tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
+   tce = proto_tce | rpn << tceshift;
rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
 
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
-   tce_free_pSeriesLP(liobn, tcenum_start,
+   tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
   (npages_start - (npages + 1)));
break;
}
@@ -205,10 +207,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
long tcenum_start = tcenum, npages_start = npages;
int ret = 0;
unsigned long flags;
+   const unsigned long tceshift = tbl->it_page_shift;
 
if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
return tce_build_pSeriesLP(tbl->it_index, tcenum,
-  tbl->it_page_shift, npages, uaddr,
+  tceshift, npages, uaddr,
   direction, attrs);
}
 
@@ -225,13 +228,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
if (!tcep) {
local_irq_restore(fla

[PATCH v3 00/11] DDW + Indirect Mapping

2021-04-22 Thread Leonardo Bras
So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes that, as the user can
configure hundreds of VFs; and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per PE to limit the waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW creation
is skipped and the default DMA window "ibm,dma-window" is used instead.

Using the DDW instead of the default DMA window may allow expanding the
amount of memory that can be DMA-mapped, given the number of pages (TCEs)
may stay the same (or increase) and the default DMA window offers only
4k pages while DDW may offer larger pages (4k, 64k, 16M ...).
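
As a rough worked example of that trade-off (numbers purely illustrative):

        2 GiB default window / 4 KiB pages   = 524288 TCEs
        524288 TCEs * 64 KiB DDW pages       = 32 GiB of DMA-mappable space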

Patch #1 replaces hard-coded 4K page size with a variable containing the
correct page size for the window.

Patch #2 introduces iommu_table_in_use(), and replaces manual bit-field
checking where it's used. It will be used for aborting enable_ddw() if
there is any current iommu allocation and we are trying single-window
indirect mapping.

Patch #3 introduces iommu_pseries_alloc_table() that will be helpful
when indirect mapping needs to replace the iommu_table.

Patch #4 adds helpers for adding DDWs in the list.

Patch #5 refactors enable_ddw() so it returns whether direct mapping is
possible, instead of the DMA offset. It helps the next patches on
indirect DMA mapping and also allows DMA windows starting at 0x00.

Patch #6 brings a new helper to simplify enable_ddw(), allowing
some reorganization for introducing the indirect mapping DDW.

Patch #7 adds a new helper _iommu_table_setparms() and uses it in the other
*setparms*() functions to fill the iommu_table. It will also be used for
creating a new iommu_table for indirect mapping.

Patch #8 updates remove_dma_window() to accept different property names,
so we can introduce a new property for indirect mapping.

Patch #9 extracts find_existing_ddw_windows() into
find_existing_ddw_windows_named(), and calls it with its property name.
This will be useful when the property for indirect mapping is created,
so we can search the device-tree for both properties.

Patch #10:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance. Also, update the iommu_table and re-generate the pools.
It introduces a new property name for DDW with indirect DMA mapping.

Patch #11:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.

All patches were tested in an LPAR with a virtio-net interface, which
allows the default DMA window and DDW to coexist.

Leonardo Bras (11):
  powerpc/pseries/iommu: Replace hard-coded page shift
  powerpc/kernel/iommu: Add new iommu_table_in_use() helper
  powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
  powerpc/pseries/iommu: Add ddw_list_new_entry() helper
  powerpc/pseries/iommu: Allow DDW windows starting at 0x00
  powerpc/pseries/iommu: Add ddw_property_create() and refactor
enable_ddw()
  powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new
helper
  powerpc/pseries/iommu: Update remove_dma_window() to accept property
name
  powerpc/pseries/iommu: Find existing DDW with given property name
  powerpc/pseries/iommu: Make use of DDW for indirect mapping
  powerpc/pseries/iommu: Rename "direct window" to "dma window"

 arch/powerpc/include/asm/iommu.h   |   1 +
 arch/powerpc/include/asm/tce.h |   8 -
 arch/powerpc/kernel/iommu.c|  65 ++--
 arch/powerpc/platforms/pseries/iommu.c | 504 +++--
 4 files changed, 338 insertions(+), 240 deletions(-)

-- 
2.30.2



Re: [PATCH 1/1] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE() to save TCEs

2021-04-22 Thread Leonardo Bras
Hello,

This patch was also reviewed when it was part of another patchset:
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200911170738.82818-4-leobra...@gmail.com/

On Thu, 2021-03-18 at 14:44 -0300, Leonardo Bras wrote:
> Currently both iommu_alloc_coherent() and iommu_free_coherent() align the
> desired allocation size to PAGE_SIZE, and gets system pages and IOMMU
> mappings (TCEs) for that value.
> 
> When IOMMU_PAGE_SIZE < PAGE_SIZE, this behavior may cause unnecessary
> TCEs to be created for mapping the whole system page.
> 
> Example:
> - PAGE_SIZE = 64k, IOMMU_PAGE_SIZE() = 4k
> - iommu_alloc_coherent() is called for 128 bytes
> - 1 system page (64k) is allocated
> - 16 IOMMU pages (16 x 4k) are allocated (16 TCEs used)
> 
> It would be enough to use a single TCE for this, so 15 TCEs are
> wasted in the process.
> 
> Update iommu_*_coherent() to make sure the size alignment happens only
> for IOMMU_PAGE_SIZE() before calling iommu_alloc() and iommu_free().
> 
> Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift)
> with IOMMU_PAGE_ALIGN(n, tbl), which is easier to read and does the
> same.
> 
> Signed-off-by: Leonardo Bras 
> Reviewed-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/kernel/iommu.c | 11 ++-
>  1 file changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 5b69a6a72a0e..3329ef045805 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -851,6 +851,7 @@ void *iommu_alloc_coherent(struct device *dev, struct 
> iommu_table *tbl,
>   unsigned int order;
>   unsigned int nio_pages, io_order;
>   struct page *page;
> + size_t size_io = size;
>  
> 
>   size = PAGE_ALIGN(size);
>   order = get_order(size);
> @@ -877,8 +878,9 @@ void *iommu_alloc_coherent(struct device *dev, struct 
> iommu_table *tbl,
>   memset(ret, 0, size);
>  
> 
>   /* Set up tces to cover the allocated range */
> - nio_pages = size >> tbl->it_page_shift;
> - io_order = get_iommu_order(size, tbl);
> + size_io = IOMMU_PAGE_ALIGN(size_io, tbl);
> + nio_pages = size_io >> tbl->it_page_shift;
> + io_order = get_iommu_order(size_io, tbl);
>   mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
>     mask >> tbl->it_page_shift, io_order, 0);
>   if (mapping == DMA_MAPPING_ERROR) {
> @@ -893,10 +895,9 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
> size,
>    void *vaddr, dma_addr_t dma_handle)
>  {
>   if (tbl) {
> - unsigned int nio_pages;
> + size_t size_io = IOMMU_PAGE_ALIGN(size, tbl);
> + unsigned int nio_pages = size_io >> tbl->it_page_shift;
>  
> 
> - size = PAGE_ALIGN(size);
> - nio_pages = size >> tbl->it_page_shift;
>   iommu_free(tbl, dma_handle, nio_pages);
>   size = PAGE_ALIGN(size);
>   free_pages((unsigned long)vaddr, get_order(size));




Re: [PATCH 1/1] powerpc/kernel/iommu: Use largepool as a last resort when !largealloc

2021-04-22 Thread Leonardo Bras
Hello,

FYI: This patch was reviewed when it was part of another patchset:
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200817234033.442511-4-leobra...@gmail.com/


On Thu, 2021-03-18 at 14:44 -0300, Leonardo Bras wrote:
> As of today, doing iommu_range_alloc() only for !largealloc (npages <= 15)
> will only be able to use 3/4 of the available pages, given pages on
> largepool  not being available for !largealloc.
> 
> This could mean some drivers not being able to fully use all the available
> pages for the DMA window.
> 
> Add pages on largepool as a last resort for !largealloc, making all pages
> of the DMA window available.
> 
> Signed-off-by: Leonardo Bras 
> Reviewed-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/kernel/iommu.c | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 3329ef045805..ae6ad8dca605 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -255,6 +255,15 @@ static unsigned long iommu_range_alloc(struct device 
> *dev,
>   pass++;
>   goto again;
>  
> 
> + } else if (pass == tbl->nr_pools + 1) {
> + /* Last resort: try largepool */
> + spin_unlock(&pool->lock);
> + pool = &tbl->large_pool;
> + spin_lock(&pool->lock);
> + pool->hint = pool->start;
> + pass++;
> + goto again;
> +
>   } else {
>   /* Give up */
>   spin_unlock_irqrestore(&(pool->lock), flags);




Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-21 Thread Leonardo Bras
On Tue, 2021-04-20 at 17:34 -0500, Rob Herring wrote:
> > [...]
> > I think the point here is bus resources not getting the MEM_64 flag,
> > but device resources getting it correctly. Is that supposed to happen?
> 
> I experimented with this on Arm with qemu and it seems fine there too.
> Looks like the BARs are first read and will have bit 2 set by default
> (or hardwired?). Now I'm just wondering why powerpc needs the code it
> has...
> 
> Anyways, I'll apply the patch.
> 
> Rob

Thanks Rob!




Re: [PATCH 1/1] powerpc/pseries/iommu: Fix window size for direct mapping with pmem

2021-04-19 Thread Leonardo Bras
On Tue, 2021-04-20 at 15:18 +1000, Alexey Kardashevskiy wrote:
> 
> On 20/04/2021 14:54, Leonardo Bras wrote:
> > As of today, if the DDW is big enough to fit (1 << MAX_PHYSMEM_BITS) it's
> > possible to use direct DMA mapping even with pmem region.
> > 
> > But, if that happens, the window size (len) is set to
> > (MAX_PHYSMEM_BITS - page_shift) instead of MAX_PHYSMEM_BITS, causing a
> > pagesize times smaller DDW to be created, being insufficient for correct
> > usage.
> > 
> > Fix this so the correct window size is used in this case.
> 
> Good find indeed.
> 
> afaict this does not create a huge problem though as 
> query.largest_available_block is always smaller than (MAX_PHYSMEM_BITS - 
> page_shift) where it matters (phyp).
> 
> 
> Reviewed-by: Alexey Kardashevskiy 
> 

Thanks for reviewing!

Leonardo Bras



[PATCH 1/1] powerpc/pseries/iommu: Fix window size for direct mapping with pmem

2021-04-19 Thread Leonardo Bras
As of today, if the DDW is big enough to fit (1 << MAX_PHYSMEM_BITS) it's
possible to use direct DMA mapping even with a pmem region present.

But, if that happens, the window size (len) is set to
(MAX_PHYSMEM_BITS - page_shift) instead of MAX_PHYSMEM_BITS, causing the
created DDW to be pagesize times smaller than needed, which is
insufficient for correct usage.

Fix this so the correct window size is used in this case.
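
For example, assuming MAX_PHYSMEM_BITS = 51 and a 64K page (page_shift = 16):
the buggy code requests len = 51 - 16 = 35, i.e. a 32 GiB window, instead of
len = 51, i.e. the 2 PiB needed to cover the physical address space 1:1.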

Fixes: bf6e2d562bbc4("powerpc/dma: Fallback to dma_ops when persistent memory 
present")
Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..836cbbe0ecc5 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1229,7 +1229,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
if (pmem_present) {
if (query.largest_available_block >=
(1ULL << (MAX_PHYSMEM_BITS - page_shift)))
-   len = MAX_PHYSMEM_BITS - page_shift;
+   len = MAX_PHYSMEM_BITS;
else
dev_info(>dev, "Skipping ibm,pmemory");
}
-- 
2.30.2



Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-19 Thread Leonardo Bras
On Mon, 2021-04-19 at 20:39 -0500, Rob Herring wrote:
> On Mon, Apr 19, 2021 at 7:35 PM Leonardo Bras  wrote:
> > 
> > On Mon, 2021-04-19 at 10:44 -0500, Rob Herring wrote:
> > > On Fri, Apr 16, 2021 at 3:58 PM Leonardo Bras  wrote:
> > > > 
> > > > Hello Rob, thanks for this feedback!
> > > > 
> > > > On Thu, 2021-04-15 at 13:59 -0500, Rob Herring wrote:
> > > > > +PPC and PCI lists
> > > > > 
> > > > > On Thu, Apr 15, 2021 at 1:01 PM Leonardo Bras  
> > > > > wrote:
> > > > > > 
> > > > > > Many other resource flag parsers already add this flag when the 
> > > > > > input
> > > > > > has bits 24 & 25 set, so update this one to do the same.
> > > > > 
> > > > > Many others? Looks like sparc and powerpc to me.
> > > > > 
> > > > 
> > > > s390 also does that, but it look like it comes from a device-tree.
> > > 
> > > I'm only looking at DT based platforms, and s390 doesn't use DT.
> > 
> > Correct.
> > Sorry, I somehow write above the opposite of what I was thinking.
> > 
> > > 
> > > > > Those would be the
> > > > > ones I worry about breaking. Sparc doesn't use of/address.c so it's
> > > > > fine. Powerpc version of the flags code was only fixed in 2019, so I
> > > > > don't think powerpc will care either.
> > > > 
> > > > In powerpc I reach this function with this stack, while configuring a
> > > > virtio-net device for a qemu/KVM pseries guest:
> > > > 
> > > > pci_process_bridge_OF_ranges+0xac/0x2d4
> > > > pSeries_discover_phbs+0xc4/0x158
> > > > discover_phbs+0x40/0x60
> > > > do_one_initcall+0x60/0x2d0
> > > > kernel_init_freeable+0x308/0x3a8
> > > > kernel_init+0x2c/0x168
> > > > ret_from_kernel_thread+0x5c/0x70
> > > > 
> > > > For this, both MMIO32 and MMIO64 resources will have flags 0x200.
> > > 
> > > Oh good, powerpc has 2 possible flags parsing functions. So in the
> > > above path, do we need to set PCI_BASE_ADDRESS_MEM_TYPE_64?
> > > 
> > > Does pci_parse_of_flags() get called in your case?
> > > 
> > 
> > It's called in some cases, but not for the device I am debugging
> > (virtio-net pci@8002000).
> > 
> > For the above device, here is an expanded stack trace:
> > 
> > of_bus_pci_get_flags() (from parser->bus->get_flags())
> > of_pci_range_parser_one() (from macro for_each_of_pci_range)
> > pci_process_bridge_OF_ranges+0xac/0x2d4
> > pSeries_discover_phbs+0xc4/0x158
> > discover_phbs+0x40/0x60
> > do_one_initcall+0x60/0x2d0
> > kernel_init_freeable+0x308/0x3a8
> > kernel_init+0x2c/0x168
> > ret_from_kernel_thread+0x5c/0x70
> > 
> > For other devices, I could also see the following stack trace:
> > ## device ethernet@8
> > 
> > pci_parse_of_flags()
> > of_create_pci_dev+0x7f0/0xa40
> > __of_scan_bus+0x248/0x320
> > pcibios_scan_phb+0x370/0x3b0
> > pcibios_init+0x8c/0x12c
> > do_one_initcall+0x60/0x2d0
> > kernel_init_freeable+0x308/0x3a8
> > kernel_init+0x2c/0x168
> > ret_from_kernel_thread+0x5c/0x70
> > 
> > Devices that get parsed with of_bus_pci_get_flags() appears first at
> > dmesg (around 0.015s in my test), while devices that get parsed by
> > pci_parse_of_flags() appears later (0.025s in my test).
> > 
> > I am not really used to this code, but having the term "discover phbs"
> > in the first trace and the term "scan phb" in the second, makes me
> > wonder if the first trace is seen on devices that are seen/described in
> > the device-tree and the second trace is seen in devices not present in
> > the device-tree and found scanning pci bus.
> 
> That was my guess as well. I think on pSeries that most PCI devices
> are in the DT whereas on Arm and other flattened DT (non OpenFirmware)
> platforms PCI devices are not in DT.
> 

It makes sense to me. 

>  Of course, for virtio devices,
> they would not be in DT in either case.

I don't get this part... in pseries it looks like virtio devices can be
in device-tree.

Oh, I think I get it... this pci@8002000 looks like a bus
(described in device-tree, so discovered), and then the devices are
inside it, getting scanned.

The virtio device gets the correct flags (from pci_parse_of_flags), but
the bus (pci@8002000) does not seem to get it correctly,
because 

Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-19 Thread Leonardo Bras
On Mon, 2021-04-19 at 10:44 -0500, Rob Herring wrote:
> On Fri, Apr 16, 2021 at 3:58 PM Leonardo Bras  wrote:
> > 
> > Hello Rob, thanks for this feedback!
> > 
> > On Thu, 2021-04-15 at 13:59 -0500, Rob Herring wrote:
> > > +PPC and PCI lists
> > > 
> > > On Thu, Apr 15, 2021 at 1:01 PM Leonardo Bras  wrote:
> > > > 
> > > > Many other resource flag parsers already add this flag when the input
> > > > has bits 24 & 25 set, so update this one to do the same.
> > > 
> > > Many others? Looks like sparc and powerpc to me.
> > > 
> > 
> > s390 also does that, but it look like it comes from a device-tree.
> 
> I'm only looking at DT based platforms, and s390 doesn't use DT.

Correct. 
Sorry, I somehow wrote above the opposite of what I was thinking.

> 
> > > Those would be the
> > > ones I worry about breaking. Sparc doesn't use of/address.c so it's
> > > fine. Powerpc version of the flags code was only fixed in 2019, so I
> > > don't think powerpc will care either.
> > 
> > In powerpc I reach this function with this stack, while configuring a
> > virtio-net device for a qemu/KVM pseries guest:
> > 
> > pci_process_bridge_OF_ranges+0xac/0x2d4
> > pSeries_discover_phbs+0xc4/0x158
> > discover_phbs+0x40/0x60
> > do_one_initcall+0x60/0x2d0
> > kernel_init_freeable+0x308/0x3a8
> > kernel_init+0x2c/0x168
> > ret_from_kernel_thread+0x5c/0x70
> > 
> > For this, both MMIO32 and MMIO64 resources will have flags 0x200.
> 
> Oh good, powerpc has 2 possible flags parsing functions. So in the
> above path, do we need to set PCI_BASE_ADDRESS_MEM_TYPE_64?
> 
> Does pci_parse_of_flags() get called in your case?
> 

It's called in some cases, but not for the device I am debugging
(virtio-net pci@8002000). 

For the above device, here is an expanded stack trace:

of_bus_pci_get_flags() (from parser->bus->get_flags()) 
of_pci_range_parser_one() (from macro for_each_of_pci_range)
pci_process_bridge_OF_ranges+0xac/0x2d4
pSeries_discover_phbs+0xc4/0x158
discover_phbs+0x40/0x60
do_one_initcall+0x60/0x2d0
kernel_init_freeable+0x308/0x3a8
kernel_init+0x2c/0x168
ret_from_kernel_thread+0x5c/0x70

For other devices, I could also see the following stack trace:
## device ethernet@8

pci_parse_of_flags()
of_create_pci_dev+0x7f0/0xa40
__of_scan_bus+0x248/0x320
pcibios_scan_phb+0x370/0x3b0
pcibios_init+0x8c/0x12c
do_one_initcall+0x60/0x2d0
kernel_init_freeable+0x308/0x3a8
kernel_init+0x2c/0x168
ret_from_kernel_thread+0x5c/0x70

Devices that get parsed with of_bus_pci_get_flags() appears first at
dmesg (around 0.015s in my test), while devices that get parsed by
pci_parse_of_flags() appears later (0.025s in my test).

I am not really used to this code, but having the term "discover phbs"
in the first trace and the term "scan phb" in the second, makes me
wonder if the first trace is seen on devices that are seen/described in
the device-tree and the second trace is seen on devices not present in
the device-tree and found by scanning the PCI bus.

> > > I noticed both sparc and powerpc set PCI_BASE_ADDRESS_MEM_TYPE_64 in
> > > the flags. AFAICT, that's not set anywhere outside of arch code. So
> > > never for riscv, arm and arm64 at least. That leads me to
> > > pci_std_update_resource() which is where the PCI code sets BARs and
> > > just copies the flags in PCI_BASE_ADDRESS_MEM_MASK ignoring
> > > IORESOURCE_* flags. So it seems like 64-bit is still not handled and
> > > neither is prefetch.
> > > 
> > 
> > I am not sure if you mean here:
> > a) it's ok to add IORESOURCE_MEM_64 here, because it does not affect
> > anything else, or
> > b) it should be using PCI_BASE_ADDRESS_MEM_TYPE_64
> > (or IORESOURCE_MEM_64 | PCI_BASE_ADDRESS_MEM_TYPE_64) instead, since
> > it's how it's added in powerpc/sparc, and else there is no point.
> 
> I'm wondering if a) is incomplete and PCI_BASE_ADDRESS_MEM_TYPE_64
> also needs to be set. The question is ultimately are BARs getting set
> correctly for 64-bit? It looks to me like they aren't.

I am not used to these terms; does BAR mean 'Base Address Register'?

If so, those are the addresses stored in pci->phb->mem_resources[i] and
pci->phb->mem_offset[i], printed from enable_ddw() (which takes place a
lot after discovering the device (0.17s in my run)).

resource #1 pci@8002000: start=0x20008000
end=0x2000 flags=0x200 desc=0x0 offset=0x2000
resource #2 pci@8002000: start=0x2100
end=0x21ff flags=0x200 desc=0x0 offset=0x0

The message above was printed without this patch.
With the patch, the flags for memory resource #2 get ORed with 
0x0010.

Is it enough to know if BARs are correctly set for 64-bit?
If it's not, how can I check?

> 
> Rob

Thanks Rob!

Leonardo Brás



Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-16 Thread Leonardo Bras
Hello Rob, thanks for this feedback!

On Thu, 2021-04-15 at 13:59 -0500, Rob Herring wrote:
> +PPC and PCI lists
> 
> On Thu, Apr 15, 2021 at 1:01 PM Leonardo Bras  wrote:
> > 
> > Many other resource flag parsers already add this flag when the input
> > has bits 24 & 25 set, so update this one to do the same.
> 
> Many others? Looks like sparc and powerpc to me. 
> 

s390 also does that, but it looks like it comes from a device-tree.

> Those would be the
> ones I worry about breaking. Sparc doesn't use of/address.c so it's
> fine. Powerpc version of the flags code was only fixed in 2019, so I
> don't think powerpc will care either.

In powerpc I reach this function with this stack, while configuring a
virtio-net device for a qemu/KVM pseries guest:

pci_process_bridge_OF_ranges+0xac/0x2d4
pSeries_discover_phbs+0xc4/0x158
discover_phbs+0x40/0x60
do_one_initcall+0x60/0x2d0
kernel_init_freeable+0x308/0x3a8
kernel_init+0x2c/0x168
ret_from_kernel_thread+0x5c/0x70

For this, both MMIO32 and MMIO64 resources will have flags 0x200.

> 
> I noticed both sparc and powerpc set PCI_BASE_ADDRESS_MEM_TYPE_64 in
> the flags. AFAICT, that's not set anywhere outside of arch code. So
> never for riscv, arm and arm64 at least. That leads me to
> pci_std_update_resource() which is where the PCI code sets BARs and
> just copies the flags in PCI_BASE_ADDRESS_MEM_MASK ignoring
> IORESOURCE_* flags. So it seems like 64-bit is still not handled and
> neither is prefetch.
> 

I am not sure if you mean here:
a) it's ok to add IORESOURCE_MEM_64 here, because it does not affect
anything else, or
b) it should be using PCI_BASE_ADDRESS_MEM_TYPE_64 
(or IORESOURCE_MEM_64 | PCI_BASE_ADDRESS_MEM_TYPE_64) instead, since
it's how it's added in powerpc/sparc, and else there is no point.

Again, thanks for helping!

Best regards,
Leonardo Bras



Re: [PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-13 Thread Leonardo Bras
On Mon, 2021-04-12 at 17:21 -0500, Segher Boessenkool wrote:
> On Fri, Apr 09, 2021 at 02:36:16PM +1000, Alexey Kardashevskiy wrote:
> > On 08/04/2021 19:04, Michael Ellerman wrote:
> > > > > > +#define QUERY_DDW_PGSIZE_4K0x01
> > > > > > +#define QUERY_DDW_PGSIZE_64K   0x02
> > > > > > +#define QUERY_DDW_PGSIZE_16M   0x04
> > > > > > +#define QUERY_DDW_PGSIZE_32M   0x08
> > > > > > +#define QUERY_DDW_PGSIZE_64M   0x10
> > > > > > +#define QUERY_DDW_PGSIZE_128M  0x20
> > > > > > +#define QUERY_DDW_PGSIZE_256M  0x40
> > > > > > +#define QUERY_DDW_PGSIZE_16G   0x80
> > > > > 
> > > > > I'm not sure the #defines really gain us much vs just putting the
> > > > > literal values in the array below?
> > > > 
> > > > Then someone says "u magic values" :) I do not mind either way. 
> > > > Thanks,
> > > 
> > > Yeah that's true. But #defining them doesn't make them less magic, if
> > > you only use them in one place :)
> > 
> > Defining them with "QUERY_DDW" in the names kinda tells where they are 
> > from. Can also grep QEMU using these to see how the other side handles 
> > it. Dunno.
> 
> And *not* defining anything reduces the mental load a lot.  You can add
> a comment at the single spot you use them, explaining what this is, in a
> much better way!
> 
> Comments are *good*.
> 
> 
> Segher

Thanks for the feedback Alexey, Michael and Segher!

I have sent a v3 for this patch. 
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210408201915.174217-1-leobra...@gmail.com/

Please let me know of your feedback in it.

Best regards,
Leonardo Bras



Re: [PATCH v2 13/14] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-04-13 Thread Leonardo Bras
On Tue, 2021-04-13 at 18:24 +1000, Alexey Kardashevskiy wrote:
> 
> On 13/04/2021 17:58, Leonardo Bras wrote:
> > On Tue, 2021-04-13 at 17:41 +1000, Alexey Kardashevskiy wrote:
> > > 
> > > On 13/04/2021 17:33, Leonardo Bras wrote:
> > > > On Tue, 2021-04-13 at 17:18 +1000, Alexey Kardashevskiy wrote:
> > > > > 
> > > > > On 13/04/2021 15:49, Leonardo Bras wrote:
> > > > > > Thanks for the feedback!
> > > > > > 
> > > > > > On Tue, 2020-09-29 at 13:56 +1000, Alexey Kardashevskiy wrote:
> > > > > > > > -static bool find_existing_ddw(struct device_node *pdn, u64 
> > > > > > > > *dma_addr)
> > > > > > > > +static phys_addr_t ddw_memory_hotplug_max(void)
> > > > > > > 
> > > > > > > 
> > > > > > > Please, forward declaration or a separate patch; this creates
> > > > > > > unnecessary noise to the actual change.
> > > > > > > 
> > > > > > 
> > > > > > Sure, done!
> > > > > > 
> > > > > > > 
> > > > > > > > +   _iommu_table_setparms(tbl, 
> > > > > > > > pci->phb->bus->number, create.liobn, win_addr,
> > > > > > > > + 1UL << len, page_shift, 
> > > > > > > > 0, &iommu_table_lpar_multi_ops);
> > > > > > > > +   iommu_init_table(tbl, pci->phb->node, 0, 0);
> > > > > > > 
> > > > > > > 
> > > > > > > It is 0,0 only if win_addr>0 which is not the QEMU case.
> > > > > > > 
> > > > > > 
> > > > > > Oh, ok.
> > > > > > I previously though it was ok to use 0,0 here as any other usage in
> > > > > > this file was also 0,0.
> > > > > > 
> > > > > > What should I use to get the correct parameters? Use the previous 
> > > > > > tbl
> > > > > > it_reserved_start and tbl->it_reserved_end is enough?
> > > > > 
> > > > > depends on whether you carry reserved start/end even if they are 
> > > > > outside
> > > > > of the dma window.
> > > > > 
> > > > 
> > > > Oh, that makes sense.
> > > > On a previous patch (5/14 IIRC), I changed the behavior to only store
> > > > the valid range on tbl, but now I understand why it's important to
> > > > store the raw value.
> > > > 
> > > > Ok, I will change it back so the reserved range stays in tbl even if it
> > > > does not intersect with the DMA window. This way I can reuse the values
> > > > in case of indirect mapping with DDW.
> > > > 
> > > > Is that ok? Are the reserved values are supposed to stay the same after
> > > > changing from Default DMA window to DDW?
> > > 
> > > I added them to know what bits in it_map to ignore when checking if
> > > there is any active user of the table. If you have non zero reserved
> > > start/end but they do not affect it_map, then it is rather weird way to
> > > carry reserved start/end from DDW to no-DDW.
> > > 
> > 
> > Ok, agreed.
> > 
> > >   May be do not set these at
> > > all for DDW with window start at 1<<59 and when going back to no-DDW (or
> > > if DDW starts at 0) - just set them from MMIO32, just as they are
> > > initialized in the first place.
> > > 
> > 
> > If I get it correctly from pci_of_scan.c, MMIO32 = {0, 32MB}, is that
> > correct?
> 
> No, under QEMU it is 0x8000.-0x1..:
> 
> /proc/device-tree/pci@8002000/ranges
> 
> 7 cells for each resource, the second one is MMIO32 (the first is IO 
> ports, the last is 64bit MMIO).
> > 
> > So, if DDW starts at any value in this range (most probably at zero),
> > we should remove the rest, is that correct?
> > 
> > Could it always use iommu_init_table(..., 0, 32MB) here, so it always
> > reserve any part of the DMA window that's in this range? Ot there may
> > be other reserved values range?
> > 
> > > and when going back to no-DDW
> > 
> > After iommu_init_table() there should be no failure, so it looks like
> > there is no 'going back to no-DDW'. Am I missing something?
> 
> Well, a random driver could request 32bit DMA and if the new window is 
> 1:1, then it would break but this does not seem to happen and we do not 
> support it anyway so no loss here.
> 

So you would recommend reading "ranges" with of_get_property() and
using the second entry (cells 7 - 13) at this point, getting base & size
to make sure it does not map anything here? (should have no effect if the
value does not intersect with the DMA window)
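
Something like the rough sketch below is what I have in mind (the cell layout
-- 3 PCI address cells + 2 parent cells + 2 size cells per entry -- is assumed
from the PHB node format discussed above, so please correct me if wrong):

        const __be32 *ranges;
        int len;

        ranges = of_get_property(phb_node, "ranges", &len);
        if (ranges && len >= 2 * 7 * sizeof(__be32)) {
                const __be32 *mmio32 = ranges + 7;              /* second entry */
                u64 bus_addr = of_read_number(mmio32 + 1, 2);   /* PCI bus address */
                u64 size     = of_read_number(mmio32 + 5, 2);   /* window size */

                /* reserve [bus_addr, bus_addr + size) in the new table if it
                 * intersects the DMA window */
        }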

Thank you for reviewing!
Leonardo Bras



Re: [PATCH v2 13/14] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-04-13 Thread Leonardo Bras
On Tue, 2021-04-13 at 17:41 +1000, Alexey Kardashevskiy wrote:
> 
> On 13/04/2021 17:33, Leonardo Bras wrote:
> > On Tue, 2021-04-13 at 17:18 +1000, Alexey Kardashevskiy wrote:
> > > 
> > > On 13/04/2021 15:49, Leonardo Bras wrote:
> > > > Thanks for the feedback!
> > > > 
> > > > On Tue, 2020-09-29 at 13:56 +1000, Alexey Kardashevskiy wrote:
> > > > > > -static bool find_existing_ddw(struct device_node *pdn, u64 
> > > > > > *dma_addr)
> > > > > > +static phys_addr_t ddw_memory_hotplug_max(void)
> > > > > 
> > > > > 
> > > > > Please, forward declaration or a separate patch; this creates
> > > > > unnecessary noise to the actual change.
> > > > > 
> > > > 
> > > > Sure, done!
> > > > 
> > > > > 
> > > > > > +   _iommu_table_setparms(tbl, pci->phb->bus->number, 
> > > > > > create.liobn, win_addr,
> > > > > > + 1UL << len, page_shift, 0, 
> > > > > > &iommu_table_lpar_multi_ops);
> > > > > > +   iommu_init_table(tbl, pci->phb->node, 0, 0);
> > > > > 
> > > > > 
> > > > > It is 0,0 only if win_addr>0 which is not the QEMU case.
> > > > > 
> > > > 
> > > > Oh, ok.
> > > > I previously though it was ok to use 0,0 here as any other usage in
> > > > this file was also 0,0.
> > > > 
> > > > What should I use to get the correct parameters? Use the previous tbl
> > > > it_reserved_start and tbl->it_reserved_end is enough?
> > > 
> > > depends on whether you carry reserved start/end even if they are outside
> > > of the dma window.
> > > 
> > 
> > Oh, that makes sense.
> > On a previous patch (5/14 IIRC), I changed the behavior to only store
> > the valid range on tbl, but now I understand why it's important to
> > store the raw value.
> > 
> > Ok, I will change it back so the reserved range stays in tbl even if it
> > does not intersect with the DMA window. This way I can reuse the values
> > in case of indirect mapping with DDW.
> > 
> > Is that ok? Are the reserved values are supposed to stay the same after
> > changing from Default DMA window to DDW?
> 
> I added them to know what bits in it_map to ignore when checking if 
> there is any active user of the table. If you have non zero reserved 
> start/end but they do not affect it_map, then it is rather weird way to 
> carry reserved start/end from DDW to no-DDW.
> 

Ok, agreed.

>  May be do not set these at 
> all for DDW with window start at 1<<59 and when going back to no-DDW (or 
> if DDW starts at 0) - just set them from MMIO32, just as they are 
> initialized in the first place.
> 

If I get it correctly from pci_of_scan.c, MMIO32 = {0, 32MB}, is that
correct?

So, if DDW starts at any value in this range (most probably at zero),
we should remove the rest, is that correct?

Could it always use iommu_init_table(..., 0, 32MB) here, so it always
reserves any part of the DMA window that's in this range? Or may there
be other reserved value ranges?

> and when going back to no-DDW 

After iommu_init_table() there should be no failure, so it looks like
there is no 'going back to no-DDW'. Am I missing something?

Thanks for helping!

Best regards,
Leonardo Bras



Re: [PATCH v2 13/14] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-04-13 Thread Leonardo Bras
On Tue, 2021-04-13 at 17:18 +1000, Alexey Kardashevskiy wrote:
> 
> On 13/04/2021 15:49, Leonardo Bras wrote:
> > Thanks for the feedback!
> > 
> > On Tue, 2020-09-29 at 13:56 +1000, Alexey Kardashevskiy wrote:
> > > > -static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
> > > > +static phys_addr_t ddw_memory_hotplug_max(void)
> > > 
> > > 
> > > Please, forward declaration or a separate patch; this creates
> > > unnecessary noise to the actual change.
> > > 
> > 
> > Sure, done!
> > 
> > > 
> > > > +   _iommu_table_setparms(tbl, pci->phb->bus->number, 
> > > > create.liobn, win_addr,
> > > > + 1UL << len, page_shift, 0, 
> > > > _table_lpar_multi_ops);
> > > > +   iommu_init_table(tbl, pci->phb->node, 0, 0);
> > > 
> > > 
> > > It is 0,0 only if win_addr>0 which is not the QEMU case.
> > > 
> > 
> > Oh, ok.
> > I previously though it was ok to use 0,0 here as any other usage in
> > this file was also 0,0.
> > 
> > What should I use to get the correct parameters? Use the previous tbl
> > it_reserved_start and tbl->it_reserved_end is enough?
> 
> depends on whether you carry reserved start/end even if they are outside 
> of the dma window.
> 

Oh, that makes sense.
On a previous patch (5/14 IIRC), I changed the behavior to only store
the valid range on tbl, but now I understand why it's important to
store the raw value.

Ok, I will change it back so the reserved range stays in tbl even if it
does not intersect with the DMA window. This way I can reuse the values
in case of indirect mapping with DDW.

Is that ok? Are the reserved values supposed to stay the same after
changing from the Default DMA window to DDW?

Best regards,
Leonardo Bras



Re: [PATCH v2 14/14] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2021-04-13 Thread Leonardo Bras
On Wed, 2020-09-30 at 17:29 +1000, Alexey Kardashevskiy wrote:
> 
> On 30/09/2020 06:54, Leonardo Bras wrote:
> > On Tue, 2020-09-29 at 13:55 +1000, Alexey Kardashevskiy wrote:
> > > 
> > > On 12/09/2020 03:07, Leonardo Bras wrote:
> > > > Cc: linuxppc-dev@lists.ozlabs.org, linux-ker...@vger.kernel.org,
> > > > 
> > > > A previous change introduced the usage of DDW as a bigger indirect DMA
> > > > mapping when the DDW available size does not map the whole partition.
> > > > 
> > > > As most of the code that manipulates direct mappings was reused for
> > > > indirect mappings, it's necessary to rename all names and debug/info
> > > > messages to reflect that it can be used for both kinds of mapping.
> > > > 
> > > > Also, defines DEFAULT_DMA_WIN as "ibm,dma-window" to document that
> > > > it's the name of the default DMA window.
> > > 
> > > "ibm,dma-window" is so old so it does not need a macro (which btw would
> > > be DMA_WIN_PROPNAME to match the other names) :)
> > 
> > Thanks for bringing that to my attention!
In fact, DMA_WIN_PROPNAME makes more sense, but it's still generic and
doesn't make it clear that it points to the default window.

Would it be ok to call it DEFAULT_WIN_PROPNAME ?
> 
> 
> I would not touch it at all, the property name is painfully known and 
> not going to change ever. Does anyone else define it as a macro? I do 
> not see any:

Ok then, reverting define :)

Thanks!

> 
> [fstn1-p1 kernel-dma-bypass]$ git grep "ibm,dma-window"  | wc -l
> 8
> [fstn1-p1 kernel-dma-bypass]$ git grep "define.*ibm,dma-window"  | wc -l
> 0
> 
> 
> 
> > 
> > 
> > > 
> > > 
> > > > Those changes are not supposed to change how the code works in any
> > > > way, just adjust naming.
> > > 
> > > I simply have this in my .vimrc for the cases like this one:
> > > 
> > > ===
> > > This should cause no behavioural change.
> > > ===
> > 
> > Great tip! I will make sure to have this saved here :)
> > 
> > Thank you!
> > 
> 




Re: [PATCH v2 13/14] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-04-12 Thread Leonardo Bras
Thanks for the feedback!

On Tue, 2020-09-29 at 13:56 +1000, Alexey Kardashevskiy wrote:
> > -static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
> > +static phys_addr_t ddw_memory_hotplug_max(void)
> 
> 
> Please, forward declaration or a separate patch; this creates 
> unnecessary noise to the actual change.
> 

Sure, done!

> 
> > +   _iommu_table_setparms(tbl, pci->phb->bus->number, create.liobn, 
> > win_addr,
> > + 1UL << len, page_shift, 0, 
> > _table_lpar_multi_ops);
> > +   iommu_init_table(tbl, pci->phb->node, 0, 0);
> 
> 
> It is 0,0 only if win_addr>0 which is not the QEMU case.
> 

Oh, ok.
I previously thought it was ok to use 0,0 here, as any other usage in
this file was also 0,0. 

What should I use to get the correct parameters? Is using the previous
tbl->it_reserved_start and tbl->it_reserved_end enough?

Best regards,
Leonardo Bras
> 



Re: [PATCH v2 11/14] powerpc/pseries/iommu: Update remove_dma_window() to accept property name

2021-04-12 Thread Leonardo Bras
On Tue, 2020-09-29 at 13:56 +1000, Alexey Kardashevskiy wrote:
> 
> On 12/09/2020 03:07, Leonardo Bras wrote:
> > Cc: linuxppc-dev@lists.ozlabs.org, linux-ker...@vger.kernel.org,
> > 
> > Update remove_dma_window() so it can be used to remove DDW with a given
> > property name.
> > 
> 
> Out of context this seems useless. How about?
> ===
> At the moment pseries stores information about created directly mapped 
> DDW window in DIRECT64_PROPNAME. We are going to implement indirect DDW 
> window which we need to preserve during kexec so we need another 
> property for that.
> ===
> 
> Feel free to correct my english :)

Thanks Alexey! It helped me a lot to better describe the reasoning
behind the change!

> > 
> >     ret = of_remove_property(np, win);
> >     if (ret)
> >     pr_warn("%pOF: failed to remove direct window property: %d\n",
> >     np, ret);
> > +   return 0;
> 
> 
> You do not test the return code anywhere until 13/14 so I'd say merge 
> this one into 13/14, the same comment applies to 12/14. If you do not 
> move chunks in 13/14, it is going to be fairly small patch.

I have applied most of the suggested changes for patches 11, 12 and 13, but
as a single diff it still amounts to 275 lines. 
To be honest, after 7 months since sending this patchset (and working on
other stuff), patch 13 already looks like a lot to read alone, and merging
it with 11 & 12 seems to be too much.

Would it be ok to apply the changes and leave them all separate, or, as
a middle ground, just merge 11 & 12 together? 

Adding your suggested text above should provide enough context
for them. I could also say why the return code is left unused for now.

Best regards,
Leonardo Bras




Re: [PATCH v2 10/14] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper

2021-04-11 Thread Leonardo Bras
On Tue, 2020-09-29 at 13:56 +1000, Alexey Kardashevskiy wrote:
> 
> On 12/09/2020 03:07, Leonardo Bras wrote:
> > Cc: linuxppc-dev@lists.ozlabs.org, linux-ker...@vger.kernel.org,
> > 
> > Add a new helper _iommu_table_setparms(), and use it in
> > iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated
> > code.
> > 
> > Also, setting tbl->it_ops was happening outside iommu_table_setparms*(),
> > so move it to the new helper. Since we need the iommu_table_ops to be
> > declared before used, move iommu_table_lpar_multi_ops and
> > iommu_table_pseries_ops to before their respective iommu_table_setparms*().
> > 
> > The tce_exchange_pseries() also had to be moved up, since it's used in
> > iommu_table_lpar_multi_ops.xchg_no_kill.
> 
> 
> Use forward declarations (preferred) or make a separate patch for moving 
> chunks (I do not see much point).

Fixed :)

> > @@ -509,8 +559,13 @@ static void iommu_table_setparms(struct pci_controller 
> > *phb,
> >     const unsigned long *basep;
> >     const u32 *sizep;

> > -   node = phb->dn;
> > +   /* Test if we are going over 2GB of DMA space */
> > 
> > 
> > 
> > +   if (phb->dma_window_base_cur + phb->dma_window_size > 0x8000ul) {
> > +   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
> > PHB.\n");
> > +   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
> > +   }
> 
> 
> s/0x8000ul/2*SZ_1G/

Done!

> 
> but more to the point - why this check? QEMU can create windows at 0 and 
> as big as the VM requested. And I am pretty sure I can construct QEMU 
> command line such as it won't have MMIO32 at all and a 4GB default DMA 
> window.
> 

Oh, the diff was a little strange here. I did not add this snippet, it
was already in that function, but since I created the helper, the diff
made it look like I introduced this piece of code.
Please take a look at the diff snippet below. (These same lines were
already there.)

> > @@ -519,33 +574,25 @@ static void iommu_table_setparms(struct 
> > pci_controller *phb,
> >     return;
> >     }
> >   
> > -   tbl->it_base = (unsigned long)__va(*basep);
> > 
> > 
> > 
> > +   _iommu_table_setparms(tbl, phb->bus->number, 0, 
> > phb->dma_window_base_cur,
> > + phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
> > + (unsigned long)__va(*basep), 
> > _table_pseries_ops);
> > if (!is_kdump_kernel())
> > 
> > 
> > 
> >     memset((void *)tbl->it_base, 0, *sizep);
> > 
> > -   tbl->it_busno = phb->bus->number;
> > -   tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
> > -
> > -   /* Units of tce entries */
> > -   tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
> > -
> > -   /* Test if we are going over 2GB of DMA space */
> > -   if (phb->dma_window_base_cur + phb->dma_window_size > 0x8000ul) {
> > -   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
> > PHB.\n");
> > -   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
> > -   }
> > -
> >     phb->dma_window_base_cur += phb->dma_window_size;
> > -
> > -   /* Set the tce table size - measured in entries */
> > -   tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
> > -
> > -   tbl->it_index = 0;
> > -   tbl->it_blocksize = 16;
> > -   tbl->it_type = TCE_PCI;
> >   }
> >   

Thanks for reviewing, Alexey!




Re: [PATCH v2 09/14] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2021-04-11 Thread Leonardo Bras
On Tue, 2020-09-29 at 13:56 +1000, Alexey Kardashevskiy wrote:
> > 
> >     dev_dbg(>dev, "created tce table LIOBN 0x%x for %pOF\n",
> > - create.liobn, dn);
> > +   create.liobn, dn);
> 
> 
> Unrelated. If you think the spaces/tabs thing needs to be fixed, make it 
> a separate patch and do all these changes there at once.

Sorry, it was some issue with my editor / diff. 
I removed those changes for next version.

> > -out_free_prop:
> > +out_prop_free:
> 
> 
> Really? :) s/out_prop_del/out_del_prop/ may be? The less unrelated 
> changes the better.

I changed all the labels I added to have out__, which I think
will keep them consistent with the existing labels.


Thanks for reviewing!
Leonardo Bras



Re: [PATCH v2 05/14] powerpc/kernel/iommu: Add new iommu_table_in_use() helper

2021-04-11 Thread Leonardo Bras
Hello Alexey, thanks for the feedback!

On Tue, 2020-09-29 at 13:57 +1000, Alexey Kardashevskiy wrote:
> 
> On 12/09/2020 03:07, Leonardo Bras wrote:
> > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > index ffb2637dc82b..c838da3d8f32 100644
> > --- a/arch/powerpc/kernel/iommu.c
> > +++ b/arch/powerpc/kernel/iommu.c
> > @@ -655,34 +655,21 @@ static void iommu_table_reserve_pages(struct 
> > iommu_table *tbl,
> >     if (tbl->it_offset == 0)
> >     set_bit(0, tbl->it_map);
> >   
> > 
> > 
> > 
> > +   /* Check if res_start..res_end is a valid range in the table */
> > +   if (res_start >= res_end || res_start < tbl->it_offset ||
> > +   res_end > (tbl->it_offset + tbl->it_size)) {
> > +   tbl->it_reserved_start = tbl->it_offset;
> > +   tbl->it_reserved_end = tbl->it_offset;
> 
> 
> This silently ignores overlapped range of the reserved area and the 
> window which does not seem right.

Humm, that makes sense.
Would it be better to do something like this?

if (res_start < tbl->it_offset)
	res_start = tbl->it_offset;

if (res_end > (tbl->it_offset + tbl->it_size))
	res_end = tbl->it_offset + tbl->it_size;

if (res_start >= res_end) {
	tbl->it_reserved_start = tbl->it_offset;
	tbl->it_reserved_end = tbl->it_offset;
	return;
}


> > +   return;
> > +   }
> > +
> >     tbl->it_reserved_start = res_start;
> >     tbl->it_reserved_end = res_end;

> >   - /* Check if res_start..res_end isn't empty and overlaps the table */
> > -   if (res_start && res_end &&
> > -   (tbl->it_offset + tbl->it_size < res_start ||
> > -res_end < tbl->it_offset))
> > -   return;
> > -
> >     for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
> >     set_bit(i - tbl->it_offset, tbl->it_map);
> >   }
> > +bool iommu_table_in_use(struct iommu_table *tbl)
> > +{
> > +   unsigned long start = 0, end;
> > +
> > +   /* ignore reserved bit0 */
> > +   if (tbl->it_offset == 0)
> > +   start = 1;
> > +   end = tbl->it_reserved_start - tbl->it_offset;
> > +   if (find_next_bit(tbl->it_map, end, start) != end)
> > +   return true;
> > +
> > +   start = tbl->it_reserved_end - tbl->it_offset;
> > +   end = tbl->it_size;
> > +   return find_next_bit(tbl->it_map, end, start) != end;
> > +
> 
> Unnecessary empty line.

Sure, removing. 
Thanks!



Re: [PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-08 Thread Leonardo Bras
On Fri, 9 Apr 2021 at 01:36, Alexey Kardashevskiy wrote:

>
>
> On 08/04/2021 19:04, Michael Ellerman wrote:
> > Alexey Kardashevskiy  writes:
> >> On 08/04/2021 15:37, Michael Ellerman wrote:
> >>> Leonardo Bras  writes:
> >>>> According to LoPAR, ibm,query-pe-dma-window output named "IO Page
> Sizes"
> >>>> will let the OS know all possible pagesizes that can be used for
> creating a
> >>>> new DDW.
> >>>>
> >>>> Currently Linux will only try using 3 of the 8 available options:
> >>>> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M,
> 64M,
> >>>> 128M, 256M and 16G.
> >>>
> >>> Do we know of any hardware & hypervisor combination that will actually
> >>> give us bigger pages?
> >>
> >>
> >> On P8 16MB host pages and 16MB hardware iommu pages worked.
> >>
> >> On P9, VM's 16MB IOMMU pages worked on top of 2MB host pages + 2MB
> >> hardware IOMMU pages.
> >
> > The current code already tries 16MB though.
> >
> > I'm wondering if we're going to ask for larger sizes that have never
> > been tested and possibly expose bugs. But it sounds like this is mainly
> > targeted at future platforms.
>
>
> I tried for fun to pass through a PCI device to a guest with this patch as:
>
> pbuild/qemu-killslof-aiku1904le-ppc64/qemu-system-ppc64 \
> -nodefaults \
> -chardev stdio,id=STDIO0,signal=off,mux=on \
> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
> -mon id=MON0,chardev=STDIO0,mode=readline \
> -nographic \
> -vga none \
> -enable-kvm \
> -m 16G \
> -kernel ./vmldbg \
> -initrd /home/aik/t/le.cpio \
> -device vfio-pci,id=vfio0001_01_00_0,host=0001:01:00.0 \
> -mem-prealloc \
> -mem-path qemu_hp_1G_node0 \
> -global spapr-pci-host-bridge.pgsz=0xff000 \
> -machine cap-cfpc=broken,cap-ccf-assist=off \
> -smp 1,threads=1 \
> -L /home/aik/t/qemu-ppc64-bios/ \
> -trace events=qemu_trace_events \
> -d guest_errors,mmu \
> -chardev socket,id=SOCKET0,server=on,wait=off,path=qemu.mon.1_1_0_0 \
> -mon chardev=SOCKET0,mode=control
>
>
> The guest created a huge window:
>
> xhci_hcd :00:00.0: ibm,create-pe-dma-window(2027) 0 800 2000
> 22 22 returned 0 (liobn = 0x8001 starting addr = 800 0)
>
> The first "22" is page_shift in hex (16GB), the second "22" is
> window_shift (so we have 1 TCE).
>
> On the host side the window#1 was created with 1GB pages:
> pci 0001:01 : [PE# fd] Setting up window#1
> 800..80007ff pg=4000
>
>
> The XHCI seems working. Without the patch 16MB was the maximum.
>
>
> >
> >>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c
> b/arch/powerpc/platforms/pseries/iommu.c
> >>>> index 9fc5217f0c8e..6cda1c92597d 100644
> >>>> --- a/arch/powerpc/platforms/pseries/iommu.c
> >>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
> >>>> @@ -53,6 +53,20 @@ enum {
> >>>>DDW_EXT_QUERY_OUT_SIZE = 2
> >>>>};
> >>>
> >>> A comment saying where the values come from would be good.
> >>>
> >>>> +#define QUERY_DDW_PGSIZE_4K   0x01
> >>>> +#define QUERY_DDW_PGSIZE_64K  0x02
> >>>> +#define QUERY_DDW_PGSIZE_16M  0x04
> >>>> +#define QUERY_DDW_PGSIZE_32M  0x08
> >>>> +#define QUERY_DDW_PGSIZE_64M  0x10
> >>>> +#define QUERY_DDW_PGSIZE_128M 0x20
> >>>> +#define QUERY_DDW_PGSIZE_256M 0x40
> >>>> +#define QUERY_DDW_PGSIZE_16G  0x80
> >>>
> >>> I'm not sure the #defines really gain us much vs just putting the
> >>> literal values in the array below?
> >>
> >> Then someone says "u magic values" :) I do not mind either way.
> Thanks,
> >
> > Yeah that's true. But #defining them doesn't make them less magic, if
> > you only use them in one place :)
>
> Defining them with "QUERY_DDW" in the names kinda tells where they are
> from. Can also grep QEMU using these to see how the other side handles
> it. Dunno.
>
> btw the bot complained about __builtin_ctz(SZ_16G) which should be
> __builtin_ctzl(SZ_16G) so we have to ask Leonardo to repost anyway :)
>

Thanks for testing!

http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210408201915.174217-1-leobra...@gmail.com/

I sent a v3 a few hours ago, fixing this by using __builtin_ctzll() instead
of __builtin_ctz() in all sizes, and it worked like a charm.
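
For reference, a small standalone illustration (user-space only, with the
SZ_* macros redefined locally; this is not code from the patch) of why
__builtin_ctzll() is needed once sizes above 4GB are involved:

	#include <stdio.h>

	#define SZ_64K (1ULL << 16)
	#define SZ_16M (1ULL << 24)
	#define SZ_16G (1ULL << 34)

	int main(void)
	{
		/*
		 * __builtin_ctz() takes an unsigned int, so SZ_16G would be
		 * truncated to 0 (and ctz(0) is undefined). The 'll' variant
		 * takes an unsigned long long and returns the expected shift.
		 */
		printf("%d %d %d\n",
		       __builtin_ctzll(SZ_64K),   /* 16 */
		       __builtin_ctzll(SZ_16M),   /* 24 */
		       __builtin_ctzll(SZ_16G));  /* 34 */
		return 0;
	}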

I also reverted to the previous approach of not having QUERY_DDW defines
for masks, as Michael suggested.

I can revert back to v2 approach if you guys decide it's better.

Best regards,
Leonardo Bras


Re: [PATCH 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-04-08 Thread Leonardo Bras
Hello David, thanks for commenting.

On Tue, 2021-03-23 at 10:45 +1100, David Gibson wrote:
> > @@ -805,6 +808,10 @@ static int resize_hpt_for_hotplug(unsigned long 
> > new_mem_size, bool shrinking)
> >     if (shrinking) {
> > 
> > +   /* When batch removing entries, only resizes HPT at the end. */
> > +   if (atomic_read_acquire(_resize_disable))
> > +   return 0;
> > +
> 
> I'm not quite convinced by this locking.  Couldn't hpt_resize_disable
> be set after this point, but while you're still inside
> resize_hpt_for_hotplug()?  Probably better to use an explicit mutex
> (and mutex_trylock()) to make the critical sections clearer.

Sure, I can do that for v2.

> Except... do we even need the fancy mechanics to suppress the resizes
> in one place to do them elswhere.  Couldn't we just replace the
> existing resize calls with the batched ones?

What do you have in mind for batching HPT resize-downs?
Other than the current approach, I could only think of a way that would
touch a lot of generic code, and/or duplicate some functions, as
dlpar_add_lmb() does a lot of other stuff.

> > +void hash_memory_batch_shrink_end(void)
> > +{
> > +   unsigned long newsize;
> > +
> > +   /* Re-enables HPT resize-down after hot-unplug */
> > +   atomic_set_release(_resize_disable, 0);
> > +
> > +   newsize = memblock_phys_mem_size();
> > +   /* Resize to smallest SHIFT possible */
> > +   while (resize_hpt_for_hotplug(newsize, true) == -ENOSPC) {
> > +   newsize *= 2;
> 
> As noted earlier, doing this without an explicit cap on the new hpt
> size (of the existing size) this makes me nervous. 
> 

I can add a stop in v2.

>  Less so, but doing
> the calculations on memory size, rather than explictly on HPT size /
> HPT order also seems kinda clunky.

Agreed, but at this point it would seem kind of a waste to find the
shift from newsize, then calculate (1 << shift) for each retry of
resize_hpt_for_hotplug(), only to express that we are retrying the order
value.

But sure, if you think it looks better, I can change that. 

> > +void memory_batch_shrink_begin(void)
> > +{
> > +   if (!radix_enabled())
> > +   hash_memory_batch_shrink_begin();
> > +}
> > +
> > +void memory_batch_shrink_end(void)
> > +{
> > +   if (!radix_enabled())
> > +   hash_memory_batch_shrink_end();
> > +}
> 
> Again, these wrappers don't seem particularly useful to me.

The options would be to add 'if (!radix_enabled())' to the hotplug-memory.c
functions or to the hash_* functions, which looks kind of wrong.

> > +   memory_batch_shrink_end();
> 
> remove_by_index only removes a single LMB, so there's no real point to
> batching here.

Sure, will be fixed for v2.

> > @@ -700,6 +712,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
> >     if (lmbs_added != lmbs_to_add) {
> >     pr_err("Memory hot-add failed, removing any added LMBs\n");
> > 
> > +   memory_batch_shrink_begin();
> 
> 
> The effect of these on the memory grow path is far from clear.
> 

On hotplug, HPT is resized-up before adding LMBs.
On hotunplug, HPT is resized-down after removing LMBs.
And each one has its own mechanism to batch HPT resizes...

I can't understand exactly how using it on the hotplug fail path can be any
different from using it on hotunplug.
> 

Can you please help me understand this?

Best regards,
Leonardo Bras



Re: [PATCH 2/3] powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug

2021-04-08 Thread Leonardo Bras
Hello David, thanks for the feedback!

On Mon, 2021-03-22 at 18:55 +1100, David Gibson wrote:
> > +void hash_memory_batch_expand_prepare(unsigned long newsize)
> > +{
> > +   /*
> > +* Resizing-up HPT should never fail, but there are some cases system 
> > starts with higher
> > +* SHIFT than required, and we go through the funny case of resizing 
> > HPT down while
> > +* adding memory
> > +*/
> > +
> > +   while (resize_hpt_for_hotplug(newsize, false) == -ENOSPC) {
> > +   newsize *= 2;
> > +   pr_warn("Hash collision while resizing HPT\n");
> 
> This unbounded increase in newsize makes me nervous - we should be
> bounded by the current size of the HPT at least.  In practice we
> should be fine, since the resize should always succeed by the time we
> reach our current HPT size, but that's far from obvious from this
> point in the code.

Sure, I will add bounds in v2.

> 
> And... you're doubling newsize which is a value which might not be a
> power of 2.  I'm wondering if there's an edge case where this could
> actually cause us to skip the current size and erroneously resize to
> one bigger than we have currently.

I also thought that at the start, but it seems quite reliable.
Before using this value, htab_shift_for_mem_size() will always round it
to the next power of 2. 
Ex.
Any value between 0b0101 and 0b1000 will be rounded to 0b1000 for shift
calculation. If we multiply it by 2 (same as << 1), we have that
anything between 0b01010 and 0b10000 will be rounded to 0b10000. 

This works just fine as long as we are multiplying. 
Division could indeed have the behavior you expect, as 0b0101 >> 1 would
become 0b010 and skip a shift.
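
To make the argument concrete, a standalone sketch (not kernel code;
shift_for() here just mimics the round-to-next-power-of-2 step described
above):

	#include <stdio.h>

	/* Smallest shift s such that (1 << s) >= size */
	static int shift_for(unsigned long long size)
	{
		int shift = 0;

		while ((1ULL << shift) < size)
			shift++;
		return shift;
	}

	int main(void)
	{
		unsigned long long newsize = 5;	/* 0b0101, between 2^2 and 2^3 */

		printf("%d\n", shift_for(newsize));	/* 3 */
		printf("%d\n", shift_for(newsize * 2));	/* 4: doubling never skips a shift */
		return 0;
	}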

> > +void memory_batch_expand_prepare(unsigned long newsize)
> 
> This wrapper doesn't seem useful.

Yeah, it does little, but I can't just jump into hash_* functions
directly from hotplug-memory.c without even knowing if it's using hash
pagetables. (In case the suggestion would be to test for disable_radix
inside hash_memory_batch*.)

> 
> > +{
> > +   if (!radix_enabled())
> > +   hash_memory_batch_expand_prepare(newsize);
> > +}
> >  #endif /* CONFIG_MEMORY_HOTPLUG */
> >  
> > 
> > +   memory_batch_expand_prepare(memblock_phys_mem_size() +
> > +drmem_info->n_lmbs * drmem_lmb_size());
> 
> This doesn't look right.  memory_add_by_index() is adding a *single*
> LMB, I think using drmem_info->n_lmbs here means you're counting this
> as adding again as much memory as you already have hotplugged.

Yeah, my mistake. This makes sense.
I will change it to something like 
memblock_phys_mem_size() + drmem_lmb_size()

> > 
> > +   memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * 
> > drmem_lmb_size());
> > +
> >     for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
> >     if (lmb->flags & DRCONF_MEM_ASSIGNED)
> >     continue;
> 
> I don't see memory_batch_expand_prepare() suppressing any existing HPT
> resizes.  Won't this just resize to the right size for the full add,
> then resize several times again as we perform the add?  Or.. I guess
> that will be suppressed by patch 1/3. 

Correct.

>  That's seems kinda fragile, though.

What do you mean by fragile here?
What would you suggest doing differently?

Best regards,
Leonardo Bras



Re: [PATCH 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-04-08 Thread Leonardo Bras
Hello David, thanks for your feedback.

On Mon, 2021-03-22 at 17:49 +1100, David Gibson wrote:
> I don't love this approach.  Adding the extra flag at this level seems
> a bit inelegant, and it means we're passing up an easy opportunity to
> reduce our resource footprint on the host.

I understand, but trying to reduce the resource footprint on the host, and
mostly failing, is what causes hot-add and hot-remove to take so long.

> But... maybe we'll have to do it.  I'd like to see if we can get
> things to work well enough with just the "batching" to avoid multiple
> resize attempts first.

This batching is something I had thought a lot about.
The problem is that there are a lot of generic interfaces between memory
hotplug and actually resizing the HPT. I tried a simpler approach in
patches 2 & 3, so I don't touch much stuff there.

Best regards,
Leonardo Bras






[PATCH v3 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-08 Thread Leonardo Bras
According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
will let the OS know all possible pagesizes that can be used for creating a
new DDW.

Currently Linux will only try using 3 of the 8 available options:
4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
128M, 256M and 16G.

Enabling bigger pages would be interesting for direct mapping systems
with a lot of RAM, while using fewer TCE entries.

Signed-off-by: Leonardo Bras 
---
Changes since v2:
 - Restore 'int array & shift' strategy
 - Remove defines for RTAS "IO Page Size" output of ibm,query-pe-dma-window
 - Added/Improved comments
Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210407195613.131140-1-leobra...@gmail.com/
Changes since v1:
- Remove page shift defines, replace by __builtin_ctzll(SZ_XXX)
- Add bit field defines for RTAS "IO Page Shift" output of 
ibm,query-pe-dma-window
- Use struct array instead of int array to be more explicit on pagesizes
Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobra...@gmail.com/
 

 arch/powerpc/platforms/pseries/iommu.c | 37 +-
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..67c9953a6503 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1099,6 +1099,33 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+/* Return largest page shift based on "IO Page Sizes" output of 
ibm,query-pe-dma-window. */
+static int iommu_get_page_shift(u32 query_page_size)
+{
+   /* Supported IO page-sizes according to LoPAR */
+   const int shift[] = {
+   __builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), 
__builtin_ctzll(SZ_16M),
+   __builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), 
__builtin_ctzll(SZ_128M),
+   __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G)
+   };
+
+   int i = ARRAY_SIZE(shift) - 1;
+
+   /*
+* On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a 
bit field:
+* - bit 31 means 4k pages are supported,
+* - bit 30 means 64k pages are supported, and so on.
+* Larger pagesizes map more memory with the same amount of TCEs, so 
start probing them.
+*/
+   for (; i >= 0 ; i--) {
+   if (query_page_size & (1 << i))
+   return shift[i];
+   }
+
+   /* No valid page size found. */
+   return 0;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1206,13 +1233,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
}
-   if (query.page_size & 4) {
-   page_shift = 24; /* 16MB */
-   } else if (query.page_size & 2) {
-   page_shift = 16; /* 64kB */
-   } else if (query.page_size & 1) {
-   page_shift = 12; /* 4kB */
-   } else {
+
+   page_shift = iommu_get_page_shift(query.page_size);
+   if (!page_shift) {
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
  query.page_size);
goto out_failed;
-- 
2.30.2



Re: [PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-08 Thread Leonardo Bras
On Thu, 2021-04-08 at 03:20 -0300, Leonardo Bras wrote:
> > > +#define QUERY_DDW_PGSIZE_4K  0x01
> > > +#define QUERY_DDW_PGSIZE_64K 0x02
> > > +#define QUERY_DDW_PGSIZE_16M 0x04
> > > +#define QUERY_DDW_PGSIZE_32M 0x08
> > > +#define QUERY_DDW_PGSIZE_64M 0x10
> > > +#define QUERY_DDW_PGSIZE_128M0x20
> > > +#define QUERY_DDW_PGSIZE_256M0x40
> > > +#define QUERY_DDW_PGSIZE_16G 0x80
> > 
> > I'm not sure the #defines really gain us much vs just putting the
> > literal values in the array below?
> 
> My v1 did not use the define approach, what do you think of that?
> http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobra...@gmail.com/
> 
> 
(Of course, it would be that but without the pageshift defines, using
the __builtin_ctz() approach suggested by Alexey instead.)



Re: [PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-08 Thread Leonardo Bras
Hello Michael, thank you for this feedback!
Comments inline:

On Thu, 2021-04-08 at 15:37 +1000, Michael Ellerman wrote:
> Leonardo Bras  writes:
> > According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
> > will let the OS know all possible pagesizes that can be used for creating a
> > new DDW.
> > 
> > Currently Linux will only try using 3 of the 8 available options:
> > 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
> > 128M, 256M and 16G.
> 
> Do we know of any hardware & hypervisor combination that will actually
> give us bigger pages?
> 
> > Enabling bigger pages would be interesting for direct mapping systems
> > with a lot of RAM, while using less TCE entries.
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 49 ++
> >  1 file changed, 42 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > b/arch/powerpc/platforms/pseries/iommu.c
> > index 9fc5217f0c8e..6cda1c92597d 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -53,6 +53,20 @@ enum {
> >     DDW_EXT_QUERY_OUT_SIZE = 2
> >  };
> 
> A comment saying where the values come from would be good.

Sure, I will add the information about LoPAR.

> 
> > +#define QUERY_DDW_PGSIZE_4K0x01
> > +#define QUERY_DDW_PGSIZE_64K   0x02
> > +#define QUERY_DDW_PGSIZE_16M   0x04
> > +#define QUERY_DDW_PGSIZE_32M   0x08
> > +#define QUERY_DDW_PGSIZE_64M   0x10
> > +#define QUERY_DDW_PGSIZE_128M  0x20
> > +#define QUERY_DDW_PGSIZE_256M  0x40
> > +#define QUERY_DDW_PGSIZE_16G   0x80
> 
> I'm not sure the #defines really gain us much vs just putting the
> literal values in the array below?

My v1 did not use the define approach, what do you think of that?
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobra...@gmail.com/

> 
> > +struct iommu_ddw_pagesize {
> > +   u32 mask;
> > +   int shift;
> > +};
> > +
> >  static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> >  {
> >     struct iommu_table_group *table_group;
> > @@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, 
> > struct device_node *par_dn)
> >  ret);
> >  }
> >  
> > +/* Returns page shift based on "IO Page Sizes" output at 
> > ibm,query-pe-dma-window. See LoPAR */
> > +static int iommu_get_page_shift(u32 query_page_size)
> > +{
> > +   const struct iommu_ddw_pagesize ddw_pagesize[] = {
> > +   { QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
> > +   { QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
> > +   { QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
> > +   { QUERY_DDW_PGSIZE_64M,  __builtin_ctz(SZ_64M)  },
> > +   { QUERY_DDW_PGSIZE_32M,  __builtin_ctz(SZ_32M)  },
> > +   { QUERY_DDW_PGSIZE_16M,  __builtin_ctz(SZ_16M)  },
> > +   { QUERY_DDW_PGSIZE_64K,  __builtin_ctz(SZ_64K)  },
> > +   { QUERY_DDW_PGSIZE_4K,   __builtin_ctz(SZ_4K)   }
> > +   };
> 
> 
> cheers

Best regards,
Leonardo Bras




Re: [PATCH 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-07 Thread Leonardo Bras
Hello Alexey,

On Tue, 2021-03-23 at 18:41 +1100, Alexey Kardashevskiy wrote:
[...]
> > +#define IOMMU_PAGE_SHIFT_16G   34
> > +#define IOMMU_PAGE_SHIFT_256M  28
> > +#define IOMMU_PAGE_SHIFT_128M  27
> > +#define IOMMU_PAGE_SHIFT_64M   26
> > +#define IOMMU_PAGE_SHIFT_32M   25
> > +#define IOMMU_PAGE_SHIFT_16M   24
> > +#define IOMMU_PAGE_SHIFT_64K   16
> 
> 
> These are not very descriptive, these are just normal shifts, could be 
> as simple as __builtin_ctz(SZ_4K) (gcc will optimize this) and so on.
> 
> OTOH the PAPR page sizes need macros as they are the ones which are 
> weird and screaming for macros.
> 
> I'd steal/rework spapr_page_mask_to_query_mask() from QEMU. Thanks,
> 

Thanks for this feedback!
I just sent a v2 applying your suggestions.

Best regards,
Leonardo Bras




[PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-07 Thread Leonardo Bras
According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
will let the OS know all possible pagesizes that can be used for creating a
new DDW.

Currently Linux will only try using 3 of the 8 available options:
4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
128M, 256M and 16G.

Enabling bigger pages would be interesting for direct mapping systems
with a lot of RAM, while using fewer TCE entries.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 49 ++
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..6cda1c92597d 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,20 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
+#define QUERY_DDW_PGSIZE_4K0x01
+#define QUERY_DDW_PGSIZE_64K   0x02
+#define QUERY_DDW_PGSIZE_16M   0x04
+#define QUERY_DDW_PGSIZE_32M   0x08
+#define QUERY_DDW_PGSIZE_64M   0x10
+#define QUERY_DDW_PGSIZE_128M  0x20
+#define QUERY_DDW_PGSIZE_256M  0x40
+#define QUERY_DDW_PGSIZE_16G   0x80
+
+struct iommu_ddw_pagesize {
+   u32 mask;
+   int shift;
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+/* Returns page shift based on "IO Page Sizes" output at 
ibm,query-pe-dma-window. See LoPAR */
+static int iommu_get_page_shift(u32 query_page_size)
+{
+   const struct iommu_ddw_pagesize ddw_pagesize[] = {
+   { QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
+   { QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
+   { QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
+   { QUERY_DDW_PGSIZE_64M,  __builtin_ctz(SZ_64M)  },
+   { QUERY_DDW_PGSIZE_32M,  __builtin_ctz(SZ_32M)  },
+   { QUERY_DDW_PGSIZE_16M,  __builtin_ctz(SZ_16M)  },
+   { QUERY_DDW_PGSIZE_64K,  __builtin_ctz(SZ_64K)  },
+   { QUERY_DDW_PGSIZE_4K,   __builtin_ctz(SZ_4K)   }
+   };
+
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(ddw_pagesize); i++) {
+   if (query_page_size & ddw_pagesize[i].mask)
+   return ddw_pagesize[i].shift;
+   }
+
+   /* No valid page size found. */
+   return 0;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1206,13 +1245,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
}
-   if (query.page_size & 4) {
-   page_shift = 24; /* 16MB */
-   } else if (query.page_size & 2) {
-   page_shift = 16; /* 64kB */
-   } else if (query.page_size & 1) {
-   page_shift = 12; /* 4kB */
-   } else {
+
+   page_shift = iommu_get_page_shift(query.page_size);
+   if (!page_shift) {
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
  query.page_size);
goto out_failed;
-- 
2.30.2



[PATCH 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-03-22 Thread Leonardo Bras
According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
will let the OS know all possible pagesizes that can be used for creating a
new DDW.

Currently Linux will only try using 3 of the 8 available options:
4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
128M, 256M and 16G.

Enabling bigger pages would be interesting for direct mapping systems
with a lot of RAM, while using fewer TCE entries.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/iommu.h   |  8 
 arch/powerpc/platforms/pseries/iommu.c | 28 +++---
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index deef7c94d7b6..c170048b7a1b 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -19,6 +19,14 @@
 #include 
 #include 
 
+#define IOMMU_PAGE_SHIFT_16G   34
+#define IOMMU_PAGE_SHIFT_256M  28
+#define IOMMU_PAGE_SHIFT_128M  27
+#define IOMMU_PAGE_SHIFT_64M   26
+#define IOMMU_PAGE_SHIFT_32M   25
+#define IOMMU_PAGE_SHIFT_16M   24
+#define IOMMU_PAGE_SHIFT_64K   16
+
 #define IOMMU_PAGE_SHIFT_4K  12
 #define IOMMU_PAGE_SIZE_4K   (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K)
 #define IOMMU_PAGE_MASK_4K   (~((1 << IOMMU_PAGE_SHIFT_4K) - 1))
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..02958e80aa91 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1099,6 +1099,24 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+/* Returns page shift based on "IO Page Sizes" output at 
ibm,query-pe-dma-window. See LoPAR */
+static int iommu_get_page_shift(u32 query_page_size)
+{
+   const int shift[] = {IOMMU_PAGE_SHIFT_4K,   IOMMU_PAGE_SHIFT_64K,  
IOMMU_PAGE_SHIFT_16M,
+IOMMU_PAGE_SHIFT_32M,  IOMMU_PAGE_SHIFT_64M,  
IOMMU_PAGE_SHIFT_128M,
+IOMMU_PAGE_SHIFT_256M, IOMMU_PAGE_SHIFT_16G};
+   int i = ARRAY_SIZE(shift) - 1;
+
+   /* Looks for the largest page size supported */
+   for (; i >= 0; i--) {
+   if (query_page_size & (1 << i))
+   return shift[i];
+   }
+
+   /* No valid page size found. */
+   return 0;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1206,13 +1224,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
}
-   if (query.page_size & 4) {
-   page_shift = 24; /* 16MB */
-   } else if (query.page_size & 2) {
-   page_shift = 16; /* 64kB */
-   } else if (query.page_size & 1) {
-   page_shift = 12; /* 4kB */
-   } else {
+
+   page_shift = iommu_get_page_shift(query.page_size);
+   if (!page_shift) {
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
  query.page_size);
goto out_failed;
-- 
2.29.2



[PATCH 1/1] powerpc/kernel/iommu: Use largepool as a last resort when !largealloc

2021-03-18 Thread Leonardo Bras
As of today, doing iommu_range_alloc() only for !largealloc (npages <= 15)
will only be able to use 3/4 of the available pages, given that pages in
the largepool are not available for !largealloc.

This could mean some drivers not being able to fully use all the available
pages for the DMA window.

Add pages on largepool as a last resort for !largealloc, making all pages
of the DMA window available.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/kernel/iommu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 3329ef045805..ae6ad8dca605 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -255,6 +255,15 @@ static unsigned long iommu_range_alloc(struct device *dev,
pass++;
goto again;
 
+   } else if (pass == tbl->nr_pools + 1) {
+   /* Last resort: try largepool */
+   spin_unlock(&pool->lock);
+   pool = &tbl->large_pool;
+   spin_lock(&pool->lock);
+   pool->hint = pool->start;
+   pass++;
+   goto again;
+
} else {
/* Give up */
spin_unlock_irqrestore(&(pool->lock), flags);
-- 
2.29.2



[PATCH 1/1] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE() to save TCEs

2021-03-18 Thread Leonardo Bras
Currently both iommu_alloc_coherent() and iommu_free_coherent() align the
desired allocation size to PAGE_SIZE, and gets system pages and IOMMU
mappings (TCEs) for that value.

When IOMMU_PAGE_SIZE < PAGE_SIZE, this behavior may cause unnecessary
TCEs to be created for mapping the whole system page.

Example:
- PAGE_SIZE = 64k, IOMMU_PAGE_SIZE() = 4k
- iommu_alloc_coherent() is called for 128 bytes
- 1 system page (64k) is allocated
- 16 IOMMU pages (16 x 4k) are allocated (16 TCEs used)

It would be enough to use a single TCE for this, so 15 TCEs are
wasted in the process.

Update iommu_*_coherent() to make sure the size alignment happens only
for IOMMU_PAGE_SIZE() before calling iommu_alloc() and iommu_free().

Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift)
with IOMMU_PAGE_ALIGN(n, tbl), which is easier to read and does the
same.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/kernel/iommu.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5b69a6a72a0e..3329ef045805 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -851,6 +851,7 @@ void *iommu_alloc_coherent(struct device *dev, struct 
iommu_table *tbl,
unsigned int order;
unsigned int nio_pages, io_order;
struct page *page;
+   size_t size_io = size;
 
size = PAGE_ALIGN(size);
order = get_order(size);
@@ -877,8 +878,9 @@ void *iommu_alloc_coherent(struct device *dev, struct 
iommu_table *tbl,
memset(ret, 0, size);
 
/* Set up tces to cover the allocated range */
-   nio_pages = size >> tbl->it_page_shift;
-   io_order = get_iommu_order(size, tbl);
+   size_io = IOMMU_PAGE_ALIGN(size_io, tbl);
+   nio_pages = size_io >> tbl->it_page_shift;
+   io_order = get_iommu_order(size_io, tbl);
mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
  mask >> tbl->it_page_shift, io_order, 0);
if (mapping == DMA_MAPPING_ERROR) {
@@ -893,10 +895,9 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
 void *vaddr, dma_addr_t dma_handle)
 {
if (tbl) {
-   unsigned int nio_pages;
+   size_t size_io = IOMMU_PAGE_ALIGN(size, tbl);
+   unsigned int nio_pages = size_io >> tbl->it_page_shift;
 
-   size = PAGE_ALIGN(size);
-   nio_pages = size >> tbl->it_page_shift;
iommu_free(tbl, dma_handle, nio_pages);
size = PAGE_ALIGN(size);
free_pages((unsigned long)vaddr, get_order(size));
-- 
2.29.2



[PATCH 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-03-11 Thread Leonardo Bras
During memory hotunplug, after each LMB is removed, the HPT may be
resized-down if it would map a max of 4 times the current amount of memory.
(2 shifts, due to the introduced hysteresis)

It usually is not an issue, but it can take a lot of time if HPT
resizing-down fails. This happens because resize-down failures
usually repeat at each LMB removal, until there are no more bolted-entry
conflicts, which can take a while to happen.

This can be solved by doing a single HPT resize at the end of memory
hotunplug, after all requested entries are removed.

To make this happen, it's necessary to temporarily disable all HPT
resize-downs before hotunplug, re-enable them after hotunplug ends,
and then resize-down HPT to the current memory size.

As an example, hotunplugging 256GB from a 385GB guest took 621s without
this patch, and 100s with it applied.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/hash.h |  2 ++
 arch/powerpc/include/asm/sparsemem.h  |  2 ++
 arch/powerpc/mm/book3s64/hash_utils.c | 28 +++
 arch/powerpc/mm/book3s64/pgtable.c| 12 
 .../platforms/pseries/hotplug-memory.c| 16 +++
 5 files changed, 60 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 843b0a178590..f92697c107f7 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -256,6 +256,8 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end,
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
 void hash_memory_batch_expand_prepare(unsigned long newsize);
+void hash_memory_batch_shrink_begin(void);
+void hash_memory_batch_shrink_end(void);
 
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index 16b5f5300c84..a7a8a0d070fc 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -18,6 +18,8 @@ extern int memory_add_physaddr_to_nid(u64 start);
 #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
 
 void memory_batch_expand_prepare(unsigned long newsize);
+void memory_batch_shrink_begin(void);
+void memory_batch_shrink_end(void);
 
 #ifdef CONFIG_NUMA
 extern int hot_add_scn_to_nid(unsigned long scn_addr);
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 1f6aa0bf27e7..e16f207de8e4 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -794,6 +794,9 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+
+atomic_t hpt_resize_disable = ATOMIC_INIT(0);
+
 static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
 {
unsigned target_hpt_shift;
@@ -805,6 +808,10 @@ static int resize_hpt_for_hotplug(unsigned long 
new_mem_size, bool shrinking)
 
if (shrinking) {
 
+   /* When batch removing entries, only resizes HPT at the end. */
+   if (atomic_read_acquire(&hpt_resize_disable))
+   return 0;
+
/*
 * To avoid lots of HPT resizes if memory size is fluctuating
 * across a boundary, we deliberately have some hysterisis
@@ -872,6 +879,27 @@ void hash_memory_batch_expand_prepare(unsigned long 
newsize)
pr_warn("Hash collision while resizing HPT\n");
}
 }
+
+void hash_memory_batch_shrink_begin(void)
+{
+   /* Disable HPT resize-down during hot-unplug */
+   atomic_set_release(&hpt_resize_disable, 1);
+}
+
+void hash_memory_batch_shrink_end(void)
+{
+   unsigned long newsize;
+
+   /* Re-enables HPT resize-down after hot-unplug */
+   atomic_set_release(&hpt_resize_disable, 0);
+
+   newsize = memblock_phys_mem_size();
+   /* Resize to smallest SHIFT possible */
+   while (resize_hpt_for_hotplug(newsize, true) == -ENOSPC) {
+   newsize *= 2;
+   pr_warn("Hash collision while resizing HPT\n");
+   }
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static void __init hash_init_partition_table(phys_addr_t hash_table,
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index f1cd8af0f67f..e01681e22e00 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -199,6 +199,18 @@ void memory_batch_expand_prepare(unsigned long newsize)
if (!radix_enabled())
hash_memory_batch_expand_prepare(newsize);
 }
+
+void memory_batch_shrink_begin(void)
+{
+   if (!radix_enabled())
+   hash_memory_batch_shrink_begin();
+}
+
+void memory_batch_shrink_end(void)
+{
+   if (!radix_enabled())
+   hash_memory_batch_shrink_end();
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 void __init mmu_partition_table_init(void)
diff --git a/ar

[PATCH 2/3] powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug

2021-03-11 Thread Leonardo Bras
Every time a memory hotplug happens, and the memory limit crosses a 2^n
value, it may be necessary to perform HPT resizing-up, which can take
some time (over 100ms in my tests).

It usually is not an issue, but it can take some time if a lot of memory
is added to a guest with little starting memory:
Adding 256G to a 2GB guest, for example, will require 8 HPT resizes.

Perform an HPT resize before memory hotplug, updating HPT to its
final size (considering a successful hotplug), taking the number of
HPT resizes to at most one per memory hotplug action.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/hash.h   |  2 ++
 arch/powerpc/include/asm/sparsemem.h|  2 ++
 arch/powerpc/mm/book3s64/hash_utils.c   | 14 ++
 arch/powerpc/mm/book3s64/pgtable.c  |  6 ++
 arch/powerpc/platforms/pseries/hotplug-memory.c |  6 ++
 5 files changed, 30 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index d959b0195ad9..843b0a178590 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -255,6 +255,8 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end,
 int nid, pgprot_t prot);
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
+void hash_memory_batch_expand_prepare(unsigned long newsize);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index d072866842e4..16b5f5300c84 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -17,6 +17,8 @@ extern int remove_section_mapping(unsigned long start, 
unsigned long end);
 extern int memory_add_physaddr_to_nid(u64 start);
 #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
 
+void memory_batch_expand_prepare(unsigned long newsize);
+
 #ifdef CONFIG_NUMA
 extern int hot_add_scn_to_nid(unsigned long scn_addr);
 #else
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index cfb3ec164f56..1f6aa0bf27e7 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -858,6 +858,20 @@ int hash__remove_section_mapping(unsigned long start, 
unsigned long end)
 
return rc;
 }
+
+void hash_memory_batch_expand_prepare(unsigned long newsize)
+{
+   /*
+* Resizing-up HPT should never fail, but there are some cases system 
starts with higher
+* SHIFT than required, and we go through the funny case of resizing 
HPT down while
+* adding memory
+*/
+
+   while (resize_hpt_for_hotplug(newsize, false) == -ENOSPC) {
+   newsize *= 2;
+   pr_warn("Hash collision while resizing HPT\n");
+   }
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static void __init hash_init_partition_table(phys_addr_t hash_table,
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 5b3a3bae21aa..f1cd8af0f67f 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -193,6 +193,12 @@ int __meminit remove_section_mapping(unsigned long start, 
unsigned long end)
 
return hash__remove_section_mapping(start, end);
 }
+
+void memory_batch_expand_prepare(unsigned long newsize)
+{
+   if (!radix_enabled())
+   hash_memory_batch_expand_prepare(newsize);
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 void __init mmu_partition_table_init(void)
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 8377f1f7c78e..353c71249214 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -671,6 +671,8 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
if (lmbs_available < lmbs_to_add)
return -EINVAL;
 
+   memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * 
drmem_lmb_size());
+
for_each_drmem_lmb(lmb) {
if (lmb->flags & DRCONF_MEM_ASSIGNED)
continue;
@@ -734,6 +736,8 @@ static int dlpar_memory_add_by_index(u32 drc_index)
 
pr_info("Attempting to hot-add LMB, drc index %x\n", drc_index);
 
+   memory_batch_expand_prepare(memblock_phys_mem_size() +
+drmem_info->n_lmbs * drmem_lmb_size());
lmb_found = 0;
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
@@ -788,6 +792,8 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
drc_index)
if (lmbs_available < lmbs_to_add)
return -EINVAL;
 
+   memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * 
drmem_lmb_size());
+

[PATCH 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-03-11 Thread Leonardo Bras
Because hypervisors may need to create HPTs without knowing the guest
page size, the smallest used page-size (4k) may be chosen, resulting in
a HPT that is possibly bigger than needed.

On a guest with bigger page-sizes, the number of entries for the HPT may be
too high, causing the guest to ask for an HPT resize-down on the first
hotplug.

This becomes a problem when HPT resize-down fails, and causes the
HPT resize to be performed on every LMB added, until the HPT size is
compatible with the guest memory size, causing a major slowdown.

So, avoiding HPT resizing-down on hot-add significantly improves memory
hotplug times.

As an example, hotplugging 256GB on a 129GB guest took 710s without this
patch, and 21s with it applied.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 36 ---
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 73b06adb6eeb..cfb3ec164f56 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -794,7 +794,7 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-static int resize_hpt_for_hotplug(unsigned long new_mem_size)
+static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
 {
unsigned target_hpt_shift;
 
@@ -803,19 +803,25 @@ static int resize_hpt_for_hotplug(unsigned long 
new_mem_size)
 
target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
 
-   /*
-* To avoid lots of HPT resizes if memory size is fluctuating
-* across a boundary, we deliberately have some hysterisis
-* here: we immediately increase the HPT size if the target
-* shift exceeds the current shift, but we won't attempt to
-* reduce unless the target shift is at least 2 below the
-* current shift
-*/
-   if (target_hpt_shift > ppc64_pft_size ||
-   target_hpt_shift < ppc64_pft_size - 1)
-   return mmu_hash_ops.resize_hpt(target_hpt_shift);
+   if (shrinking) {
 
-   return 0;
+   /*
+* To avoid lots of HPT resizes if memory size is fluctuating
+* across a boundary, we deliberately have some hysterisis
+* here: we immediately increase the HPT size if the target
+* shift exceeds the current shift, but we won't attempt to
+* reduce unless the target shift is at least 2 below the
+* current shift
+*/
+
+   if (target_hpt_shift >= ppc64_pft_size - 1)
+   return 0;
+
+   } else if (target_hpt_shift <= ppc64_pft_size) {
+   return 0;
+   }
+
+   return mmu_hash_ops.resize_hpt(target_hpt_shift);
 }
 
 int hash__create_section_mapping(unsigned long start, unsigned long end,
@@ -828,7 +834,7 @@ int hash__create_section_mapping(unsigned long start, unsigned long end,
return -1;
}
 
-   resize_hpt_for_hotplug(memblock_phys_mem_size());
+   resize_hpt_for_hotplug(memblock_phys_mem_size(), false);
 
rc = htab_bolt_mapping(start, end, __pa(start),
   pgprot_val(prot), mmu_linear_psize,
@@ -847,7 +853,7 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end)
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
 mmu_kernel_ssize);
 
-   if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+   if (resize_hpt_for_hotplug(memblock_phys_mem_size(), true) == -ENOSPC)
pr_warn("Hash collision while resizing HPT\n");
 
return rc;
-- 
2.29.2



[PATCH 0/3] powerpc/mm/hash: Time improvements for memory hot(un)plug

2021-03-11 Thread Leonardo Bras
This patchset intends to reduce the time needed to process memory
hotplug/hotunplug in hash guests.

The first one makes sure guests with a page size over 4k don't need to
go through HPT resize-downs after memory hotplug.

The second and third patches make hotplug / hotunplug perform a single
HPT resize per operation, instead of one for each shift change, or one
for each LMB in case of a resize-down error.

Why isn't the same mechanism used for both memory hotplug and hotunplug?
They have different requirements:

Memory hotplug usually causes HPT resize-ups, which are fine happening
at the start of hotplug, but resize-ups should never be disabled, as
other mechanisms may try to increase memory, hitting issues with an HPT
that is too small.

Memory hotunplug causes HPT resize-downs, which can be disabled (the HPT
will just remain larger for a while), but they need to happen at the end
of a hotunplug operation. If we want to batch it, we need to disable
resize-downs and perform a single one at the end.
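
The hotunplug side of this then boils down to a bracket around the LMB removal
loop. The helper names and the flag below are hypothetical, just to illustrate
the pattern described above, not the patchset's actual interface:

static bool hpt_resize_down_disabled;

void memory_batch_shrink_begin(void)
{
	/* Skip the per-LMB resize-downs while sections are being removed
	 * (resize_hpt_for_hotplug() would check this flag in the sketch). */
	hpt_resize_down_disabled = true;
}

void memory_batch_shrink_end(void)
{
	hpt_resize_down_disabled = false;
	/* A single resize-down for the final memory size, at the very end. */
	resize_hpt_for_hotplug(memblock_phys_mem_size(), true);
}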

Tests done with this patchset in the same machine / guest config:
Starting memory: 129GB, DIMM: 256GB
Before patchset: hotplug = 710s, hotunplug = 621s.
After patchset: hotplug  = 21s, hotunplug = 100s.

Any feedback will be appreciated!
I believe the code may not be placed in the most appropriate files, so
please give some feedback on that.

Best regards,

Leonardo Bras (3):
  powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug
  powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug
  powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

 arch/powerpc/include/asm/book3s/64/hash.h |  4 +
 arch/powerpc/include/asm/sparsemem.h  |  4 +
 arch/powerpc/mm/book3s64/hash_utils.c | 78 +++
 arch/powerpc/mm/book3s64/pgtable.c| 18 +
 .../platforms/pseries/hotplug-memory.c| 22 ++
 5 files changed, 111 insertions(+), 15 deletions(-)

-- 
2.29.2



Re: [PATCH kernel 2/2] powerpc/iommu: Do not immediately panic when failed IOMMU table allocation

2021-02-22 Thread Leonardo Bras
On Mon, 2021-02-22 at 16:24 +1100, Alexey Kardashevskiy wrote:
> 
> On 18/02/2021 06:32, Leonardo Bras wrote:
> > On Tue, 2021-02-16 at 14:33 +1100, Alexey Kardashevskiy wrote:
> > > Most platforms allocate IOMMU table structures (specifically it_map)
> > > at the boot time and when this fails - it is a valid reason for panic().
> > > 
> > > However the powernv platform allocates it_map after a device is returned
> > > to the host OS after being passed through and this happens long after
> > > the host OS booted. It is quite possible to trigger the it_map allocation
> > > panic() and kill the host even though it is not necessary - the host OS
> > > can still use the DMA bypass mode (requires a tiny fraction of it_map's
> > > memory) and even if that fails, the host OS is runnable as it was without
> > > the device for which allocating it_map causes the panic.
> > > 
> > > Instead of immediately crashing in a powernv/ioda2 system, this prints
> > > an error and continues. All other platforms still call panic().
> > > 
> > > Signed-off-by: Alexey Kardashevskiy 
> > 
> > Hello Alexey,
> > 
> > This looks like a good change, that passes panic() decision to platform
> > code. Everything looks pretty straightforward, but I have a question
> > regarding this:
> > 
> > > @@ -1930,16 +1931,16 @@ static long 
> > > pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
> > >   res_start = pe->phb->ioda.m32_pci_base >> 
> > > tbl->it_page_shift;
> > >   res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
> > >   }
> > > - iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
> > > - rc = pnv_pci_ioda2_set_window(>table_group, 0, tbl);
> > > 
> > > + if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end))
> > > + rc = pnv_pci_ioda2_set_window(>table_group, 0, tbl);
> > > + else
> > > + rc = -ENOMEM;
> > >   if (rc) {
> > > - pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
> > > - rc);
> > > + pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", 
> > > rc);
> > >   iommu_tce_table_put(tbl);
> > > - return rc;
> > > + tbl = NULL; /* This clears iommu_table_base below */
> > >   }
> > > -
> > >   if (!pnv_iommu_bypass_disabled)
> > >   pnv_pci_ioda2_set_bypass(pe, true);
> > >   
> > 
> > If I could understand correctly, previously if iommu_init_table() did
> > not panic(), and pnv_pci_ioda2_set_window() returned something other
> > than 0, it would return rc in the if (rc) clause, but now it does not
> > happen anymore, going through if (!pnv_iommu_bypass_disabled) onwards.
> > 
> > Is that desired?
> 
> 
> Yes. A PE (==device, pretty much) has 2 DMA windows:
> - the default one which requires some RAM to operate
> - a bypass mode which tells the hardware that PCI addresses are 
> statically mapped to RAM 1:1.
> 
> This bypass mode does not require extra memory to work and is used in 
> the most cases on the bare metal as long as the device supports 64bit 
> DMA which is everything except GPUs. Since it is cheap to enable and 
> this what we prefer anyway, no urge to fail.
> 
> 
> > As far as I could see, returning rc there seems a good procedure after
> > iommu_init_table returning -ENOMEM.
> 
> This change is intentional and yes it could be done by a separate patch 
> but I figured there is no that much value in splitting.

Ok then, thanks for clarifying.
FWIW:

Reviewed-by: Leonardo Bras 




Re: [PATCH kernel 2/2] powerpc/iommu: Do not immediately panic when failed IOMMU table allocation

2021-02-17 Thread Leonardo Bras
On Tue, 2021-02-16 at 14:33 +1100, Alexey Kardashevskiy wrote:
> Most platforms allocate IOMMU table structures (specifically it_map)
> at the boot time and when this fails - it is a valid reason for panic().
> 
> However the powernv platform allocates it_map after a device is returned
> to the host OS after being passed through and this happens long after
> the host OS booted. It is quite possible to trigger the it_map allocation
> panic() and kill the host even though it is not necessary - the host OS
> can still use the DMA bypass mode (requires a tiny fraction of it_map's
> memory) and even if that fails, the host OS is runnable as it was without
> the device for which allocating it_map causes the panic.
> 
> Instead of immediately crashing in a powernv/ioda2 system, this prints
> an error and continues. All other platforms still call panic().
> 
> Signed-off-by: Alexey Kardashevskiy 

Hello Alexey,

This looks like a good change that passes the panic() decision to platform
code. Everything looks pretty straightforward, but I have a question
regarding this:

> @@ -1930,16 +1931,16 @@ static long pnv_pci_ioda2_setup_default_config(struct 
> pnv_ioda_pe *pe)
>   res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
>   res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
>   }
> - iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
> - rc = pnv_pci_ioda2_set_window(>table_group, 0, tbl);
> 
> + if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end))
> + rc = pnv_pci_ioda2_set_window(>table_group, 0, tbl);
> + else
> + rc = -ENOMEM;
>   if (rc) {
> - pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
> - rc);
> + pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", 
> rc);
>   iommu_tce_table_put(tbl);
> - return rc;
> + tbl = NULL; /* This clears iommu_table_base below */
>   }
> -
>   if (!pnv_iommu_bypass_disabled)
>   pnv_pci_ioda2_set_bypass(pe, true);
>  
> 

If I understand correctly, previously if iommu_init_table() did not
panic() and pnv_pci_ioda2_set_window() returned something other than 0,
it would return rc in the if (rc) clause; now that no longer happens,
and execution continues through if (!pnv_iommu_bypass_disabled) onwards.

Is that desired?

As far as I can see, returning rc there seems like a good procedure after
iommu_init_table() returns -ENOMEM.

Best regards, 
Leonardo Bras  






Re: [PATCH kernel 1/2] powerpc/iommu: Allocate it_map by vmalloc

2021-02-17 Thread Leonardo Bras
On Tue, 2021-02-16 at 14:33 +1100, Alexey Kardashevskiy wrote:
> The IOMMU table uses the it_map bitmap to keep track of allocated DMA
> pages. This has always been a contiguous array allocated at either
> the boot time or when a passed through device is returned to the host OS.
> The it_map memory is allocated by alloc_pages() which allocates
> contiguous physical memory.
> 
> Such allocation method occasionally creates a problem when there is
> no big chunk of memory available (no free memory or too fragmented).
> On powernv/ioda2 the default DMA window requires 16MB for it_map.
> 
> This replaces alloc_pages_node() with vzalloc_node() which allocates
> contiguous block but in virtual memory. This should reduce changes of
> failure but should not cause other behavioral changes as it_map is only
> used by the kernel's DMA hooks/api when MMU is on.
> 
> Signed-off-by: Alexey Kardashevskiy 

It looks like a very good change, and it also makes the code much simpler to read.

FWIW:

Reviewed-by: Leonardo Bras 

> ---
>  arch/powerpc/kernel/iommu.c | 15 +++
>  1 file changed, 3 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index c00214a4355c..8eb6eb0afa97 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -719,7 +719,6 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid,
>  {
>   unsigned long sz;
>   static int welcomed = 0;
> - struct page *page;
>   unsigned int i;
>   struct iommu_pool *p;
>  
> 
> 
> 
> @@ -728,11 +727,9 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid,
>   /* number of bytes needed for the bitmap */
>   sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
>  
> 
> 
> 
> - page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
> - if (!page)
> + tbl->it_map = vzalloc_node(sz, nid);
> + if (!tbl->it_map)
>   panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
> - tbl->it_map = page_address(page);
> - memset(tbl->it_map, 0, sz);
>  
> 
> 
> 
>   iommu_table_reserve_pages(tbl, res_start, res_end);
>  
> 
> 
> 
> @@ -774,8 +771,6 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid,
>  
> 
> 
> 
>  static void iommu_table_free(struct kref *kref)
>  {
> - unsigned long bitmap_sz;
> - unsigned int order;
>   struct iommu_table *tbl;
>  
> 
> 
> 
>   tbl = container_of(kref, struct iommu_table, it_kref);
> @@ -796,12 +791,8 @@ static void iommu_table_free(struct kref *kref)
>   if (!bitmap_empty(tbl->it_map, tbl->it_size))
>   pr_warn("%s: Unexpected TCEs\n", __func__);
>  
> 
> 
> 
> - /* calculate bitmap size in bytes */
> - bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
> -
>   /* free bitmap */
> - order = get_order(bitmap_sz);
> - free_pages((unsigned long) tbl->it_map, order);
> + vfree(tbl->it_map);
>  
> 
> 
> 
>   /* free table */
>   kfree(tbl);




Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-08 Thread Leonardo Bras
Hello Nick,

On Sat, 2021-02-06 at 13:03 +1000, Nicholas Piggin wrote:
> Excerpts from Leonardo Bras's message of February 5, 2021 5:01 pm:
> > Hey Nick, thanks for reviewing :)
> > 
> > On Fri, 2021-02-05 at 16:28 +1000, Nicholas Piggin wrote:
> > > Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
> > > > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > > > After exitting guest, the register is reverted to it's original value.
> > > > 
> > > > If one tries to get the timestamp from host between those changes, it
> > > > will present an incorrect value.
> > > > 
> > > > An example would be trying to add a tracepoint in
> > > > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > > > acquired could actually cause the host to crash.
> > > > 
> > > > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > > > get the correct timestamp.
> > > 
> > > Ouch. Not sure how reasonable it is to half switch into guest registers 
> > > and expect to call into the wider kernel, fixing things up as we go. 
> > > What if mftb is used in other places?
> > 
> > IIUC, the CPU is not supposed to call anything as host between guest
> > entry and guest exit, except guest-related cases, like
> 
> When I say "call", I'm including tracing in that. If a function is not 
> marked as no trace, then it will call into the tracing subsystem.
> 
> > kvmppc_guest_entry_inject_int(), but anyway, if something calls mftb it
> > will still get the same value as before.
> 
> Right, so it'll be out of whack again.
> 
> > This is only supposed to change stuff that depends on sched_clock, like
> > Tracepoints, that can happen in those exceptions.
> 
> If they depend on sched_clock that's one thing. Do they definitely have 
> no dependencies on mftb from other calls?

We could change that in get_tb() or mftb() in timebase.h, which would
have a broader reach, but would not cover mftb uses in asm code.

> > > Especially as it doesn't seem like there is a reason that function _has_
> > > to be called after the timebase is switched to guest, that's just how 
> > > the code is structured.
> > 
> > Correct, but if called, like in rb routines, used by tracepoints, the
> > difference between last tb and current (lower) tb may cause the CPU to
> > trap PROGRAM exception, crashing host. 
> 
> Yes, so I agree with Michael any function that is involved when we begin 
> to switch into guest context (or have not completed switching back to 
> host going the other way) should be marked as no trace (noinstr even, 
> perhaps).

Sure, that would avoid having to get paca->tb_offset for every mftb()
call, and avoid inconsistencies when different ways of getting the time
are used in the code.

On the other hand, it would make it very hard to debug functions like
kvmppc_guest_entry_inject_int() as I am doing right now.

> 
> > > As a local hack to work out a bug okay. If you really need it upstream 
> > > could you put it under a debug config option?
> > 
> > You mean something that is automatically selected whenever those
> > configs are enabled? 
> > 
> > CONFIG_TRACEPOINT && CONFIG_KVM_BOOK3S_HANDLER && CONFIG_PPC_BOOK3S_64
> > 
> > Or something the user need to select himself in menuconfig?
> 
> Yeah I meant a default n thing under powerpc kernel debugging somewhere.

So, IIUC, all we can do is split this into 2 changes:
1 - Adding notrace to those functions
2 - Introducing a kernel debug config that reverts (1) and 'fixes' mftb

If that's correct, I have some ideas we can use.

For the debug option, should we add the offset in get_tb() or mftb()?

Another option would be adding this tb_offset only in the routines used
by tracing, but that would probably mean adding a function in
arch-generic code. Still an option.
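
To make the debug-config idea concrete, one possible shape for it, purely as an
illustration (the config name is hypothetical and this is not a settled proposal):

/* Sketch only: hypothetical CONFIG_PPC_KVM_TB_DEBUG option. */
static inline u64 get_tb(void)
{
	u64 tb = mftb();

#if defined(CONFIG_PPC_KVM_TB_DEBUG) && defined(CONFIG_KVM_BOOK3S_HANDLER)
	/* Undo the guest timebase offset applied around guest entry/exit. */
	tb -= local_paca->kvm_hstate.tb_offset;
#endif
	return tb;
}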

What do you think?

> 
> Thanks,
> Nick

Thank you!
Leonardo Bras



Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-05 Thread Leonardo Bras
Hello Fabiano, 
Thanks for reviewing! 
(answers inline)

On Fri, 2021-02-05 at 10:09 -0300, Fabiano Rosas wrote:
> Leonardo Bras  writes:
> 
> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > After exitting guest, the register is reverted to it's original value.
> > 
> > If one tries to get the timestamp from host between those changes, it
> > will present an incorrect value.
> > 
> > An example would be trying to add a tracepoint in
> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > acquired could actually cause the host to crash.
> > 
> > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > get the correct timestamp.
> > 
> > Signed-off-by: Leonardo Bras 
> > Suggested-by: Paul Mackerras 
> > ---
> > Changes since v1:
> > - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> >   CONFIG_PPC_BOOK3S_64 are defined.
> > ---
> >  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> >  arch/powerpc/kernel/asm-offsets.c | 1 +
> >  arch/powerpc/kernel/time.c| 8 +++-
> >  arch/powerpc/kvm/book3s_hv.c  | 2 ++
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
> >  5 files changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
> > b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > index 078f4648ea27..e2c12a10eed2 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> >     u64 cfar;
> >     u64 ppr;
> >     u64 host_fscr;
> > +   u64 tb_offset;  /* Timebase offset: keeps correct
> > timebase while on guest */
> 
> Couldn't you use the vc->tb_offset_applied for this? We have a reference
> for the vcore in the hstate already.

But it's a pointer, which means we would have to keep checking for NULL
every time we need sched_clock().
Potentially it would cost a cache miss for the PACA region that contains
vc, and another for the part of *vc that contains tb_offset_applied,
instead of only one miss for the PACA region that contains tb_offset.

On the other hand, it got me thinking: if the offset is applied per cpu,
why don't we keep this info only in the PACA, instead of in vc?
It could be a general way to get an offset applied for any purpose and
still get sched_clock() right.
(Not that I have any idea of any other purpose we could use it for.)
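
Roughly, the two options inside sched_clock() would look like this (the vcore
pointer field name below is assumed from the existing hstate layout, and the
snippet is only meant to illustrate the extra indirection):

	/* Via the vcore pointer kept in hstate: NULL check + extra cache line. */
	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
	u64 off = vc ? vc->tb_offset_applied : 0;

	/* Via a plain u64 copied into the PACA (what this patch does). */
	u64 off2 = local_paca->kvm_hstate.tb_offset;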

Best regards!
Leonardo Bras

> 
> >  #endif
> >  };
> > 
> > diff --git a/arch/powerpc/kernel/asm-offsets.c 
> > b/arch/powerpc/kernel/asm-offsets.c
> > index b12d7c049bfe..0beb8fdc6352 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -706,6 +706,7 @@ int main(void)
> >     HSTATE_FIELD(HSTATE_CFAR, cfar);
> >     HSTATE_FIELD(HSTATE_PPR, ppr);
> >     HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> > +   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> >  #endif /* CONFIG_PPC_BOOK3S_64 */
> > 
> >  #else /* CONFIG_PPC_BOOK3S */
> > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> > index 67feb3524460..f27f0163792b 100644
> > --- a/arch/powerpc/kernel/time.c
> > +++ b/arch/powerpc/kernel/time.c
> > @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
> >   */
> >  notrace unsigned long long sched_clock(void)
> >  {
> > -   return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> > +   u64 tb = get_tb() - boot_tb;
> > +
> > +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> > +   tb -= local_paca->kvm_hstate.tb_offset;
> > +#endif
> > +
> > +   return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
> >  }
> > 
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index b3731572295e..c08593c63353 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> > *vcpu, u64 time_limit,
> >     if ((tb & 0xff) < (new_tb & 0xff))
> >     mtspr(SPRN_TBU40, new_tb + 0x100);
> >     vc->tb_offset_applied = vc->tb_offset;
> > +   local_paca->kvm_hstate.tb_offset = vc->tb_offset;
> >     }
> > 
> >     if (vc->pcr)
> > @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> > *vcpu, u64 time_limit,
> &

Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-04 Thread Leonardo Bras
Hey Nick, thanks for reviewing :)

On Fri, 2021-02-05 at 16:28 +1000, Nicholas Piggin wrote:
> Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > After exitting guest, the register is reverted to it's original value.
> > 
> > If one tries to get the timestamp from host between those changes, it
> > will present an incorrect value.
> > 
> > An example would be trying to add a tracepoint in
> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > acquired could actually cause the host to crash.
> > 
> > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > get the correct timestamp.
> 
> Ouch. Not sure how reasonable it is to half switch into guest registers 
> and expect to call into the wider kernel, fixing things up as we go. 
> What if mftb is used in other places?

IIUC, the CPU is not supposed to call anything as host between guest
entry and guest exit, except guest-related cases, like
kvmppc_guest_entry_inject_int(), but anyway, if something calls mftb it
will still get the same value as before.

This is only supposed to change stuff that depends on sched_clock, like
Tracepoints, that can happen in those exceptions.


> Especially as it doesn't seem like there is a reason that function _has_
> to be called after the timebase is switched to guest, that's just how 
> the code is structured.

Correct, but if called, like in rb routines, used by tracepoints, the
difference between last tb and current (lower) tb may cause the CPU to
trap PROGRAM exception, crashing host. 

> As a local hack to work out a bug okay. If you really need it upstream 
> could you put it under a debug config option?

You mean something that is automatically selected whenever those
configs are enabled? 

CONFIG_TRACEPOINT && CONFIG_KVM_BOOK3S_HANDLER && CONFIG_PPC_BOOK3S_64

Or something the user need to select himself in menuconfig?

> 
> Thanks,
> Nick
> 

Thank you!
Leonardo Bras

> > Signed-off-by: Leonardo Bras 
> > Suggested-by: Paul Mackerras 
> > ---
> > Changes since v1:
> > - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> >   CONFIG_PPC_BOOK3S_64 are defined.
> > ---
> >  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> >  arch/powerpc/kernel/asm-offsets.c | 1 +
> >  arch/powerpc/kernel/time.c| 8 +++-
> >  arch/powerpc/kvm/book3s_hv.c  | 2 ++
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
> >  5 files changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
> > b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > index 078f4648ea27..e2c12a10eed2 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> >     u64 cfar;
> >     u64 ppr;
> >     u64 host_fscr;
> > +   u64 tb_offset;  /* Timebase offset: keeps correct timebase 
> > while on guest */
> >  #endif
> >  };
> >  
> > diff --git a/arch/powerpc/kernel/asm-offsets.c 
> > b/arch/powerpc/kernel/asm-offsets.c
> > index b12d7c049bfe..0beb8fdc6352 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -706,6 +706,7 @@ int main(void)
> >     HSTATE_FIELD(HSTATE_CFAR, cfar);
> >     HSTATE_FIELD(HSTATE_PPR, ppr);
> >     HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> > +   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> >  #endif /* CONFIG_PPC_BOOK3S_64 */
> >  

[PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-04 Thread Leonardo Bras
Before guest entry, the TBU40 register is changed to reflect the guest timebase.
After exiting the guest, the register is reverted to its original value.

If one tries to get the timestamp from the host between those changes, it
will present an incorrect value.

An example would be trying to add a tracepoint in
kvmppc_guest_entry_inject_int(), which depending on last tracepoint
acquired could actually cause the host to crash.

Save the Timebase Offset to PACA and use it on sched_clock() to always
get the correct timestamp.

Signed-off-by: Leonardo Bras 
Suggested-by: Paul Mackerras 
---
Changes since v1:
- Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
  CONFIG_PPC_BOOK3S_64 are defined.
---
 arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
 arch/powerpc/kernel/asm-offsets.c | 1 +
 arch/powerpc/kernel/time.c| 8 +++-
 arch/powerpc/kvm/book3s_hv.c  | 2 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
 5 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 078f4648ea27..e2c12a10eed2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -131,6 +131,7 @@ struct kvmppc_host_state {
u64 cfar;
u64 ppr;
u64 host_fscr;
+   u64 tb_offset;  /* Timebase offset: keeps correct timebase while on guest */
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index b12d7c049bfe..0beb8fdc6352 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -706,6 +706,7 @@ int main(void)
HSTATE_FIELD(HSTATE_CFAR, cfar);
HSTATE_FIELD(HSTATE_PPR, ppr);
HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
+   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 67feb3524460..f27f0163792b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
  */
 notrace unsigned long long sched_clock(void)
 {
-   return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
+   u64 tb = get_tb() - boot_tb;
+
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
+   tb -= local_paca->kvm_hstate.tb_offset;
+#endif
+
+   return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
 }
 
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b3731572295e..c08593c63353 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
if ((tb & 0xff) < (new_tb & 0xff))
mtspr(SPRN_TBU40, new_tb + 0x100);
vc->tb_offset_applied = vc->tb_offset;
+   local_paca->kvm_hstate.tb_offset = vc->tb_offset;
}
 
if (vc->pcr)
@@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
if ((tb & 0xff) < (new_tb & 0xff))
mtspr(SPRN_TBU40, new_tb + 0x100);
vc->tb_offset_applied = 0;
+   local_paca->kvm_hstate.tb_offset = 0;
}
 
mtspr(SPRN_HDEC, 0x7fff);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b73140607875..8f7a9f7f4ee6 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
cmpdi   r8,0
beq 37f
std r8, VCORE_TB_OFFSET_APPL(r5)
+   std r8, HSTATE_TB_OFFSET(r13)
mftbr6  /* current host timebase */
add r8,r8,r6
mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
@@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
beq 17f
li  r0, 0
std r0, VCORE_TB_OFFSET_APPL(r5)
+   std r0, HSTATE_TB_OFFSET(r13)
mftbr6  /* current guest timebase */
subfr8,r8,r6
mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
-- 
2.29.2



[PATCH 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-04 Thread Leonardo Bras
Before guest entry, the TBU40 register is changed to reflect the guest timebase.
After exiting the guest, the register is reverted to its original value.

If one tries to get the timestamp from the host between those changes, it
will present an incorrect value.

An example would be trying to add a tracepoint in
kvmppc_guest_entry_inject_int(), which depending on last tracepoint
acquired could actually cause the host to crash.

Save the Timebase Offset to PACA and use it on sched_clock() to always
get the correct timestamp.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
 arch/powerpc/kernel/asm-offsets.c | 1 +
 arch/powerpc/kernel/time.c| 3 ++-
 arch/powerpc/kvm/book3s_hv.c  | 2 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
 5 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 078f4648ea27..e2c12a10eed2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -131,6 +131,7 @@ struct kvmppc_host_state {
u64 cfar;
u64 ppr;
u64 host_fscr;
+   u64 tb_offset;  /* Timebase offset: keeps correct timebase while on guest */
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index b12d7c049bfe..0beb8fdc6352 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -706,6 +706,7 @@ int main(void)
HSTATE_FIELD(HSTATE_CFAR, cfar);
HSTATE_FIELD(HSTATE_PPR, ppr);
HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
+   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 67feb3524460..adf6648e3572 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -699,7 +699,8 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
  */
 notrace unsigned long long sched_clock(void)
 {
-   return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
+   return mulhdu(get_tb() - boot_tb - local_paca->kvm_hstate.tb_offset, tb_to_ns_scale)
+   << tb_to_ns_shift;
 }
 
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b3731572295e..c08593c63353 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
if ((tb & 0xff) < (new_tb & 0xff))
mtspr(SPRN_TBU40, new_tb + 0x100);
vc->tb_offset_applied = vc->tb_offset;
+   local_paca->kvm_hstate.tb_offset = vc->tb_offset;
}
 
if (vc->pcr)
@@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
if ((tb & 0xff) < (new_tb & 0xff))
mtspr(SPRN_TBU40, new_tb + 0x100);
vc->tb_offset_applied = 0;
+   local_paca->kvm_hstate.tb_offset = 0;
}
 
mtspr(SPRN_HDEC, 0x7fff);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b73140607875..8f7a9f7f4ee6 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
cmpdi   r8,0
beq 37f
std r8, VCORE_TB_OFFSET_APPL(r5)
+   std r8, HSTATE_TB_OFFSET(r13)
mftbr6  /* current host timebase */
add r8,r8,r6
mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
@@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
beq 17f
li  r0, 0
std r0, VCORE_TB_OFFSET_APPL(r5)
+   std r0, HSTATE_TB_OFFSET(r13)
mftbr6  /* current guest timebase */
subfr8,r8,r6
mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
-- 
2.29.2



[PATCH v2 1/1] powerpc/kvm: Fix mask size for emulated msgsndp

2020-12-08 Thread Leonardo Bras
According to ISA v3.1 and ISA v3.0b, msgsndp is described as splitting RB into:
msgtype <- (RB) 32:36
payload <- (RB) 37:63
t   <- (RB) 57:63

The current way of getting 'msgtype' and 't' misses their MSB:
msgtype: ((arg >> 27) & 0xf) : Gets (RB) 33:36, missing bit 32
t:   (arg &= 0x3f)   : Gets (RB) 58:63, missing bit 57
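
To make the bit arithmetic explicit (this is only a worked illustration of the
MSB0 numbering, not part of the patch): in ISA numbering, bit 0 is the most
significant bit of the 64-bit RB, so a field spanning bits m:n is (n - m + 1)
bits wide and sits (63 - n) bits above the register's least significant bit:

#define MSB0_FIELD(reg, m, n) \
	(((reg) >> (63 - (n))) & ((1ULL << ((n) - (m) + 1)) - 1))

msgtype = MSB0_FIELD(arg, 32, 36);	/* 5 bits:  (arg >> 27) & 0x1f */
payload = MSB0_FIELD(arg, 37, 63);	/* 27 bits: arg & 0x7ffffff    */
t       = MSB0_FIELD(arg, 57, 63);	/* 7 bits:  arg & 0x7f         */

The old masks (0xf and 0x3f) only cover 4 and 6 bits, so they drop bit 32 of
'msgtype' and bit 57 of 't'.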

Fix this by applying the correct masks.

Signed-off-by: Leonardo Bras 
---
Changes since v1:
- Commit message 's/LSB/MSB/', because ISA ordering is big-endian.

 arch/powerpc/kvm/book3s_hv.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e3b1839fc251..5af0a429cee8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1241,9 +1241,9 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
switch (get_xop(inst)) {
case OP_31_XOP_MSGSNDP:
arg = kvmppc_get_gpr(vcpu, rb);
-   if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+   if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
break;
-   arg &= 0x3f;
+   arg &= 0x7f;
if (arg >= kvm->arch.emul_smt_mode)
break;
tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
@@ -1256,7 +1256,7 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
break;
case OP_31_XOP_MSGCLRP:
arg = kvmppc_get_gpr(vcpu, rb);
-   if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+   if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
break;
vcpu->arch.vcore->dpdes = 0;
vcpu->arch.doorbell_request = 0;
-- 
2.25.4


