[patch 10/18] iommu/vt-d: Adjust system_state checks

2017-05-14 Thread Thomas Gleixner
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.

Adjust the system_state checks in dmar_parse_one_atsr() and
dmar_iommu_notify_scope_dev() to handle the extra states.
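
For reference, the resulting state ordering looks roughly like this
(sketch of the enum in include/linux/kernel.h as of this series, not the
literal code):

enum system_states {
	SYSTEM_BOOTING,
	SYSTEM_SCHEDULING,	/* new: scheduler up, boot not finished */
	SYSTEM_RUNNING,
	SYSTEM_HALT,
	SYSTEM_POWER_OFF,
	SYSTEM_RESTART,
};

With the extra state, "system_state != SYSTEM_BOOTING" is already true
while the kernel is still booting up, so the checks are converted to
"system_state >= SYSTEM_RUNNING" to keep covering the whole boot phase.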

Signed-off-by: Thomas Gleixner 
Cc: David Woodhouse 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/intel-iommu.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4312,7 +4312,7 @@ int dmar_parse_one_atsr(struct acpi_dmar
struct acpi_dmar_atsr *atsr;
struct dmar_atsr_unit *atsru;
 
-   if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
+   if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
return 0;
 
atsr = container_of(hdr, struct acpi_dmar_atsr, header);
@@ -4562,7 +4562,7 @@ int dmar_iommu_notify_scope_dev(struct d
struct acpi_dmar_atsr *atsr;
struct acpi_dmar_reserved_memory *rmrr;
 
-   if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
+   if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
return 0;
 
list_for_each_entry(rmrru, &dmar_rmrr_units, list) {




[patch 11/18] iommu/of: Adjust system_state check

2017-05-14 Thread Thomas Gleixner
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.

Adjust the system_state check in of_iommu_driver_present() to handle the
extra states.

Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/of_iommu.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -103,7 +103,7 @@ static bool of_iommu_driver_present(stru
 * it never will be. We don't want to defer indefinitely, nor attempt
 * to dereference __iommu_of_table after it's been freed.
 */
-   if (system_state > SYSTEM_BOOTING)
+   if (system_state >= SYSTEM_RUNNING)
return false;
 
return of_match_node(&__iommu_of_table, np);




[patch V2 10/17] iommu/vt-d: Adjust system_state checks

2017-05-16 Thread Thomas Gleixner
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.

Adjust the system_state checks in dmar_parse_one_atsr() and
dmar_iommu_notify_scope_dev() to handle the extra states.

Signed-off-by: Thomas Gleixner 
Acked-by: Joerg Roedel 
Cc: David Woodhouse 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/intel-iommu.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4312,7 +4312,7 @@ int dmar_parse_one_atsr(struct acpi_dmar
struct acpi_dmar_atsr *atsr;
struct dmar_atsr_unit *atsru;
 
-   if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
+   if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
return 0;
 
atsr = container_of(hdr, struct acpi_dmar_atsr, header);
@@ -4562,7 +4562,7 @@ int dmar_iommu_notify_scope_dev(struct d
struct acpi_dmar_atsr *atsr;
struct acpi_dmar_reserved_memory *rmrr;
 
-   if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
+   if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
return 0;
 
list_for_each_entry(rmrru, &dmar_rmrr_units, list) {




[patch V2 11/17] iommu/of: Adjust system_state check

2017-05-16 Thread Thomas Gleixner
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.

Adjust the system_state check in of_iommu_driver_present() to handle the
extra states.

Signed-off-by: Thomas Gleixner 
Acked-by: Joerg Roedel 
Acked-by: Robin Murphy 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/of_iommu.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -103,7 +103,7 @@ static bool of_iommu_driver_present(stru
 * it never will be. We don't want to defer indefinitely, nor attempt
 * to dereference __iommu_of_table after it's been freed.
 */
-   if (system_state > SYSTEM_BOOTING)
+   if (system_state >= SYSTEM_RUNNING)
return false;
 
return of_match_node(&__iommu_of_table, np);




[patch 12/55] iommu/amd: Use named irq domain interface

2017-06-19 Thread Thomas Gleixner
Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/amd_iommu.c |   13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4395,13 +4395,20 @@ static struct irq_chip amd_ir_chip = {
 
 int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
 {
-   iommu->ir_domain = irq_domain_add_tree(NULL, &amd_ir_domain_ops, iommu);
+   struct fwnode_handle *fn;
+
+   fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
+   if (!fn)
+   return -ENOMEM;
+   iommu->ir_domain = irq_domain_create_tree(fn, &amd_ir_domain_ops, iommu);
+   kfree(fn);
if (!iommu->ir_domain)
return -ENOMEM;
 
iommu->ir_domain->parent = arch_get_ir_parent_domain();
-   iommu->msi_domain = arch_create_msi_irq_domain(iommu->ir_domain);
-
+   iommu->msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain,
+"AMD-IR-MSI",
+iommu->index);
return 0;
 }
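
For illustration (hypothetical output, assuming GENERIC_IRQ_DEBUGFS is
enabled): the formerly anonymous domains now show up with stable names
in the irq debugfs, e.g.

  # ls /sys/kernel/debug/irq/domains/
  ... AMD-IR-0 AMD-IR-MSI-0 ...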
 




[patch 11/55] iommu/vt-d: Use named irq domain interface

2017-06-19 Thread Thomas Gleixner
Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/intel_irq_remapping.c |   22 --
 1 file changed, 16 insertions(+), 6 deletions(-)

--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -500,8 +500,9 @@ static void iommu_enable_irq_remapping(s
 static int intel_setup_irq_remapping(struct intel_iommu *iommu)
 {
struct ir_table *ir_table;
-   struct page *pages;
+   struct fwnode_handle *fn;
unsigned long *bitmap;
+   struct page *pages;
 
if (iommu->ir_table)
return 0;
@@ -525,15 +526,24 @@ static int intel_setup_irq_remapping(str
goto out_free_pages;
}
 
-   iommu->ir_domain = irq_domain_add_hierarchy(arch_get_ir_parent_domain(),
-   0, INTR_REMAP_TABLE_ENTRIES,
-   NULL, &intel_ir_domain_ops,
-   iommu);
+   fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
+   if (!fn)
+   goto out_free_bitmap;
+
+   iommu->ir_domain =
+   irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
+   0, INTR_REMAP_TABLE_ENTRIES,
+   fn, &intel_ir_domain_ops,
+   iommu);
+   kfree(fn);
if (!iommu->ir_domain) {
pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
goto out_free_bitmap;
}
-   iommu->ir_msi_domain = arch_create_msi_irq_domain(iommu->ir_domain);
+   iommu->ir_msi_domain =
+   arch_create_remap_msi_irq_domain(iommu->ir_domain,
+"INTEL-IR-MSI",
+iommu->seq_id);
 
ir_table->base = page_address(pages);
ir_table->bitmap = bitmap;




[patch 02/55] iommu/amd: Add name to irq chip

2017-06-19 Thread Thomas Gleixner
Add the missing name, so debugging will work properly.
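
For illustration (hypothetical excerpt, not part of this patch): the
chip name is what the irq core prints in its debug output, e.g.

  # cat /sys/kernel/debug/irq/irqs/42	(example irq number)
  ...
  chip:   AMD-IR
  ...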

Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/amd_iommu.c |9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4386,10 +4386,11 @@ static void ir_compose_msi_msg(struct ir
 }
 
 static struct irq_chip amd_ir_chip = {
-   .irq_ack = ir_ack_apic_edge,
-   .irq_set_affinity = amd_ir_set_affinity,
-   .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
-   .irq_compose_msi_msg = ir_compose_msi_msg,
+   .name   = "AMD-IR",
+   .irq_ack= ir_ack_apic_edge,
+   .irq_set_affinity   = amd_ir_set_affinity,
+   .irq_set_vcpu_affinity  = amd_ir_set_vcpu_affinity,
+   .irq_compose_msi_msg= ir_compose_msi_msg,
 };
 
 int amd_iommu_create_irq_domain(struct amd_iommu *iommu)




[patch 03/55] iommu/vt-d: Add name to irq chip

2017-06-19 Thread Thomas Gleixner
Add the missing name, so debugging will work properly.

Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/intel_irq_remapping.c |9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1205,10 +1205,11 @@ static int intel_ir_set_vcpu_affinity(st
 }
 
 static struct irq_chip intel_ir_chip = {
-   .irq_ack = ir_ack_apic_edge,
-   .irq_set_affinity = intel_ir_set_affinity,
-   .irq_compose_msi_msg = intel_ir_compose_msi_msg,
-   .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
+   .name   = "INTEL-IR",
+   .irq_ack= ir_ack_apic_edge,
+   .irq_set_affinity   = intel_ir_set_affinity,
+   .irq_compose_msi_msg= intel_ir_compose_msi_msg,
+   .irq_set_vcpu_affinity  = intel_ir_set_vcpu_affinity,
 };
 
 static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,




[patch 10/55] x86/msi: Provide new iommu irqdomain interface

2017-06-19 Thread Thomas Gleixner
Provide a new interface for creating the iommu remapping domains, so that
the caller can supply a name and an id in order to create named irqdomains.

Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 arch/x86/include/asm/irq_remapping.h |2 ++
 arch/x86/kernel/apic/msi.c   |   15 +++
 2 files changed, 17 insertions(+)

--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -56,6 +56,8 @@ irq_remapping_get_irq_domain(struct irq_
 
 /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
 extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent);
+extern struct irq_domain *
+arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id);
 
 /* Get parent irqdomain for interrupt remapping irqdomain */
 static inline struct irq_domain *arch_get_ir_parent_domain(void)
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -167,10 +167,25 @@ static struct msi_domain_info pci_msi_ir
.handler_name   = "edge",
 };
 
+struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
+   const char *name, int id)
+{
+   struct fwnode_handle *fn;
+   struct irq_domain *d;
+
+   fn = irq_domain_alloc_named_id_fwnode(name, id);
+   if (!fn)
+   return NULL;
+   d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
+   kfree(fn);
+   return d;
+}
+
 struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
 {
return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent);
 }
+
 #endif
 
 #ifdef CONFIG_DMAR_TABLE




Re: [PATCH v7 06/36] x86/mm: Add Secure Memory Encryption (SME) support

2017-06-20 Thread Thomas Gleixner
On Fri, 16 Jun 2017, Tom Lendacky wrote:
>  
> +config ARCH_HAS_MEM_ENCRYPT
> + def_bool y
> + depends on X86

That one is silly. The config switch is in the x86 Kconfig file, so X86 is
on. If you intended to move this to some generic place outside of
x86/Kconfig then this should be

config ARCH_HAS_MEM_ENCRYPT
bool

and x86/Kconfig should have

select ARCH_HAS_MEM_ENCRYPT

and that should be selected by AMD_MEM_ENCRYPT

> +config AMD_MEM_ENCRYPT
> + bool "AMD Secure Memory Encryption (SME) support"
> + depends on X86_64 && CPU_SUP_AMD
> + ---help---
> +   Say yes to enable support for the encryption of system memory.
> +   This requires an AMD processor that supports Secure Memory
> +   Encryption (SME).
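
Putting that together, the suggested structure would look something like
this (sketch only, exact placement up to the author):

# in some generic Kconfig:
config ARCH_HAS_MEM_ENCRYPT
	bool

# in arch/x86/Kconfig:
config AMD_MEM_ENCRYPT
	bool "AMD Secure Memory Encryption (SME) support"
	depends on X86_64 && CPU_SUP_AMD
	select ARCH_HAS_MEM_ENCRYPT
	---help---
	  Say yes to enable support for the encryption of system memory.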

Thanks,

tglx


Re: [PATCH v7 07/36] x86/mm: Don't use phys_to_virt in ioremap() if SME is active

2017-06-20 Thread Thomas Gleixner
On Fri, 16 Jun 2017, Tom Lendacky wrote:

> Currently there is a check if the address being mapped is in the ISA
> range (is_ISA_range()), and if it is then phys_to_virt() is used to
> perform the mapping.  When SME is active, however, this will result
> in the mapping having the encryption bit set when it is expected that
> an ioremap() should not have the encryption bit set. So only use the
> phys_to_virt() function if SME is not active.

This does not make sense to me. What the heck has phys_to_virt() to do with
the encryption bit. Especially why would the encryption bit be set on that
mapping in the first place?

I'm probably missing something, but this wants some coherent explanation
understandable by mere mortals both in the changelog and the code comment.

Thanks,

tglx


Re: [PATCH v7 08/36] x86/mm: Add support to enable SME in early boot processing

2017-06-21 Thread Thomas Gleixner
On Fri, 16 Jun 2017, Tom Lendacky wrote:
> diff --git a/arch/x86/include/asm/mem_encrypt.h 
> b/arch/x86/include/asm/mem_encrypt.h
> index a105796..988b336 100644
> --- a/arch/x86/include/asm/mem_encrypt.h
> +++ b/arch/x86/include/asm/mem_encrypt.h
> @@ -15,16 +15,24 @@
>  
>  #ifndef __ASSEMBLY__
>  
> +#include 
> +
>  #ifdef CONFIG_AMD_MEM_ENCRYPT
>  
>  extern unsigned long sme_me_mask;
>  
> +void __init sme_enable(void);
> +
>  #else/* !CONFIG_AMD_MEM_ENCRYPT */
>  
>  #define sme_me_mask  0UL
>  
> +static inline void __init sme_enable(void) { }
> +
>  #endif   /* CONFIG_AMD_MEM_ENCRYPT */
>  
> +unsigned long sme_get_me_mask(void);

Why is this an unconditional function? Isn't the mask simply 0 when the MEM
ENCRYPT support is disabled?

> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index 6225550..ef12729 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -78,7 +78,29 @@ startup_64:
>   call__startup_64
>   popq%rsi
>  
> -	movq	$(early_top_pgt - __START_KERNEL_map), %rax
> + /*
> +  * Encrypt the kernel if SME is active.
> +  * The real_mode_data address is in %rsi and that register can be
> +  * clobbered by the called function so be sure to save it.
> +  */
> +	push	%rsi
> +	call	sme_encrypt_kernel
> +	pop	%rsi

That does not make any sense. Neither the call to sme_encrypt_kernel() nor
the following call to sme_get_me_mask().

__startup_64() is already C code, so why can't you simply call that from
__startup_64() in C and return the mask from there?

> @@ -98,7 +120,20 @@ ENTRY(secondary_startup_64)
>   /* Sanitize CPU configuration */
>   call verify_cpu
>  
> -	movq	$(init_top_pgt - __START_KERNEL_map), %rax
> + /*
> +  * Get the SME encryption mask.
> +  *  The encryption mask will be returned in %rax so we do an ADD
> +  *  below to be sure that the encryption mask is part of the
> +  *  value that will stored in %cr3.
> +  *
> +  * The real_mode_data address is in %rsi and that register can be
> +  * clobbered by the called function so be sure to save it.
> +  */
> +	push	%rsi
> +	call	sme_get_me_mask
> +	pop	%rsi

Do we really need a call here? The mask is established at this point, so
it's either 0 when the encryption stuff is not compiled in or it can be
retrieved from a variable which is accessible at this point.
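
I.e. something like the below should be sufficient (sketch only;
sme_me_mask is a variable only when CONFIG_AMD_MEM_ENCRYPT=y, otherwise
it's defined to 0):

#ifdef CONFIG_AMD_MEM_ENCRYPT
	movq	sme_me_mask(%rip), %rax
#else
	xorq	%rax, %rax
#endif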

> +
> +	addq	$(init_top_pgt - __START_KERNEL_map), %rax
>  1:
>  
>   /* Enable PAE mode, PGE and LA57 */

Thanks,

tglx


Re: [PATCH v7 10/36] x86/mm: Provide general kernel support for memory encryption

2017-06-21 Thread Thomas Gleixner
On Fri, 16 Jun 2017, Tom Lendacky wrote:
>  
> +#ifndef pgprot_encrypted
> +#define pgprot_encrypted(prot)   (prot)
> +#endif
> +
> +#ifndef pgprot_decrypted

That looks wrong. It's not decrypted, it's rather unencrypted, right?

Thanks,

tglx


Re: [PATCH v7 07/36] x86/mm: Don't use phys_to_virt in ioremap() if SME is active

2017-06-21 Thread Thomas Gleixner
On Fri, 16 Jun 2017, Tom Lendacky wrote:
> Currently there is a check if the address being mapped is in the ISA
> range (is_ISA_range()), and if it is then phys_to_virt() is used to
> perform the mapping.  When SME is active, however, this will result
> in the mapping having the encryption bit set when it is expected that
> an ioremap() should not have the encryption bit set. So only use the
> phys_to_virt() function if SME is not active.
> 
> Reviewed-by: Borislav Petkov 
> Signed-off-by: Tom Lendacky 
> ---
>  arch/x86/mm/ioremap.c |7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
> index 4c1b5fd..a382ba9 100644
> --- a/arch/x86/mm/ioremap.c
> +++ b/arch/x86/mm/ioremap.c
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -106,9 +107,11 @@ static void __iomem *__ioremap_caller(resource_size_t 
> phys_addr,
>   }
>  
>   /*
> -  * Don't remap the low PCI/ISA area, it's always mapped..
> +  * Don't remap the low PCI/ISA area, it's always mapped.
> +  *   But if SME is active, skip this so that the encryption bit
> +  *   doesn't get set.
>*/
> - if (is_ISA_range(phys_addr, last_addr))
> + if (is_ISA_range(phys_addr, last_addr) && !sme_active())
>   return (__force void __iomem *)phys_to_virt(phys_addr);

More thoughts about that.

Making this conditional on !sme_active() is not the best idea. I'd rather
remove that whole thing and make it unconditional so the code paths
always get exercised and any subtle wreckage is detected on a broader base and
not only on that hard to access and debug SME capable machine owned by Joe
User.
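
IOW, rather than adding the !sme_active() check, something like the
below (untested, for illustration only):

--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@
-	/*
-	 * Don't remap the low PCI/ISA area, it's always mapped.
-	 */
-	if (is_ISA_range(phys_addr, last_addr))
-		return (__force void __iomem *)phys_to_virt(phys_addr);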

Thanks,

tglx


Re: [PATCH v7 08/36] x86/mm: Add support to enable SME in early boot processing

2017-06-21 Thread Thomas Gleixner
On Wed, 21 Jun 2017, Tom Lendacky wrote:
> On 6/21/2017 2:16 AM, Thomas Gleixner wrote:
> > Why is this an unconditional function? Isn't the mask simply 0 when the MEM
> > ENCRYPT support is disabled?
> 
> I made it unconditional because of the call from head_64.S. I can't make
> use of the C level static inline function and since the mask is not a
> variable if CONFIG_AMD_MEM_ENCRYPT is not configured (#defined to 0) I
> can't reference the variable directly.
> 
> I could create a #define in head_64.S that changes this to load rax with
> the variable if CONFIG_AMD_MEM_ENCRYPT is configured or a zero if it's
> not or add a #ifdef at that point in the code directly. Thoughts on
> that?

See below.

> > That does not make any sense. Neither the call to sme_encrypt_kernel() nor
> > the following call to sme_get_me_mask().
> > 
> > __startup_64() is already C code, so why can't you simply call that from
> > __startup_64() in C and return the mask from there?
> 
> I was trying to keep it explicit as to what was happening, but I can
> move those calls into __startup_64().

That's much preferred. And the return value wants to be documented in both
C and ASM code.

> I'll still need the call to sme_get_me_mask() in the secondary_startup_64
> path, though (depending on your thoughts to the above response).

call verify_cpu

movq	$(init_top_pgt - __START_KERNEL_map), %rax

So if you make that:

/*
 * Sanitize CPU configuration and retrieve the modifier
 * for the initial pgdir entry which will be programmed
 * into CR3. Depends on enabled SME encryption, normally 0.
 */
call __startup_secondary_64

addq	$(init_top_pgt - __START_KERNEL_map), %rax

You can hide that stuff in C-code nicely without adding any cruft to the
ASM code.
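
The C side then is no more than (sketch):

unsigned long __init __startup_secondary_64(void)
{
	/*
	 * Return the memory encryption mask, which becomes the modifier
	 * for the initial CR3 value. It's 0 when SME is not active.
	 */
	return sme_get_me_mask();
}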

Thanks,

tglx


Re: [PATCH v7 08/36] x86/mm: Add support to enable SME in early boot processing

2017-06-21 Thread Thomas Gleixner
On Wed, 21 Jun 2017, Tom Lendacky wrote:
> On 6/21/2017 10:38 AM, Thomas Gleixner wrote:
> > /*
> >  * Sanitize CPU configuration and retrieve the modifier
> >  * for the initial pgdir entry which will be programmed
> >  * into CR3. Depends on enabled SME encryption, normally 0.
> >  */
> > call __startup_secondary_64
> > 
>> >  addq	$(init_top_pgt - __START_KERNEL_map), %rax
> > 
> > You can hide that stuff in C-code nicely without adding any cruft to the
> > ASM code.
> > 
> 
> Moving the call to verify_cpu into the C-code might be quite a bit of
> change.  Currently, the verify_cpu code is included code and not a
> global function.

Ah. Ok. I missed that.

> I can still do the __startup_secondary_64() function and then look to
> incorporate verify_cpu into both __startup_64() and
> __startup_secondary_64() as a post-patch to this series.

Yes, just having __startup_secondary_64() for now and there the extra bits
for that encryption stuff is fine.

> At least the secondary path will have a base C routine to which
> modifications can be made in the future if needed.  How does that sound?

Sounds like a plan.


Re: [PATCH v10 00/38] x86: Secure Memory Encryption (AMD)

2017-07-18 Thread Thomas Gleixner
On Mon, 17 Jul 2017, Tom Lendacky wrote:
> This patch series provides support for AMD's new Secure Memory Encryption 
> (SME)
> feature.
> 
> SME can be used to mark individual pages of memory as encrypted through the
> page tables. A page of memory that is marked encrypted will be automatically
> decrypted when read from DRAM and will be automatically encrypted when
> written to DRAM. Details on SME can found in the links below.
> 
> The SME feature is identified through a CPUID function and enabled through
> the SYSCFG MSR. Once enabled, page table entries will determine how the
> memory is accessed. If a page table entry has the memory encryption mask set,
> then that memory will be accessed as encrypted memory. The memory encryption
> mask (as well as other related information) is determined from settings
> returned through the same CPUID function that identifies the presence of the
> feature.
> 
> The approach that this patch series takes is to encrypt everything possible
> starting early in the boot where the kernel is encrypted. Using the page
> table macros the encryption mask can be incorporated into all page table
> entries and page allocations. By updating the protection map, userspace
> allocations are also marked encrypted. Certain data must be accounted for
> as having been placed in memory before SME was enabled (EFI, initrd, etc.)
> and accessed accordingly.
> 
> This patch series is a pre-cursor to another AMD processor feature called
> Secure Encrypted Virtualization (SEV). The support for SEV will build upon
> the SME support and will be submitted later. Details on SEV can be found
> in the links below.

Well done series. Thanks to all people involved, especially Tom and Boris!
It was a pleasure to review that.

Reviewed-by: Thomas Gleixner 


Re: amd-iommu/x2apic: sleeping function called from invalid context

2017-07-26 Thread Thomas Gleixner
On Tue, 25 Jul 2017, Artem Savkov wrote:

> Hi,
> 
> Commit 1c3c5ea "sched/core: Enable might_sleep() and smp_processor_id()
> checks early" seem to have uncovered an issue with amd-iommu/x2apic.
> 
> Starting with that commit the following warning started to show up on AMD
> systems during boot:
 
> [0.16] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747

> [0.16]  mutex_lock_nested+0x1b/0x20 
> [0.16]  register_syscore_ops+0x1d/0x70 
> [0.16]  state_next+0x119/0x910 
> [0.16]  iommu_go_to_state+0x29/0x30 
> [0.16]  amd_iommu_enable+0x13/0x23 
> [0.16]  irq_remapping_enable+0x1b/0x39 
> [0.16]  enable_IR_x2apic+0x91/0x196 
> [0.16]  default_setup_apic_routing+0x16/0x6e 
> [0.16]  native_smp_prepare_cpus+0x257/0x2d5 

Yep, that's clearly stupid. The completely untested patch below should cure
the issue.

Thanks,

tglx

8<---   

--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -2440,7 +2440,6 @@ static int __init state_next(void)
break;
case IOMMU_ACPI_FINISHED:
early_enable_iommus();
-   register_syscore_ops(&amd_iommu_syscore_ops);
x86_platform.iommu_shutdown = disable_iommus;
init_state = IOMMU_ENABLED;
break;
@@ -2559,6 +2558,8 @@ static int __init amd_iommu_init(void)
for_each_iommu(iommu)
iommu_flush_all_caches(iommu);
}
+   } else {
+   register_syscore_ops(&amd_iommu_syscore_ops);
}
 
return ret;




Re: [PATCH] iommu/amd: Fix schedule-while-atomic BUG in initialization code

2017-07-26 Thread Thomas Gleixner
On Wed, 26 Jul 2017, Joerg Roedel wrote:
> Yes, that should fix it, but I think it's better to just move the
> register_syscore_ops() call to a later initialization step, like in the
> patch below. I tested it and will queue it to my iommu/fixes branch.

Fair enough. Acked-by-me.



Re: [PATCH] x86: enable swiotlb for > 4GiG ram on 32-bit kernels

2018-10-14 Thread Thomas Gleixner
On Sun, 14 Oct 2018, Christoph Hellwig wrote:

> We already build the swiotlb code for 32-bit kernels with PAE support,
> but the code to actually use swiotlb has only been enabled for 64-bit
> kernels for an unknown reason.
> 
> Before Linux 4.18 we papered over this fact because the networking code,
> the scsi layer and some random block drivers implemented their own
> bounce buffering schemes.
> 
> Fixes: 21e07dba ("scsi: reduce use of block bounce buffers")
> Fixes: ab74cfeb ("net: remove the PCI_DMA_BUS_IS_PHYS check in illegal_highdma")
> Reported-by: tedheadster 
> Tested-by: tedheadster 

I'll add your SOB when picking this up :)

> ---
>  arch/x86/kernel/pci-swiotlb.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
> index 661583662430..71c0b01d93b1 100644
> --- a/arch/x86/kernel/pci-swiotlb.c
> +++ b/arch/x86/kernel/pci-swiotlb.c
> @@ -42,10 +42,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
>  int __init pci_swiotlb_detect_4gb(void)
>  {
>   /* don't initialize swiotlb if iommu=off (no_iommu=1) */
> -#ifdef CONFIG_X86_64
>   if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
>   swiotlb = 1;
> -#endif
>  
>   /*
>* If SME is active then swiotlb will be set to 1 so that bounce
> -- 
> 2.19.1
> 
> 


Re: [PATCH] x86: enable swiotlb for > 4GiG ram on 32-bit kernels

2018-10-14 Thread Thomas Gleixner
On Sun, 14 Oct 2018, Christoph Hellwig wrote:

> On Sun, Oct 14, 2018 at 10:13:31AM +0200, Thomas Gleixner wrote:
> > On Sun, 14 Oct 2018, Christoph Hellwig wrote:
> > 
> > > We already build the swiotlb code for 32-bit kernels with PAE support,
> > > but the code to actually use swiotlb has only been enabled for 64-bit
> > > kernels for an unknown reason.
> > > 
> > > Before Linux 4.18 we papered over this fact because the networking code,
> > > the scsi layer and some random block drivers implemented their own
> > > bounce buffering schemes.
> > > 
> > > Fixes: 21e07dba ("scsi: reduce use of block bounce buffers")

Please use the first 12 characters of the commit SHA for fixes tags in the
future, as documented. No need to resend, I fixed it up for you and added a
Cc: stable as well.

Thanks,

tglx


Re: [PATCH] x86: enable swiotlb for > 4GiG ram on 32-bit kernels

2018-10-17 Thread Thomas Gleixner
On Tue, 16 Oct 2018, tedheadster wrote:
> On Sun, Oct 14, 2018 at 3:52 AM Christoph Hellwig  wrote:
> >
> > We already build the swiotlb code for 32-bit kernels with PAE support,
> > but the code to actually use swiotlb has only been enabled for 64-bit
> > kernels for an unknown reason.
> >
> > Before Linux 4.18 we papered over this fact because the networking code,
> > the scsi layer and some random block drivers implemented their own
> > bounce buffering schemes.
> >
> > Fixes: 21e07dba ("scsi: reduce use of block bounce buffers")
> > Fixes: ab74cfeb ("net: remove the PCI_DMA_BUS_IS_PHYS check in illegal_highdma")
> > Reported-by: tedheadster 
> > Tested-by: tedheadster 
> > ---
> >
> 
> Christoph,
>   this fix has caused performance to decrease dramatically. Kernel
> builds that used to take 10-15 minutes are now taking 45-60 minutes on
> the same machine.

Christoph, can you have a look please?

Thanks,

tglx


Re: [PATCH v3 5/7] x86/dma/amd-gart: Stop resizing dma_debug_entry pool

2018-12-10 Thread Thomas Gleixner
On Mon, 10 Dec 2018, Robin Murphy wrote:

> dma-debug is now capable of adding new entries to its pool on-demand if
> the initial preallocation was insufficient, so the IOMMU_LEAK logic no
> longer needs to explicitly change the pool size. This does lose it the
> ability to save a couple of megabytes of RAM by reducing the pool size
> below its default, but it seems unlikely that that is a realistic
> concern these days (or indeed that anyone is actively debugging AGP
> drivers' DMA usage any more). Getting rid of dma_debug_resize_entries()
> will make room for further streamlining in the dma-debug code itself.
> 
> Removing the call reveals quite a lot of cruft which has been useless
> for nearly a decade since commit 19c1a6f5764d ("x86 gart: reimplement
> IOMMU_LEAK feature by using DMA_API_DEBUG"), including the entire
> 'iommu=leak' parameter, which controlled nothing except whether
> dma_debug_resize_entries() was called or not.
> 
> CC: Thomas Gleixner 
> CC: Ingo Molnar 
> CC: Borislav Petkov 
> CC: "H. Peter Anvin" 
> CC: x...@kernel.org
> Reviewed-by: Christoph Hellwig 
> Signed-off-by: Robin Murphy 

Acked-by: Thomas Gleixner 


Re: [PATCH V3 1/3] x86/Hyper-V: Set x2apic destination mode to physical when x2apic is available

2019-02-10 Thread Thomas Gleixner
On Thu, 7 Feb 2019, lantianyu1...@gmail.com wrote:

> From: Lan Tianyu 
> 
> Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic,
> set x2apic destination mode to physical mode when x2apic is available
> and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs have
> 8-bit APIC id.

This looks good now. Can that be applied independently of the IOMMU stuff,
or should this go together? If the latter:

   Reviewed-by: Thomas Gleixner 

If not, I'll just queue it for 5.1. Let me know,

Thanks,

tglx


Re: [PATCH 7/7] x86: remove the x86_dma_fallback_dev hack

2019-03-23 Thread Thomas Gleixner
On Thu, 21 Mar 2019, Christoph Hellwig wrote:

Please change the subject to:

 x86/dma: Remove the x86_dma_fallback_dev hack

> Now that we removed support for the NULL device argument in the DMA API,
> there is no need to cater for that in the x86 code.

> Signed-off-by: Christoph Hellwig 
> ---
>  arch/x86/include/asm/dma-mapping.h | 10 --
>  arch/x86/kernel/amd_gart_64.c  |  6 --
>  arch/x86/kernel/pci-dma.c  | 20 
>  kernel/dma/mapping.c   |  7 ---
>  4 files changed, 43 deletions(-)

Feel free to route that through your DMA tree.

Reviewed-by: Thomas Gleixner 


Re: [PATCH] treewide: Rename "unencrypted" to "decrypted"

2020-03-17 Thread Thomas Gleixner
Borislav Petkov  writes:

> On Tue, Mar 17, 2020 at 01:35:12PM -0700, Dave Hansen wrote:
>> On 3/17/20 4:18 AM, Borislav Petkov wrote:
>> > Back then when the whole SME machinery started getting mainlined, it
>> > was agreed that for simplicity, clarity and sanity's sake, the terms
>> > denoting encrypted and not-encrypted memory should be "encrypted" and
>> > "decrypted". And the majority of the code sticks to that convention
>> > except those two. So rename them.
>> 
>> Don't "unencrypted" and "decrypted" mean different things?
>> 
>> Unencrypted to me means "encryption was never used for this data".
>> 
>> Decrypted means "this was/is encrypted but here is a plaintext copy".
>
> Maybe but linguistical semantics is not the point here.
>
> The idea is to represent a "binary" concept of memory being encrypted
> or memory being not encrypted. And at the time we decided to use
> "encrypted" and "decrypted" for those two things.
>
> Do you see the need to differentiate a third "state", so to speak, of
> memory which was never encrypted?

I think so.

encrypted data is something you can't use without having the key

decrypted data is the plaintext copy of something encrypted, so
it might be of a sensitive nature.

unencrypted data can still be sensitive, but nothing ever bothered to
encrypt it in the first place.

So having this distinction is useful in terms of setting the context
straight.

Thanks,

tglx


Re: [PATCH -v2] treewide: Rename "unencrypted" to "decrypted"

2020-03-19 Thread Thomas Gleixner
Borislav Petkov  writes:

> On Thu, Mar 19, 2020 at 11:06:15AM +, Robin Murphy wrote:
>> Let me add another vote from a native English speaker that "unencrypted" is
>> the appropriate term to imply the *absence* of encryption, whereas
>> "decrypted" implies the *reversal* of applied encryption.
>> 
>> Naming things is famously hard, for good reason - names are *important* for
>> understanding. Just because a decision was already made one way doesn't mean
>> that that decision was necessarily right. Churning one area to be
>> consistently inaccurate just because it's less work than churning another
>> area to be consistently accurate isn't really the best excuse.
>
> Well, the reason we chose "decrypted" vs something else is so to be as
> different from "encrypted" as possible. If we called it "unencrypted"
> you'd have stuff like:
>
>if (force_dma_unencrypted(dev))
> set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);

TBH, I don't see how

if (force_dma_decrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
   
makes more sense than the above. Both are non-sensical unless there is
a big fat comment explaining what this is about.

Thanks,

tglx



Re: [PATCH -v2] treewide: Rename "unencrypted" to "decrypted"

2020-03-19 Thread Thomas Gleixner
Borislav Petkov  writes:
> On Thu, Mar 19, 2020 at 06:25:49PM +0100, Thomas Gleixner wrote:
>> TBH, I don't see how
>> 
>>  if (force_dma_decrypted(dev))
>>  set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
>>
>> makes more sense than the above. It's both non-sensical unless there is
>
> 9087c37584fb ("dma-direct: Force unencrypted DMA under SME for certain DMA masks")

Reading the changelog again...

I have to say that force_dma_unencrypted() makes way more sense in that
context than force_dma_decrypted(). It still wants a comment.

Linguistic semantics and correctness matter a lot. Consistency is
required as well, but not for the price of ambiguous wording.

Thanks,

tglx




Re: [PATCH 1/7] docs: x86: Add a documentation for ENQCMD

2020-04-26 Thread Thomas Gleixner
Fenghua Yu  writes:

s/Add a documentation/Add documentation/

> From: Ashok Raj 
>
> ENQCMD and Data Streaming Accelerator (DSA) and all of their associated
> features are a complicated stack with lots of interconnected pieces.
> This documentation provides a big picture overview for all of the
> features.
>
> Signed-off-by: Ashok Raj 
> Co-developed-by: Fenghua Yu 
> Signed-off-by: Fenghua Yu 
> Reviewed-by: Tony Luck 
> ---
>  Documentation/x86/enqcmd.rst | 185 +++

How is that hooked up into the Documentation index?

 Documentation/x86/enqcmd.rst: WARNING: document isn't included in any toctree

> +++ b/Documentation/x86/enqcmd.rst
> @@ -0,0 +1,185 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +Improved Device Interaction Overview

So the document is about ENQCMD, right? Can you please make that in some
way consistently named?

> +
> +== Background ==

This lacks any docbook formatting. The resulting HTML looks like ...

> +
> +Shared Virtual Addressing (SVA) allows the processor and device to use the
> +same virtual addresses avoiding the need for software to translate virtual
> +addresses to physical addresses. ENQCMD is a new instruction on Intel
> +platforms that allows user applications to directly notify hardware of new
> +work, much like doorbells are used in some hardware, but carries a payload
> +that carries the PASID and some additional device specific commands
> +along with it.

Sorry that's not background information, that's an agglomeration of
words.

Can you please explain properly what's the background of SVA, how it
differs from regular device addressing and what kind of requirements it
has?

ENQCMD is not related to background. It's part of the new technology.

> +== Address Space Tagging ==
> +
> +A new MSR (MSR_IA32_PASID) allows an application address space to be
> +associated with what the PCIe spec calls a Process Address Space ID
> +(PASID). This PASID tag is carried along with all requests between
> +applications and devices and allows devices to interact with the process
> +address space.

Sigh. The important part here is not the MSR. The important part is to
explain what PASID is and where it comes from. Documentation has similar
rules as changelogs:

  1) Provide context

  2) Explain requirements
  
  3) Explain implementation

The pile you provided is completely backwards and unstructured.

Thanks,

tglx


Re: [PATCH 2/7] x86/cpufeatures: Enumerate ENQCMD and ENQCMDS instructions

2020-04-26 Thread Thomas Gleixner
Fenghua Yu  writes:
> A user space application can execute ENQCMD instruction to submit work
> to device. The kernel executes ENQCMDS instruction to submit work to
> device.

So a user space application _can_ execute ENQCMD and the kernel
executes ENQCMDS. And both submit work to device.
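
For reference (my reading of the SDM, not something this changelog
explains): the enumeration itself boils down to a single CPUID bit,
roughly CPUID.(EAX=7,ECX=0):ECX[29], i.e. something like:

/* word 16 = CPUID.7.0:ECX; bit position assumed from the SDM */
#define X86_FEATURE_ENQCMD	(16*32 + 29)

	if (boot_cpu_has(X86_FEATURE_ENQCMD))
		pr_info("ENQCMD/ENQCMDS enumerated\n");
		/* usability needs the rest of this series */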

> There is a lot of other enabling needed for the instructions to actually
> be usable in user space and the kernel, and that enabling is coming later
> in the series and in device drivers.

That's important information to the enumeration of the instructions in
which way?

Thanks,

tglx


Re: [PATCH 3/7] x86/fpu/xstate: Add supervisor PASID state for ENQCMD feature

2020-04-26 Thread Thomas Gleixner
Fenghua Yu  writes:
> From: Yu-cheng Yu 
>
> The IA32_PASID MSR is used when a task submits work via the ENQCMD
> instruction.

Is used?

> The per task MSR is stored in the task's supervisor FPU

per task MSR? Lots of MSRs?

> PASID state and is context switched by XSAVES/XRSTORS.
>


Re: [PATCH 4/7] x86/msr-index: Define IA32_PASID MSR

2020-04-26 Thread Thomas Gleixner
Fenghua Yu  writes:

> The IA32_PASID MSR (0xd93) contains the Process Address Space Identifier
> (PASID), a 20-bit value. Bit 31 must be set to indicate the value
> programmed in the MSR is valid. Hardware uses PASID to identify which
> process submits the work and direct responses to the right process.

No. It does not identify the process. It identifies the process' address
space as the name says.

Please provide coherent and precise information.
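
The precise statement is simple enough (sketch; macro names as used
later in this series):

	/*
	 * MSR 0xd93: bits 19:0 carry the PASID, which tags the process
	 * address space, bit 31 is the valid bit.
	 */
	wrmsrl(MSR_IA32_PASID, (pasid & GENMASK_ULL(19, 0)) |
			       MSR_IA32_PASID_VALID);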

Thanks,

tglx


Re: [PATCH 5/7] x86/mmu: Allocate/free PASID

2020-04-26 Thread Thomas Gleixner
Fenghua Yu  writes:

> PASID is shared by all threads in a process. So the logical place to keep
> track of it is in the "mm". Add the field to the architecture specific
> mm_context_t structure.
>
> A PASID is allocated for an "mm" the first time any thread attaches
> to an SVM capable device. Later device atatches (whether to the same

atatches?

> device or another SVM device) will re-use the same PASID.
>
> The PASID is freed when the process exits (so no need to keep
> reference counts on how many SVM devices are sharing the PASID).

I'm not buying that. If there is an outstanding request with the PASID
of a process then tearing down the process address space and freeing the
PASID (which might be reused) is fundamentally broken.

> +void __free_pasid(struct mm_struct *mm);
> +
>  #endif /* _ASM_X86_IOMMU_H */
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index bdeae9291e5c..137bf51f19e6 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -50,6 +50,10 @@ typedef struct {
>   u16 pkey_allocation_map;
>   s16 execute_only_pkey;
>  #endif
> +
> +#ifdef CONFIG_INTEL_IOMMU_SVM
> + int pasid;

int? It's a value which gets programmed into the MSR along with the
valid bit (bit 31) set. 

>  extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> index d7f2a5358900..da718a49e91e 100644
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -226,6 +226,45 @@ static LIST_HEAD(global_svm_list);
>   list_for_each_entry((sdev), &(svm)->devs, list) \
>   if ((d) != (sdev)->dev) {} else
>  
> +/*
> + * If this mm already has a PASID we can use it. Otherwise allocate a new 
> one.
> + * Let the caller know if we did an allocation via 'new_pasid'.
> + */
> +static int alloc_pasid(struct intel_svm *svm, struct mm_struct *mm,
> +int pasid_max,  bool *new_pasid, int flags)

Again, data types please. flags are generally unsigned and not plain
int. Also pasid_max is certainly not plain int either.

> +{
> + int pasid;
> +
> + /*
> +  * Reuse the PASID if the mm already has a PASID and not a private
> +  * PASID is requested.
> +  */
> + if (mm && mm->context.pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
> + /*
> +  * Once a PASID is allocated for this mm, the PASID
> +  * stays with the mm until the mm is dropped. Reuse
> +  * the PASID which has been already allocated for the
> +  * mm instead of allocating a new one.
> +  */
> + ioasid_set_data(mm->context.pasid, svm);

So if the PASID is reused several times for different SVMs then every
time ioasid_data->private is set to a different SVM. How is that
supposed to work?

> + *new_pasid = false;
> +
> + return mm->context.pasid;
> + }
> +
> + /*
> +  * Allocate a new pasid. Do not use PASID 0, reserved for RID to
> +  * PASID.
> +  */
> + pasid = ioasid_alloc(NULL, PASID_MIN, pasid_max - 1, svm);

ioasid_alloc() uses ioasid_t which is

typedef unsigned int ioasid_t;

Can we please have consistent types and behaviour all over the place?

> + if (pasid == INVALID_IOASID)
> + return -ENOSPC;
> +
> + *new_pasid = true;
> +
> + return pasid;
> +}
> +
>  int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct 
> svm_dev_ops *ops)
>  {
>   struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
> @@ -324,6 +363,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
> flags, struct svm_dev_
>   init_rcu_head(&sdev->rcu);
>  
>   if (!svm) {
> + bool new_pasid;
> +
>   svm = kzalloc(sizeof(*svm), GFP_KERNEL);
>   if (!svm) {
>   ret = -ENOMEM;
> @@ -335,15 +376,13 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, 
> int flags, struct svm_dev_
>   if (pasid_max > intel_pasid_max_id)
>   pasid_max = intel_pasid_max_id;
>  
> - /* Do not use PASID 0, reserved for RID to PASID */
> - svm->pasid = ioasid_alloc(NULL, PASID_MIN,
> -   pasid_max - 1, svm);
> - if (svm->pasid == INVALID_IOASID) {
> + svm->pasid = alloc_pasid(svm, mm, pasid_max, &new_pasid, flags);
> + if (svm->pasid < 0) {
>   kfree(svm);
>   kfree(sdev);
> - ret = -ENOSPC;

ret gets magically initialized to an error return value, right?

>   goto out;
>   }
> +
>   svm->notifier.ops = &intel_mmuops;
>   svm->mm = mm;
>   svm->flags = flags;
> @@ -353,7 +392,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
> flags, struct svm_dev_
>   if (mm) {
>   

Re: [PATCH 6/7] x86/traps: Fix up invalid PASID

2020-04-26 Thread Thomas Gleixner
Fenghua Yu  writes:
> A #GP fault is generated when ENQCMD instruction is executed without
> a valid PASID value programmed in.

Programmed in what?

> The #GP fault handler will initialize the current thread's PASID MSR.
>
> The following heuristic is used to avoid decoding the user instructions
> to determine the precise reason for the #GP fault:
> 1) If the mm for the process has not been allocated a PASID, this #GP
>cannot be fixed.
> 2) If the PASID MSR is already initialized, then the #GP was for some
>other reason
> 3) Try initializing the PASID MSR and returning. If the #GP was from
>an ENQCMD this will fix it. If not, the #GP fault will be repeated
>and we will hit case "2".
>
> Suggested-by: Thomas Gleixner 

Just for the record I also suggested to have a proper errorcode in the
#GP for ENQCMD and I surely did not suggest to avoid decoding the user
instructions.

>  void __free_pasid(struct mm_struct *mm);
> +bool __fixup_pasid_exception(void);
>  
>  #endif /* _ASM_X86_IOMMU_H */
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 6ef00eb6fbb9..369b5ba94635 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -56,6 +56,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #ifdef CONFIG_X86_64
>  #include 
> @@ -488,6 +489,16 @@ static enum kernel_gp_hint get_kernel_gp_address(struct 
> pt_regs *regs,
>   return GP_CANONICAL;
>  }
>  
> +static bool fixup_pasid_exception(void)
> +{
> + if (!IS_ENABLED(CONFIG_INTEL_IOMMU_SVM))
> + return false;
> + if (!static_cpu_has(X86_FEATURE_ENQCMD))
> + return false;
> +
> + return __fixup_pasid_exception();
> +}
> +
>  #define GPFSTR "general protection fault"
>  
>  dotraplinkage void do_general_protection(struct pt_regs *regs, long 
> error_code)
> @@ -499,6 +510,12 @@ dotraplinkage void do_general_protection(struct pt_regs 
> *regs, long error_code)
>   int ret;
>  
>   RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> +
> + if (user_mode(regs) && fixup_pasid_exception()) {
> + cond_local_irq_enable(regs);

The point of this conditional irq enable _AFTER_ calling into the fixup
function is? Also what's the reason for keeping interrupts disabled
while calling into that function? Comments exist for a reason.

> + return;
> + }
> +
>   cond_local_irq_enable(regs);
>  
>   if (static_cpu_has(X86_FEATURE_UMIP)) {
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> index da718a49e91e..5ed39a022adb 100644
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -759,3 +759,40 @@ void __free_pasid(struct mm_struct *mm)
>*/
>   ioasid_free(pasid);
>  }
> +
> +/*
> + * Fix up the PASID MSR if possible.
> + *
> + * But if the #GP was due to another reason, a second #GP might be triggered
> + * to force proper behavior.
> + */
> +bool __fixup_pasid_exception(void)
> +{
> + struct mm_struct *mm;
> + bool ret = true;
> + u64 pasid_msr;
> + int pasid;
> +
> + mm = get_task_mm(current);

Why do you need a reference to current->mm ?

> + /* This #GP was triggered from user mode. So mm cannot be NULL. */
> + pasid = mm->context.pasid;
> + /* Ensure this process has been bound to a PASID. */
> + if (!pasid) {
> + ret = false;
> + goto out;
> + }
> +
> + /* Check to see if the PASID MSR has already been set for this task. */
> + rdmsrl(MSR_IA32_PASID, pasid_msr);
> + if (pasid_msr & MSR_IA32_PASID_VALID) {
> + ret = false;
> + goto out;
> + }
> +
> + /* Fix up the MSR. */
> + wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID);
> +out:
> + mmput(mm);

Thanks,

tglx


Re: [PATCH 5/7] x86/mmu: Allocate/free PASID

2020-04-27 Thread Thomas Gleixner
Fenghua Yu  writes:
> On Sun, Apr 26, 2020 at 04:55:25PM +0200, Thomas Gleixner wrote:
>> Fenghua Yu  writes:
>> > + +#ifdef CONFIG_INTEL_IOMMU_SVM + int pasid;
>> 
>> int? It's a value which gets programmed into the MSR along with the valid 
>> bit (bit 31) set.
>
> The pasid is defined as "int" in struct intel_svm and in 
> intel_svm_bind_mm() and intel_svm_unbind_mm(). So the pasid defined in this 
> patch follows the same type defined in those places.

Which are wrong to begin with.

>> ioasid_alloc() uses ioasid_t which is
>> 
>> typedef unsigned int ioasid_t;
>> 
>> Can we please have consistent types and behaviour all over the place?
>
> Should I just define "pasid", "pasid_max", "flags" as "unsigned int" for
> the new functions/code?
>
> Or should I also change their types to "unsigned int" in the original
> svm code (struct intel_svm, ...bind_mm(), etc)? I'm afraid that will be
> a lot of changes and should be in a separate preparation patch.

Yes, please. The existence of non-sensical code is not an excuse to
proliferate it.

Thanks,

tglx


Re: [PATCH 6/7] x86/traps: Fix up invalid PASID

2020-04-27 Thread Thomas Gleixner
Fenghua Yu  writes:
> On Sun, Apr 26, 2020 at 05:25:06PM +0200, Thomas Gleixner wrote:
>> > @@ -499,6 +510,12 @@ dotraplinkage void do_general_protection(struct 
>> > pt_regs *regs, long error_code)
>> >int ret;
>> >  
>> >RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
>> > +
>> > +  if (user_mode(regs) && fixup_pasid_exception()) {
>> > +  cond_local_irq_enable(regs);
>> 
>> The point of this conditional irq enable _AFTER_ calling into the fixup
>> function is? Also what's the reason for keeping interrupts disabled
>> while calling into that function? Comments exist for a reason.
>
> irq needs to be disabled because the fixup function requires preemption
> to be disabled in order to update the PASID MSR on the faulting CPU.

No, that's just wrong. It's not about the update itself.

> Will add comments here.

Factual ones and not some fairy tales please.

>> > +bool __fixup_pasid_exception(void)
>> > +{
>> > +  struct mm_struct *mm;
>> > +  bool ret = true;
>> > +  u64 pasid_msr;
>> > +  int pasid;
>> > +
>> > +  mm = get_task_mm(current);
>> 
>> Why do you need a reference to current->mm ?
>
> The PASID for the address space is per mm and is stored in mm.
> To get the PASID, we need to get the mm and the
> pasid=mm->context.pasid.

It's obvious that you need to access current-mm in order to check
current->mm->context.pasid. Let me rephrase the question:

   Why do you need to take a reference on current->mm ?

Thanks,

tglx



RE: [PATCH 6/7] x86/traps: Fix up invalid PASID

2020-04-27 Thread Thomas Gleixner
"Luck, Tony"  writes:
>> Just for the record I also suggested to have a proper errorcode in the
>> #GP for ENQCMD and I surely did not suggest to avoid decoding the user
>> instructions.
>
> Is the heuristic to avoid decoding the user instructions OK (you are just 
> pointing
> out that you should not be given credit for this part of the idea)?

I surely suggested the approach, but at the same time I asked for the
error code and did not say that instruction checking needs to be
avoided.

This comment was just to make it clear that there were other options
discussed. IOW, the changelog should have some explicit explanations
why:

 - the error code idea does not work (according to HW folks)

 - the instruction decoding has no real benefit because $REASONS

Thanks,

tglx




Re: [PATCH 6/7] x86/traps: Fix up invalid PASID

2020-04-27 Thread Thomas Gleixner
Ashok,

"Raj, Ashok"  writes:
> On Sun, Apr 26, 2020 at 05:25:06PM +0200, Thomas Gleixner wrote:
>> Just for the record I also suggested to have a proper errorcode in the
>> #GP for ENQCMD and I surely did not suggest to avoid decoding the user
>> instructions.
>
> We certainly discussed the possibility of adding an error code to
> identify #GP due to ENQCMD with our HW architects.
>
> There are only a few cases that have an error code, like move to segment
> with an invalid value for instance. There were a few but I don't
> recall the entire list.
>
> Since the error code is 0 in most places, there isn't plumbing in hw to return
> this value in all cases. It appeared that due to some uarch reasons it
> wasn't as simple as it appears to /me sw kinds :-)

Sigh.

> So after some internal discussion we decided to take the current
> approach. It's possible that if the #GP was due to some other reason
> we might #GP another time. Since this wasn't a perf or speed path we took
> this lazy approach.

I know that the HW people's mantra is that everything can be fixed in
software and therefore slapping new features into the CPUs can be done
without thinking about the consequences.

But we all know from painful experience that this is fundamentally wrong
unless there is a really compelling reason.

For new features there is absolutely no reason at all.

Can HW people pretty please understand that hardware and software have
to be co-designed and not dictated by 'some uarch reasons'. This is
nothing fundamentally new. This problem existed 30+ years ago, is well
documented and has been ignored forever. I'm tired of that, really.

But as this seems to be unsolvable for the problem at hand can you
please document the inability, unwillingness or whatever in the
changelog?

The question why this brand new ENQCMD + invalid PASID induced #GP does
not generate a useful error code and needs heuristics to be dealt with
is pretty obvious.

Thanks,

tglx


Re: [PATCH 5/7] x86/mmu: Allocate/free PASID

2020-04-28 Thread Thomas Gleixner
"Jacob Pan (Jun)"  writes:
> On Sun, 26 Apr 2020 16:55:25 +0200
> Thomas Gleixner  wrote:
>> Fenghua Yu  writes:
>> > The PASID is freed when the process exits (so no need to keep
>> > reference counts on how many SVM devices are sharing the PASID).  
>> 
>> I'm not buying that. If there is an outstanding request with the PASID
>> of a process then tearing down the process address space and freeing
>> the PASID (which might be reused) is fundamentally broken.
>> 
> Device driver unbind PASID is tied to FD release. So when a process
> exits, FD close causes driver to do the following:
>
> 1. stops DMA
> 2. unbind PASID (clears the PASID entry in IOMMU, flush all TLBs, drain
> in flight page requests)

Fair enough. Explaining that somewhere might be helpful.

> For bare metal SVM, if the last mmdrop always happens after FD release,
> we can ensure no outstanding requests at the point of ioasid_free().
> Perhaps this is a wrong assumption?

If fd release cleans up then how should there be something in flight at
the final mmdrop?

> For guest SVM, there will be more users of a PASID. I am also
> working on adding refcounting to ioasid. ioasid_free() will not release
> the PASID back to the pool until all references are dropped.

What does more users mean?

>> > +  if (mm && mm->context.pasid && !(flags &
>> > SVM_FLAG_PRIVATE_PASID)) {
>> > +  /*
>> > +   * Once a PASID is allocated for this mm, the PASID
>> > +   * stays with the mm until the mm is dropped. Reuse
>> > +   * the PASID which has been already allocated for
>> > the
>> > +   * mm instead of allocating a new one.
>> > +   */
>> > +  ioasid_set_data(mm->context.pasid, svm);  
>> 
>> So if the PASID is reused several times for different SVMs then every
>> time ioasid_data->private is set to a different SVM. How is that
>> supposed to work?
>> 
> For the lifetime of the mm, there is only one PASID. svm_bind/unbind_mm
> could happen many times with different private data: intel_svm.
> Multiple devices can bind to the same PASID as well. But the private data
> doesn't change between the first bind and the last unbind.

Ok. I read through that spaghetti of intel_svm_bind_mm() again and now I
start to get an idea how that is supposed to work. What a mess.

That function really wants to be restructured in a way that makes it
understandable to mere mortals.

Thanks,

tglx


Re: [PATCH v2 01/22] PCI/MSI: Clean up struct msi_chip argument

2014-09-25 Thread Thomas Gleixner
On Thu, 25 Sep 2014, Thierry Reding wrote:

> On Thu, Sep 25, 2014 at 11:14:11AM +0800, Yijing Wang wrote:
> > The msi_chip functions setup_irq/teardown_irq rarely use the msi_chip
> > argument.
> 
> That's not true. Out of the four drivers that you modify two use the
> parameter. And the two that don't probably should be using it too.
> 
> 50% is not "rarely". =)
> 
> >   We can look up the msi_chip pointer via the device pointer
> > or the irq number, so drop the msi_chip argument.
> 
> I don't like this particular change. The idea was to keep the API object
> oriented so that drivers wouldn't have to know where to get the MSI chip
> from. It also makes it more resilient against code reorganizations since
> the core code is the only place that needs to know where to get the chip
> from.

Right. We have the same thing in the irq_chip callbacks. All of them
take "struct irq_data", because it's already available in the core
code and it gives easy access to all information (chip, chipdata ...)
which is necessary for the callback implementations.
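
A minimal sketch of that pattern (the chip data struct, register offset
and base pointer are made up for illustration):

static void demo_irq_mask(struct irq_data *d)
{
	/* chip private data travels with irq_data, no extra lookups */
	struct demo_chip_data *cd = irq_data_get_irq_chip_data(d);

	writel(BIT(irqd_to_hwirq(d)), cd->base + DEMO_MASK_REG);
}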

Thanks,

tglx


Re: [PATCH v2 06/22] PCI/MSI: Introduce weak arch_find_msi_chip() to find MSI chip

2014-09-25 Thread Thomas Gleixner
On Thu, 25 Sep 2014, Yijing Wang wrote:

> Introduce a weak arch_find_msi_chip() to find the matching msi_chip.
> Currently, the MSI code associates a pci bus with its msi_chip, because
> on ARM platforms there may be more than one MSI controller in the system.
> Associating a pci bus with an msi_chip helps a pci device find the matching
> msi_chip and set up MSI/MSI-X irqs correctly. But on other platforms,
> like x86, we only need one MSI chip, because all devices use
> the same MSI address/data and irq etc. So there is no need to associate
> a pci bus with an MSI chip; just use an arch function, arch_find_msi_chip(),
> to return the MSI chip for simplicity. The default weak
> arch_find_msi_chip(), used on ARM platforms, finds the MSI chip
> by pci bus.

This is really backwards. On one hand you try to get rid of the weak
arch functions zoo and then you invent new ones for no good
reason. Why can't x86 store the chip in the pci bus?

Looking deeper, I'm questioning the whole notion of different
msi_chips. Are these really different MSI controllers with different
feature sets or are these de facto multiple instances of the same
controller which just need a different data set?

Thanks,

tglx


Re: [PATCH v2 06/22] PCI/MSI: Introduce weak arch_find_msi_chip() to find MSI chip

2014-09-26 Thread Thomas Gleixner
On Fri, 26 Sep 2014, Yijing Wang wrote:
> On 2014/9/25 18:38, Thomas Gleixner wrote:
> > On Thu, 25 Sep 2014, Yijing Wang wrote:
> > 
> >> Introduce a weak arch_find_msi_chip() to find the matching msi_chip.
> >> Currently, the MSI code associates a pci bus with its msi_chip, because
> >> on ARM platforms there may be more than one MSI controller in the system.
> >> Associating a pci bus with an msi_chip helps a pci device find the matching
> >> msi_chip and set up MSI/MSI-X irqs correctly. But on other platforms,
> >> like x86, we only need one MSI chip, because all devices use
> >> the same MSI address/data and irq etc. So there is no need to associate
> >> a pci bus with an MSI chip; just use an arch function, arch_find_msi_chip(),
> >> to return the MSI chip for simplicity. The default weak
> >> arch_find_msi_chip(), used on ARM platforms, finds the MSI chip
> >> by pci bus.
> > 
> > This is really backwards. On one hand you try to get rid of the weak
> > arch functions zoo and then you invent new ones for no good
> > reason. Why can't x86 store the chip in the pci bus?
> > 
> > Looking deeper, I'm questioning the whole notion of different
> > msi_chips. Are these really different MSI controllers with different
> > feature sets or are these de facto multiple instances of the same
> > controller which just need a different data set?
> 
> The MSI chip in this series is there to set up MSI irqs, including IRQ
> allocation, mapping, composing the MSI msg ...; on different platforms
> there are many arch-specific MSI irq details in it.
> It's difficult to extract the common data and code.
> 
> I have a plan to rework the MSI related irq_chips in the kernel, PCI and
> non-PCI. Currently HPET, DMAR and PCI have their own irq_chip and MSI
> related functions, write_msi_msg(), mask_msi_irq(), etc... I want to
> extract the common data set for that, so we can remove lots of
> unnecessary MSI code.

That makes sense. Can you please make sure that this does not conflict
with the ongoing work Jiang is doing in the x86 irq area with
hierarchical irqdomains to disentangle layered levels like MSI from the
underlying vector/irqremap mechanics.

Thanks,

tglx


Re: [PATCH v3 0/5] enhance DMA CMA on x86

2014-09-30 Thread Thomas Gleixner
On Tue, 30 Sep 2014, Peter Hurley wrote:
> I read the UFS Unified Memory Extension v1.0 (JESD220-1) specification and
> it is not clear to me that using DMA mapping is the right approach to
> supporting UM, at least on x86.
> 
> And without a mainline user, the merits of this approach are not evident.
> I cannot even find a production x86 UFS controller, much less one that
> supports UME.
> 
> The only PCI UFS controller I could find (and that mainline supports) is
> Samsung's x86 FPGA-based test unit for developing UFS devices in a x86 test
> environment, and not a production x86 design.

And how is that relevant? That device exists and you have no reason to
deny it support just because you are not interested in it.
 
> Unless there's something else I've missed, I don't think these patches
> belong in mainline.

You missed that there is no reason WHY such a device should not be
supported in mainline.

> Samsung's own roadmap
> (http://www.slideshare.net/linaroorg/next-gen-mobilestorageufs)
> mentions nothing about bringing UFS to x86 designs.

And that's telling you what? 

   - That we should deny Samsung proper support for their obviously
 x86 based test card

   - That we should ignore a JEDEC Standard which is obviously never
 going to hit x86 land just because you decide it?

Your argumentation is just ass backwards. Linux wants to support the
full zoo of hardware including this particular PCI card. Period.

Whether the proposed patchset is the correct solution to support it is
a completely different question.

So either you stop this right now and help Akinobu to find the proper
solution or you just go back in your uncontaminated x86 cave and STFU.

Thanks,

tglx


Re: [PATCH v3 0/5] enhance DMA CMA on x86

2014-10-01 Thread Thomas Gleixner
On Tue, 30 Sep 2014, Peter Hurley wrote:
> On 09/30/2014 07:45 PM, Thomas Gleixner wrote:
> > Whether the proposed patchset is the correct solution to support it is
> > a completely different question.
> 
> This patchset has been in mainline since 3.16 and has already caused
> regressions, so the question of whether this is the correct solution has
> already been answered.

Agreed.
 
> > So either you stop this right now and help Akinobu to find the proper
> > solution 
> 
> If this is only a test platform for ARM parts then I don't think it
> unreasonable to suggest forking x86 swiotlb support into an iommu=cma
> selector that gets DMA mapping working for this test platform and doesn't
> cause a bunch of breakage.

Breakage is not acceptable in any case.
 
> Which is different than if the plan is to ship production units for x86;
> then a general purpose solution will be required.
> 
> As to the good design of a general purpose solution for allocating and
> mapping huge order pages, you are certainly more qualified to help Akinobu
> than I am.

Fair enough. Still this does not make the case for outright rejecting
the idea of supporting that kind of device even if it is an esoteric
case. We deal with enough esoteric hardware in Linux and, if done
right, it harms no one.

I'll have a look at the technical details.

Thanks,

tglx


Re: [Patch Part2 v3 15/24] x86, MSI: Use hierarchy irqdomain to manage MSI interrupts

2014-10-28 Thread Thomas Gleixner
On Tue, 28 Oct 2014, Jiang Liu wrote:
> +static int msi_set_affinity(struct irq_data *data, const struct cpumask 
> *mask,
> + bool force)
> +{
> + struct irq_data *parent = data->parent_data;
> + int ret;
>  
> - msg.data &= ~MSI_DATA_VECTOR_MASK;
> - msg.data |= MSI_DATA_VECTOR(cfg->vector);
> - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
> - msg.address_lo |= MSI_ADDR_DEST_ID(dest);
> + ret = parent->chip->irq_set_affinity(parent, mask, force);
> + /* No need to reprogram MSI registers if interrupt is remapped */
> + if (ret >= 0 && !msi_irq_remapped(data)) {
> + struct msi_msg msg;
>  
> - __write_msi_msg(data->msi_desc, &msg);
> + __get_cached_msi_msg(data->msi_desc, &msg);
> + msi_update_msg(&msg, data);
> + __write_msi_msg(data->msi_desc, &msg);
> + }

I'm not too happy about the msi_irq_remapped() conditional here. It
violates the whole concept of domain stacking somewhat.

A better separation would be to add a callback to the irq chip:

void (*irq_write_msi_msg)(struct irq_data *data, struct msi_desc 
*msi_desc, bool cached);

and change this code to:

if (ret >= 0)
parent->chip->irq_write_msi_msg(parent, data->msi_desc, true);
  
> - return IRQ_SET_MASK_OK_NOCOPY;
> + return ret;
>  }

And do the same here:

> +static int msi_domain_activate(struct irq_domain *domain,
> +struct irq_data *irq_data)
> +{
> + struct msi_msg msg;
> + struct irq_cfg *cfg = irqd_cfg(irq_data);
> +
> + /*
> +  * irq_data->chip_data is MSI/MSIx offset.
> +  * MSI-X message is written per-IRQ, the offset is always 0.
> +  * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
> +  */
> + if (irq_data->chip_data)
> + return 0;

parent->chip->irq_write_msi_msg(parent, data->msi_desc, false); 


> + if (msi_irq_remapped(irq_data))
> + irq_remapping_get_msi_entry(irq_data->parent_data, &msg);
> + else
> + native_compose_msi_msg(NULL, irq_data->irq, cfg->dest_apicid,
> +&msg, 0);
> + write_msi_msg(irq_data->irq, &msg);
> +
> + return 0;
> +}

And here:

> +static int msi_domain_deactivate(struct irq_domain *domain,
> +  struct irq_data *irq_data)
> +{
> + struct msi_msg msg;
> +
> + if (irq_data->chip_data)
> + return 0;
> +
> + memset(&msg, 0, sizeof(msg));
> + write_msi_msg(irq_data->irq, &msg);

parent->chip->irq_write_msi_msg(parent, NULL, false);

> + return 0;
> +}

And let the vector and the remapping domain deal with it in their callbacks.
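
To make that division of labour concrete, the two parent side
implementations could look roughly like this (a sketch against the callback
proposed above; the helpers are the ones quoted in this patch, the function
names are illustrative):

/* vector domain: the message really has to be composed and written */
static void vector_irq_write_msi_msg(struct irq_data *data,
				     struct msi_desc *desc, bool cached)
{
	struct msi_msg msg;

	if (cached) {
		__get_cached_msi_msg(desc, &msg);
		msi_update_msg(&msg, data);
	} else {
		native_compose_msi_msg(NULL, data->irq,
				       irqd_cfg(data)->dest_apicid, &msg, 0);
	}
	__write_msi_msg(desc, &msg);
}

/* remap domain: the routing lives in the IRTE, the message is stable */
static void remap_irq_write_msi_msg(struct irq_data *data,
				    struct msi_desc *desc, bool cached)
{
	struct msi_msg msg;

	if (cached)
		return;		/* affinity changes only touch the IRTE */

	irq_remapping_get_msi_entry(data, &msg);
	write_msi_msg(data->irq, &msg);
}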

> @@ -166,25 +264,59 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc 
> *msidesc,
>  
>  int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
>  {
> - struct msi_desc *msidesc;
> - int irq, ret;
> + int irq, cnt, nvec_pow2;
> + struct irq_domain *domain;
> + struct msi_desc *msidesc, *iter;
> + struct irq_alloc_info info;
> + int node = dev_to_node(&dev->dev);
>  
> - /* Multiple MSI vectors only supported with interrupt remapping */
> - if (type == PCI_CAP_ID_MSI && nvec > 1)
> - return 1;
> + if (disable_apic)
> + return -ENOSYS;
>  
> - list_for_each_entry(msidesc, &dev->msi_list, list) {
> - irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL);
> + init_irq_alloc_info(&info, NULL);
> + info.msi_dev = dev;
> + if (type == PCI_CAP_ID_MSI) {
> + msidesc = list_first_entry(&dev->msi_list,
> +struct msi_desc, list);
> + WARN_ON(!list_is_singular(&dev->msi_list));
> + WARN_ON(msidesc->irq);
> + WARN_ON(msidesc->msi_attrib.multiple);
> + WARN_ON(msidesc->nvec_used);
> + info.type = X86_IRQ_ALLOC_TYPE_MSI;
> + cnt = nvec;
> + } else {
> + info.type = X86_IRQ_ALLOC_TYPE_MSIX;
> + cnt = 1;
> + }

We have a similar issue here.

> + domain = irq_remapping_get_irq_domain(&info);

We add domain specific knowledge to the MSI implementation. Not
necessary at all.

Again MSI is not an x86 problem and we really can move most of that to
the core code. The above sanity checks and the distinction between MSI
and MSIX can be handled in the core code. And every domain involved in
the MSI chain would need an alloc_msi() callback.

So native_setup_msi_irqs() would boil down to:
+ {
+   if (disable_apic)
+   return -ENOSYS;
+ 
+   return irq_domain_alloc_msi(msi_domain, dev, nvec, type);   
+ }

Now that core function performs the sanity checks for the MSI case. In
fact it should not proceed when a warning condition is detected. Not a
x86 issue at all, it's true for every MSI implementation.

Then it calls down the domain allocation chain. x86_msi_domain would
simply hand down to the parent domain. That would either be the remap
domain or the vector domain.

Thanks,

tglx

Re: [Patch Part2 v3 15/24] x86, MSI: Use hierarchy irqdomain to manage MSI interrupts

2014-10-29 Thread Thomas Gleixner
On Wed, 29 Oct 2014, Jiang Liu wrote:
> On 2014/10/29 5:37, Thomas Gleixner wrote:
> > On Tue, 28 Oct 2014, Jiang Liu wrote:
> >> +static int msi_set_affinity(struct irq_data *data, const struct cpumask 
> >> *mask,
> >> +  bool force)
> >> +{
> >> +  struct irq_data *parent = data->parent_data;
> >> +  int ret;
> >>  
> >> -  msg.data &= ~MSI_DATA_VECTOR_MASK;
> >> -  msg.data |= MSI_DATA_VECTOR(cfg->vector);
> >> -  msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
> >> -  msg.address_lo |= MSI_ADDR_DEST_ID(dest);
> >> +  ret = parent->chip->irq_set_affinity(parent, mask, force);
> >> +  /* No need to reprogram MSI registers if interrupt is remapped */
> >> +  if (ret >= 0 && !msi_irq_remapped(data)) {
> >> +  struct msi_msg msg;
> >>  
> >> -  __write_msi_msg(data->msi_desc, &msg);
> >> +  __get_cached_msi_msg(data->msi_desc, &msg);
> >> +  msi_update_msg(&msg, data);
> >> +  __write_msi_msg(data->msi_desc, &msg);
> >> +  }
> > 
> > I'm not too happy about the msi_irq_remapped() conditional here. It
> > violates the whole concept of domain stacking somewhat.
> > 
> > A better separation would be to add a callback to the irq chip:
> > 
> > void (*irq_write_msi_msg)(struct irq_data *data, struct msi_desc 
> > *msi_desc, bool cached);
> > 
> > and change this code to:
> > 
> > if (ret >= 0)
> > parent->chip->irq_write_msi_msg(parent, data->msi_desc, true);
> >   
> Hi Thomas,
>   Thanks for your great suggestion and I have worked out a draft
> patch to achieve what you want:)
>   I have made following changes to irq core to get rid of remapped
> irq logic from msi.c:
> 1) Add IRQ_SET_MASK_OK_DONE in addition to IRQ_SET_MASK_OK and
> IRQ_SET_MASK_OK_NOCOPY. IRQ_SET_MASK_OK_DONE is the same as
> IRQ_SET_MASK_OK for the irq core and indicates to a stacked irqchip that
> the parent irqchips have done all the work and any handling in descendant
> irqchips can be skipped.
> 2) Add int (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg
> *msg) into struct irq_chip. I'm still hesitating between returning void
> and int here. By returning void, irq_chip_compose_msi_msg() will be simpler,
> but it loses flexibility.

void should be sufficient. If the chip advertises this function it
had better provide a proper msi msg :)
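
With a void chip callback, the core side consumer can then be a plain
parent walk; something like this (a sketch of a possible
irq_chip_compose_msi_msg(), not the final code):

int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	/* the first chip in the hierarchy which can compose wins */
	for (; data; data = data->parent_data) {
		if (data->chip && data->chip->irq_compose_msi_msg) {
			data->chip->irq_compose_msi_msg(data, msg);
			return 0;
		}
	}
	return -ENOSYS;
}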
 
> With the above changes to the core, we could remove all knowledge of irq
> remapping from msi.c and the irq remapping interfaces get simpler too.
> Please refer to following patch for details. The patch passes basic
> booting tests with irq remapping enabled. If it's OK, I will fold
> it into the patch set.

Yes. That looks reasonable. 
 
> IOAPIC runs into the same situation, but I need some more time
> to find a solution:)

I'm sure you will!

Thanks,

tglx


Re: [Patch Part2 v3 15/24] x86, MSI: Use hierarchy irqdomain to manage MSI interrupts

2014-10-30 Thread Thomas Gleixner
On Thu, 30 Oct 2014, Jiang Liu wrote:
> On 2014/10/29 17:19, Thomas Gleixner wrote:
> >> IOAPIC runs into the same situation, but I need some more time
> >> to find a solution:)
> > 
> > I'm sure you will!
> Hi Thomas,
>   I have reviewed IOAPIC related change again, but the conclusion
> may not be what you expect:(
>   Currently IOAPIC driver detects IRQ remapping for following
> tasks:
> 1) Issue different EOI commands to the IOAPIC chip for unmapped and remapped
>interrupts. It uses pin number instead of vector number for remapped
>interrupts.
> 2) Print different format for IOAPIC entries for unmapped and remapped
>interrupts.
> 3) ioapic_ack_level() uses different method for unmapped and remapped
>interrupts
> 4) create different IOAPIC entry content for unmapped and remapped
>interrupts
> 5) choose different flow handler for unmapped and remapped interrupts

So today's code does

1/2/3 via irq_remap_modify_chip_defaults() and via
  x86_io_apic_ops.eoi_ioapic_pin with convoluted back and forth
  calls from remap to ioapic code.

4) is solved via x86_io_apic_ops.setup_entry

5) via setup_remapped_irq()

Now with the stacked irq domains we end up with the following two
scenarios:

ioapic_domain -> vector_domain

ioapic_domain -> remap_domain -> vector_domain

So if you look at the various issues you want to solve:

#1 is simple to solve via:

static void ioapic_eoi(struct irq_data *data)
{
	if (data->parent_data->chip->irq_eoi)
		data->parent_data->chip->irq_eoi(data->parent_data);
	else
		plain_ioapic_eoi(data);
}

#2/3 Ditto

#4/5 Should be solvable via the irq_startup callback in a similar way

static int ioapic_startup(struct irq_data *data)
{
	if (data->parent_data->chip->irq_startup)
		return data->parent_data->chip->irq_startup(data->parent_data);
	else
		return plain_ioapic_startup(data);
}

I.e. you set the entry and the handler from the startup function of
ioapic or remap.

It's probably not as simple as the above, but I'm pretty confident
that we can map it without adding a gazillion new callbacks to the
irqchip.

> For MSI, it only needs to solve task 4) above. For IOAPIC, it needs
> to solve all five tasks above, which may cause big changes to irq_chip.
> And it may even cause the IRQ remapping driver to call back into the IOAPIC
> driver, which breaks another rule: only the driver touches the
> corresponding interrupt controller.

If the remap driver calls ioapic functions which are provided for that
purpose then I think that's unavoidable and ok. But I really want to
avoid the intermingled mess in the other code paths which call back
and forth.
 
> On the other hand, MSI is almost platform independent, so it's
> reasonable to change the public struct irq_chip to support MSI.

Right, but I think we can get away without adding new functions to
irqchip for the ioapic/remap thing.

Thanks,

tglx



Re: [Patch Part2 v3 15/24] x86, MSI: Use hierarchy irqdomain to manage MSI interrupts

2014-10-31 Thread Thomas Gleixner
On Fri, 31 Oct 2014, Jiang Liu wrote:
> On 2014/10/29 5:37, Thomas Gleixner wrote:
> > Then it calls down the domain allocation chain. x86_msi_domain would
> > simply hand down to the parent domain. That would either be the remap
> > domain or the vector domain.
> The issue here is that the hierarchical irqdomain code maintains a tree
> topology and every irqdomain only supports one parent.
> 
> In case of irq remapping, we need to build one irqdomain for each IOMMU
> unit to support hotplug and simplify the implementation. So we need to
> build one MSI irqdomain for each IOMMU unit too instead of using a
> common MSI irqdomain.

That indeed makes a difference.
 
> Current design is that, a common MSI irqdomain to support all MSI when
> irq remapping is disabled, and one MSI irqdomain for each IOMMU unit
> when irq remapping is enabled.

> So we have the code below to choose the correct irqdomain for MSI.
> domain = irq_remapping_get_irq_domain(&info);
> if (domain == NULL)
> domain = msi_default_domain;
> if (domain == NULL)
> return -ENOSYS;

Right. I guess we need to keep it that way for now.

But looking at the code makes me wonder why we actually need to call
into the remap code and do a list walk to figure out the domain. The
association of device and iommu should be known at startup/hotplug
time already. That's out of the scope of this work, but should be
fixed eventually.
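
One way to get rid of the list walk would be to resolve the domain once at
device add/hotplug time and cache it; purely illustrative, the msi_domain
member in struct device is an assumption here:

static void pci_set_msi_domain(struct pci_dev *pdev)
{
	struct irq_alloc_info info;

	init_irq_alloc_info(&info, NULL);
	info.msi_dev = pdev;
	/* hypothetical: cache the resolved domain in struct device */
	pdev->dev.msi_domain = irq_remapping_get_irq_domain(&info);
}

Allocation would then degenerate into a plain pointer lookup, with
msi_default_domain as the fallback.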

Thanks,

tglx






Re: several messages

2014-11-10 Thread Thomas Gleixner
On Mon, 10 Nov 2014, Feng Wu wrote:

> VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
> With VT-d Posted-Interrupts enabled, external interrupts from
> direct-assigned devices can be delivered to guests without VMM
> intervention when the guest is running in non-root mode.

Can you please talk to Jiang and synchronize your work with his
refactoring of the x86 interrupt handling subsystem.

I want this stuff cleaned up first before we add new stuff to it.

Thanks,

tglx


Re: [PATCH 1/3] PCI/MSI: Initial hook for archs to declare multivector MSI support

2014-11-23 Thread Thomas Gleixner
On Fri, 21 Nov 2014, Alex Williamson wrote:
> For the most part multivector MSI is not supported and drivers and
> hardware wanting multiple vectors opt for MSI-X instead.  It seems
> though that having the ability to query the arch/platform code to
> determine whether allocating multiple MSI vectors will ever succeed
> is a useful thing.  For instance, vfio-pci can use this to determine
> whether to expose multiple MSI vectors to the user.  If we know we
> cannot ever support more than one vector, we have a better shot at
> the userspace driver working, especially if it's a guest OS, if we
> only expose one vector as being available in the interface.
> 
> Signed-off-by: Alex Williamson 
> ---
> 
>  drivers/pci/msi.c   |5 +
>  include/linux/msi.h |1 +
>  2 files changed, 6 insertions(+)
> 
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 9fab30a..36b503a 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -79,6 +79,11 @@ int __weak arch_setup_msi_irqs(struct pci_dev *dev, int 
> nvec, int type)
>   return 0;
>  }
>  
> +bool __weak arch_supports_multivector_msi(struct pci_dev *dev)

Please not another weak arch function. We are in the process of reducing
them, not extending them.

arch_supports is pretty much wrong here anyway. We are moving away
from arch MSI dependencies simply because it's not an arch property per
se.

Multi MSI is a property of the underlying interrupt controllers and
there might be several MSI interrupt domains on a given system. They
may or may not support multi MSI.

The current x86 implementation is a tangled maze and Jiang is in the
process of disentangling it completely. Until that's done, x86 is not going
to add new ad hoc interfaces.

Once we have converted everything over to the new hierarchical irqdomains
we can add such an interface to the code, but for now I'm not
accepting anything like that into x86 msi related code.

Thanks,

tglx





Re: [PATCH 0/5] Fix Intel IRQ remapping initialization order

2014-12-15 Thread Thomas Gleixner
On Mon, 15 Dec 2014, Jiang Liu wrote:
> On 2014/12/15 23:13, Joerg Roedel wrote:
> > Hi,
> > 
> > here is a patch-set against tip/x86/apic to fix an initialization order
> > problem with the IRQ remapping code.  The problem is in the ordering of
> > the irq_remapping_prepare and irq_remapping_supported functions.
> > 
> > Currently the call-order is irq_remapping_prepare ->
> > irq_remapping_supported, so that 'prepare' can succeed but 'supported'
> > fails, so that interrupt remapping gets initialized but not enabled.
> > This causes a broken interrupt setup on affected systems (machines with
> > an Intel IOMMU without, or broken, IRQ remapping support). The result
> > are lost interrupts and a non-bootable system.
> > 
> > Both functions do checks whether IRQ remapping can be enabled on the
> > machine.  The reason for this is that some checks rely on
> > dmar_table_init() and thus have to be done in irq_remapping_prepare().
> > 
> > This patch-set moves all these checks into the irq_remapping_prepare()
> > path with the right ordering and removes the irq_remapping_supported()
> > function and its call-backs. This fixes the initialization order problem
> > and simplifies the exported API from the IOMMU code.
> > 
> > Please review.
> Hi Joerg,
>   I have posted a patch set for the same purpose at:
> https://lkml.org/lkml/2014/12/10/20
>   Seems we need to combine these two patch sets:)

Actually you want to combine it also with these patches:

326c2bb2c526: iommu/vt-d: Convert allocations to GFP_KERNEL
e9220e591375: iommu/vt-d: Move iommu preparatory allocations to irq_remap_ops.prepare
e88edbd316ea: iommu, x86: Restructure setup of the irq remapping feature
dd60143c04f2: x86, smpboot: Remove pointless preempt_disable() in native_smp_prepare_cpus()

against 3.19 independent of the irqdomain stuff.

So that would be a clean base to put the rest of the irqdomain and
hotplug stuff on top.

So the ordering of the patches for 3.20 would become:

   iommu cleanup (init and allocations)
   acpi cleanup and parser extensions
   ioapic hotplug
   irqdomain conversion

I will route dd60143c04f2 "x86, smpboot: Remove pointless
preempt_disable() in native_smp_prepare_cpus()" into -rc1.  I'm going
to do so for a few other cherry-picks from x86/apic.

So can you please create a combined series which just deals with the
init cleanup and the allocation conversion (ATOMIC -> GFP)? Basing it on
the current Linus tree should be fine.

Thanks,

tglx


Re: [Patch v2 07/16] x86/apic: Refine enable_IR_x2apic() and related functions

2015-01-15 Thread Thomas Gleixner
On Wed, 7 Jan 2015, Jiang Liu wrote:

> Refine enable_IR_x2apic() and related functions for better readability.
> 
> It also changes the way IR in XAPIC mode is handled when enabling X2APIC.
> Previously it just skipped X2APIC initialization without checking the max
> CPU APIC ID in the system, which may cause problems if the system has a CPU
> with an APIC ID bigger than 255. So treat IR in XAPIC mode the same as IR
> being disabled when enabling CPU X2APIC.

Please don't do that. This wants to be two patches:

   1) reordering the code

   2) changing the operation

I've split it already, so no need to resend.

Thanks,

tglx


Re: Advice on oops - memory trap on non-memory access instruction (invalid CR2?)

2019-10-14 Thread Thomas Gleixner
On Mon, 14 Oct 2019, Guilherme G. Piccoli wrote:
> Modules linked in: <...>
> CPU: 40 PID: 78274 Comm: qemu-system-x86 Tainted: P W  OE

Tainted: P - Proprietary module loaded ...

Try again without that module.

Tainted: W - Warning issued before

Are you sure that that warning is harmless and unrelated?

> 4.4.0-45-generic #66~14.04.1-Ubuntu

Does the same problem happen with a not so dead kernel? CR2 handling got
quite some updates/fixes since then.

Thanks,

tglx




Re: [Patch V2 1/4] iommu/vt-d: replace *hdr with hdr[0] in struct dmar_drhd_unit

2016-03-20 Thread Thomas Gleixner
On Sun, 20 Mar 2016, Wei Yang wrote:

> hdr in struct dmar_drhd_unit is used to point to the DMAR hardware unit
> copied to the end of struct dmar_drhd_unit. A zero-sized array may be more
> elegant for this purpose.

You forgot to tell us why.
 
> This patch replaces *hdr with hdr[0] in struct dmar_drhd_unit.
> 
> Besides this, this patch includes two other changes:
> 1. remove an unnecessary type cast in dmar_table_detect()

Again. Why is it not necessary?

> 2. type cast from acpi_dmar_header to acpi_dmar_hardware_unit directly

Don't even think about doing that. container_of() is there for a reason.

Your change works today, because the embedded structure is the first one in
the containing structure. If the containing structure gets reordered later,
the whole thing will explode in hard to debug ways.

Even if such a reordering is unlikely in that ACPI case, we just don't do
that. It's bad and sloppy coding style. The generated code is the same.
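
To spell out the difference (types as in the DMAR parsing code):

	struct acpi_dmar_hardware_unit *drhd;

	/* robust: finds the container no matter where 'header' sits */
	drhd = container_of(hdr, struct acpi_dmar_hardware_unit, header);

	/* fragile: only correct while 'header' is the first member */
	drhd = (struct acpi_dmar_hardware_unit *)hdr;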

Thanks,

tglx


Re: [Patch V2 1/4] iommu/vt-d: replace *hdr with hdr[0] in struct dmar_drhd_unit

2016-03-21 Thread Thomas Gleixner
On Mon, 21 Mar 2016, Wei Yang wrote:
> On Sun, Mar 20, 2016 at 04:42:29PM +0100, Thomas Gleixner wrote:
> >On Sun, 20 Mar 2016, Wei Yang wrote:
> >
> >> hdr in struct dmar_drhd_unit is used to point to the DMAR hardware unit
> >> copied to the end of struct dmar_drhd_unit. A zero-sized array may be more
> >> elegant for this purpose.
> >
> >You forgot to tell us why. 
> > 
> 
> One possible benefit is to save some space.
> 
> Before commit 6b1972493a84 "iommu/vt-d: Implement DMAR unit hotplug
> framework", dmaru->hdr just pointed to the memory region of the DMA
> remapping hardware definition. In that case, it makes no difference where
> we put hdr.
> 
> After this commit, the DMA remapping hardware definition is copied and
> attached to the end of the dmaru structure. By replacing the pointer with a
> zero-sized array, we save some space in this structure.

Sure and exactly that explanation should be in the changelog. Not some
handwaving "may be more elegant". We don't care about elegance, we care about
correctness.
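
For the record, the two layouts under discussion look like this (a cut-down
illustration, not the real struct definition):

	/* before: hdr points at a copy placed after the struct */
	struct dmar_drhd_unit {
		struct list_head	list;
		struct acpi_dmar_header	*hdr;
	};

	/* after: the copy is the tail of the struct itself */
	struct dmar_drhd_unit {
		struct list_head	list;
		struct acpi_dmar_header	hdr[0];	/* '[]' in modern style */
	};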
 
> >> This patch replaces *hdr with hdr[0] in struct dmar_drhd_unit.
> >> 
> >> Besides this, this patch includes two other changes:
> >> 1. remove an unnecessary type cast in dmar_table_detect()
> >
> >Again. Why is it not necessary?
> >
> 
> The type of the second parameter of acpi_get_table_with_size() is "struct
> acpi_table_header **", and the type of dmar_tbl is "struct
> acpi_table_header *".
> 
> So without the type cast, the argument type already matches the function
> definition.

That's the information which a changelog wants to have, because otherwise a
reviewer is forced to look up the prototypes.

So a simple:

   "Remove redundant type case to same type"

would have told me what you are doing.
 
> >> 2. type cast from acpi_dmar_header to acpi_dmar_hardware_unit directly
> >
> >Don't even think about doing that. container_of() is there for a reason.
> >
> >Your change works today, because the embedded structure is the first one in
> >the containing structure. If the containing structure gets reordered later,
> >the whole thing will explode in hard to debug ways.
> >
> >Even if such a reordering is unlikely in that ACPI case, we just don't do
> >that. It's bad and sloppy coding style. The generated code is the same.
> >
> 
> Yes, I agree with you that making this change requires care, but I
> think the original usage of container_of() is not necessary.

It's not necessary, but it is correct. Removing it is just putting assumptions
into the code, which is never a good idea.

> Literally, it converts "struct acpi_dmar_header" to "struct
> acpi_dmar_hardware_unit", because the first one is the element "header" of
> the second one. But look at how dmaru->hdr is initialized in
> dmar_parse_one_drhd(): we copy the memory of "struct acpi_dmar_hardware_unit"
> to the region dmaru->hdr points to. So the code itself already implies that
> "struct acpi_dmar_header" is the first element of "struct acpi_dmar_hardware_unit".

Which is wrong to begin with. Assumptions about first elements are crap.

> Otherwise, we can't do this memcpy in dmar_parse_one_drhd().
 
> BTW, I am thinking of changing the type of dmaru->hdr from "struct
> acpi_dmar_header *" to "struct acpi_dmar_hardware_unit *". That way the
> code would be more self-explanatory and wouldn't need to cast back and forth.

Yes, that makes sense.

Thanks,

tglx


Re: [PATCH v7 1/8] genirq/msi: Add a new MSI_FLAG_IRQ_REMAPPING flag

2016-04-22 Thread Thomas Gleixner
On Fri, 22 Apr 2016, Eric Auger wrote:
> Robin,
> On 04/22/2016 01:02 PM, Robin Murphy wrote:
> > Hi Eric,
> > 
> > On 19/04/16 18:13, Eric Auger wrote:
> >> Let's introduce a new msi_domain_info flag value, MSI_FLAG_IRQ_REMAPPING
> >> meant to tell that the domain supports IRQ REMAPPING, also known as Interrupt
> >> Translation Service. On Intel HW this IRQ remapping capability is
> >> abstracted on IOMMU side while on ARM it is abstracted on MSI controller
> >> side. This flag will be used to know whether the MSI passthrough is
> >> safe.
> > 
> > Perhaps a nitpick, but given the earlier confusion about what the IOMMU
> > flag actually meant this prompts me to wonder if it's worth adjusting
> > the general terminology before we propagate it further. What I think we
> > actually care about is that one thing or the other "provides MSI
> > isolation" rather than "supports MSI remapping", since the latter is all
> > too easy to misinterpret the way we did in the SMMU drivers.
> 
> The only concern I have is that https://lkml.org/lkml/2016/4/18/283 attempts
> to define a PCI bus flag dubbed PCI_BUS_FLAGS_MSI_REMAP combining the
> iommu & msi layer info. In that sense x86 people may not be keen on
> having different terminologies. Anyway I will follow the consensus, if any.

Yes, please keep that consistent. It makes 'grep' much more convenient.

Thanks,

tglx


Re: [PATCH v11 03/10] genirq/irq: introduce msi_doorbell_info

2016-07-19 Thread Thomas Gleixner
On Tue, 19 Jul 2016, Eric Auger wrote:
>  
> +/* Describe all the MSI doorbell regions for an irqchip */
> +struct irq_chip_msi_doorbell_info {
> + union {
> + phys_addr_t __percpu *percpu_doorbells;
> + phys_addr_t global_doorbell;
> + };
> + bool doorbell_is_percpu;
> + bool irq_remapping; /* is irq_remapping implemented? */

Please do not use tail comments. Use proper kernel doc for documentation.

> + size_t size;/* size of each doorbell */
> + int prot;   /* iommu protection flag */

Please align the members properly:

union {
phys_addr_t __percpu*percpu_doorbells;
phys_addr_t global_doorbell;
};
booldoorbell_is_percpu;
boolirq_remapping;

> +};
> +
>  /**
>   * struct irq_chip - hardware interrupt chip descriptor
>   *
> @@ -349,6 +361,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct 
> irq_data *d)
>   * @irq_get_irqchip_state:   return the internal state of an interrupt
>   * @irq_set_irqchip_state:   set the internal state of a interrupt
>   * @irq_set_vcpu_affinity:   optional to target a vCPU in a virtual machine
> + * @msi_doorbell_info:   return the MSI doorbell info
>   * @ipi_send_single: send a single IPI to destination cpus
>   * @ipi_send_mask:   send an IPI to destination cpus in cpumask
>   * @flags:   chip specific flags
> @@ -394,7 +407,8 @@ struct irq_chip {
>   int (*irq_set_irqchip_state)(struct irq_data *data, enum 
> irqchip_irq_state which, bool state);
>  
>   int (*irq_set_vcpu_affinity)(struct irq_data *data, void 
> *vcpu_info);
> -
> + struct irq_chip_msi_doorbell_info *(*msi_doorbell_info)(

irq_get_msi_doorbell_info or msi_get_doorbell_info please

> + struct irq_data *data);

No need for a line break here. Please keep it as a single line.

Thanks

tglx


Re: [PATCH v11 04/10] genirq/msi-doorbell: allow MSI doorbell (un)registration

2016-07-19 Thread Thomas Gleixner
On Tue, 19 Jul 2016, Eric Auger wrote:
> +
> +#include 
> +#include 
> +#include 
> +
> +struct irqchip_doorbell {
> + struct irq_chip_msi_doorbell_info info;
> + struct list_head next;

Again, please align the struct members.

> +};
> +
> +static LIST_HEAD(irqchip_doorbell_list);
> +static DEFINE_MUTEX(irqchip_doorbell_mutex);
> +
> +struct irq_chip_msi_doorbell_info *
> +msi_doorbell_register_global(phys_addr_t base, size_t size,
> +  int prot, bool irq_remapping)
> +{
> + struct irqchip_doorbell *db;
> +
> + db = kmalloc(sizeof(*db), GFP_KERNEL);
> + if (!db)
> + return ERR_PTR(-ENOMEM);
> +
> + db->info.doorbell_is_percpu = false;

Please use kzalloc and get rid of the explicit zero initialization. If you
add stuff to the struct then it will automatically be initialized to 0.
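
I.e. something like this (a sketch; field names taken from the patch above):

	db = kzalloc(sizeof(*db), GFP_KERNEL);
	if (!db)
		return ERR_PTR(-ENOMEM);

	/* everything starts out zero; only set what actually differs */
	db->info.global_doorbell = base;
	db->info.size = size;
	db->info.prot = prot;
	db->info.irq_remapping = irq_remapping;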

> +void msi_doorbell_unregister_global(struct irq_chip_msi_doorbell_info 
> *dbinfo)
> +{
> + struct irqchip_doorbell *db, *tmp;
> +
> + mutex_lock(&irqchip_doorbell_mutex);
> + list_for_each_entry_safe(db, tmp, &irqchip_doorbell_list, next) {

Why do you need that iterator? 

db = container_of(dbinfo, struct irqchip_doorbell, info);

Hmm?

> + if (dbinfo == &db->info) {
> + list_del(&db->next);
> + kfree(db);

Please move the kfree() outside of the lock region. It does not matter much
here, but we really should stop doing random crap in locked regions.

Thanks,

tglx


Re: [PATCH v11 05/10] genirq/msi-doorbell: msi_doorbell_pages

2016-07-19 Thread Thomas Gleixner
On Tue, 19 Jul 2016, Eric Auger wrote:
> msi_doorbell_pages sums up the number of iommu pages of a given order

adding () to the function name would make it immediately clear that
msi_doorbell_pages is a function.

> +/**
> + * msi_doorbell_pages: compute the number of iommu pages of size 1 << order
> + * requested to map all the registered doorbells
> + *
> + * @order: iommu page order
> + */

Why are you adding the kernel doc to the header and not to the implementation?

> +int msi_doorbell_pages(unsigned int order);
> +
>  #else
>  
>  static inline struct irq_chip_msi_doorbell_info *
> @@ -47,6 +55,12 @@ msi_doorbell_register_global(phys_addr_t base, size_t size,
>  static inline void
>  msi_doorbell_unregister_global(struct irq_chip_msi_doorbell_info *db) {}
>  
> +static inline int
> +msi_doorbell_pages(unsigned int order)

What's the point of this line break? 

> +{
> + return 0;
> +}
> +
>  #endif /* CONFIG_MSI_DOORBELL */
>  
>  #endif
> diff --git a/kernel/irq/msi-doorbell.c b/kernel/irq/msi-doorbell.c
> index 0ff541e..a5bde37 100644
> --- a/kernel/irq/msi-doorbell.c
> +++ b/kernel/irq/msi-doorbell.c
> @@ -60,3 +60,55 @@ void msi_doorbell_unregister_global(struct 
> irq_chip_msi_doorbell_info *dbinfo)
>   mutex_unlock(&irqchip_doorbell_mutex);
>  }
>  EXPORT_SYMBOL_GPL(msi_doorbell_unregister_global);
> +
> +static int compute_db_mapping_requirements(phys_addr_t addr, size_t size,
> +unsigned int order)
> +{
> + phys_addr_t offset, granule;
> + unsigned int nb_pages;
> +
> + granule = (uint64_t)(1 << order);
> + offset = addr & (granule - 1);
> + size = ALIGN(size + offset, granule);
> + nb_pages = size >> order;
> +
> + return nb_pages;
> +}
> +
> +static int
> +compute_dbinfo_mapping_requirements(struct irq_chip_msi_doorbell_info 
> *dbinfo,
> + unsigned int order)

I'm sure you can find even longer function names which require more line
breaks.

> +{
> + int ret = 0;
> +
> + if (!dbinfo->doorbell_is_percpu) {
> + ret = compute_db_mapping_requirements(dbinfo->global_doorbell,
> +   dbinfo->size, order);
> + } else {
> + phys_addr_t __percpu *pbase;
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + pbase = per_cpu_ptr(dbinfo->percpu_doorbells, cpu);
> + ret += compute_db_mapping_requirements(*pbase,
> +dbinfo->size,
> +order);
> + }
> + }
> + return ret;
> +}
> +
> +int msi_doorbell_pages(unsigned int order)
> +{
> + struct irqchip_doorbell *db;
> + int ret = 0;
> +
> + mutex_lock(&irqchip_doorbell_mutex);
> + list_for_each_entry(db, &irqchip_doorbell_list, next) {

Pointless braces

> + ret += compute_dbinfo_mapping_requirements(&db->info, order);
> + }
> + mutex_unlock(&irqchip_doorbell_mutex);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(msi_doorbell_pages);

So here is a general rant about your naming choices.

   struct irqchip_doorbell
   struct irq_chip_msi_doorbell_info

   struct irq_chip {
  *(*msi_doorbell_info);
   }

   irqchip_doorbell_mutex

   msi_doorbell_register_global
   msi_doorbell_unregister_global

   msi_doorbell_pages

This really sucks. Your public functions start sensibly with msi_doorbell.

Though what is the _global postfix on the register/unregister functions
for? Are there _private functions in the pipeline?

msi_doorbell_pages() is not telling me what it does. msi_calc_doorbell_pages()
would describe it right away.

Your doorbell info structure can really do with:

struct msi_doorbell_info;

And the wrapper struct around it is fine with:

struct msi_doorbell;

Thanks,

tglx



Re: [PATCH v11 06/10] genirq/msi-doorbell: msi_doorbell_safe

2016-07-20 Thread Thomas Gleixner
On Tue, 19 Jul 2016, Eric Auger wrote:
> +bool msi_doorbell_safe(void)
> +{
> + struct irqchip_doorbell *db;
> + bool irq_remapping = true;
> +
> + mutex_lock(&irqchip_doorbell_mutex);
> + list_for_each_entry(db, &irqchip_doorbell_list, next) {
> + irq_remapping &= db->info.irq_remapping;

db->info.irq_remapping is set in msi_doorbell_register(). So you can do the
bookkeeping there. No need to iterate here.

Thanks,

tglx




Re: [PATCH v11 09/10] genirq/msi: map/unmap the MSI doorbells on msi_domain_alloc/free_irqs

2016-07-20 Thread Thomas Gleixner
On Tue, 19 Jul 2016, Eric Auger wrote:
>  /**
> + * msi_handle_doorbell_mappings: in case the irq data corresponds to an
> + * MSI that requires iommu mapping, traverse the irq domain hierarchy
> + * to retrieve the doorbells to handle and iommu_map/unmap them according
> + * to @map boolean.
> + *
> + * @data: irq data handle
> + * @map: mapping if true, unmapping if false
> + */


Please run that through the kernel doc generator. It does not work that way.

The format is:

/**
 * function_name - Short function description
 * @arg1:   Description of arg1
 * @argument2:  Description of argument2
 *
 * Long explanation including documentation of the return values.
 */

> +static int msi_handle_doorbell_mappings(struct irq_data *data, bool map)
> +{
> + const struct irq_chip_msi_doorbell_info *dbinfo;
> + struct iommu_domain *domain;
> + struct irq_chip *chip;
> + struct device *dev;
> + dma_addr_t iova;
> + int ret = 0, cpu;
> +
> + while (data) {
> + dev = msi_desc_to_dev(irq_data_get_msi_desc(data));
> + domain = iommu_msi_domain(dev);
> + if (domain) {
> + chip = irq_data_get_irq_chip(data);
> + if (chip->msi_doorbell_info)
> + break;
> + }
> + data = data->parent_data;
> + }

Please split that out into a separate function:

static const struct irq_chip_msi_doorbell_info *
msi_get_doorbell_info(struct irq_data *data)
{
	while (data) {
		.
		if (chip->msi_doorbell_info)
			return chip->msi_doorbell_info(data);
		data = data->parent_data;
	}
	return NULL;
}

   info = msi_get_doorbell_info(data);
   .

> + if (!data)
> + return 0;
> +
> + dbinfo = chip->msi_doorbell_info(data);
> + if (!dbinfo)
> + return -EINVAL;
> +
> + if (!dbinfo->doorbell_is_percpu) {
> + if (!map) {
> + iommu_msi_put_doorbell_iova(domain,
> + dbinfo->global_doorbell);
> + return 0;
> + }
> + return iommu_msi_get_doorbell_iova(domain,
> +dbinfo->global_doorbell,
> +dbinfo->size, dbinfo->prot,
> +&iova);
> + }

You can save an indentation level with a helper function:

if (!dbinfo->doorbell_is_percpu)
return msi_map_global_doorbell(domain, dbinfo);

> +
> + /* percpu doorbells */
> + for_each_possible_cpu(cpu) {
> + phys_addr_t __percpu *db_addr =
> + per_cpu_ptr(dbinfo->percpu_doorbells, cpu);
> +
> + if (!map) {
> + iommu_msi_put_doorbell_iova(domain, *db_addr);
> + } else {
> +
> + ret = iommu_msi_get_doorbell_iova(domain, *db_addr,
> +   dbinfo->size,
> +   dbinfo->prot, &iova);
> + if (ret)
> + return ret;
> + }
> + }

Same here:

for_each_possible_cpu(cpu) {
ret = msi_map_percpu_doorbell(domain, cpu);
if (ret)
return ret;
}
return 0;
 
Hmm?

> +
> + return 0;
> +}
> +
> +/**
>   * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
>   * @domain:  The domain to allocate from
>   * @dev: Pointer to device struct of the device for which the interrupts
> @@ -352,17 +423,29 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, 
> struct device *dev,
>  
>   virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
>  dev_to_node(dev), &arg, false);
> - if (virq < 0) {
> - ret = -ENOSPC;
> - if (ops->handle_error)
> - ret = ops->handle_error(domain, desc, ret);
> - if (ops->msi_finish)
> - ops->msi_finish(&arg, ret);
> - return ret;
> - }
> + if (virq < 0)
> + goto error;
>  
>   for (i = 0; i < desc->nvec_used; i++)
>   irq_set_msi_desc_off(virq, i, desc);
> +
> + for (i = 0; i < desc->nvec_used; i++) {
> + struct irq_data *d = irq_get_irq_data(virq + i);
> +
> + ret = msi_handle_doorbell_mappings(d, true);
> + if (ret)
> + break;
> + }
> + if (ret) {
> + for (; i >= 0; i--) {
> + struct irq_data *d = irq_get_irq_data(virq + i);
> +
> + msi_handle_doorbell_mappings(d, false);
> + }
> + irq_domain_free_irqs(virq, desc->nvec_used);
> + desc->irq = 0;
> + goto error;

How is that supposed to work? You clear desc->irq and then you call
ops->handle_error.

Thanks,

tglx

Re: [PATCH v11 10/10] genirq/msi: use the MSI doorbell's IOVA when requested

2016-07-20 Thread Thomas Gleixner
On Tue, 19 Jul 2016, Eric Auger wrote:

First of all - valid for all patches:

Subject: sys/subsys: Sentence starts with an uppercase letter

Now for this particular one:

genirq/msi: use the MSI doorbell's IOVA when requested

> On MSI message composition we now use the MSI doorbell's IOVA in
> place of the doorbell's PA in case the device is upstream to an
> IOMMU that requires MSI addresses to be mapped. The doorbell's
> allocation and mapping happened at an early stage (pci_enable_msi).

This changelog is completely useless. At least I cannot figure out what that
patch actually does. And the implementation is not self-explanatory either.
 
> @@ -63,10 +63,18 @@ static int msi_compose(struct irq_data *irq_data,
>  {
>   int ret = 0;
>  
> - if (erase)
> + if (erase) {
>   memset(msg, 0, sizeof(*msg));
> - else
> + } else {
> + struct device *dev;
> +
>   ret = irq_chip_compose_msi_msg(irq_data, msg);
> + if (ret)
> + return ret;
> +
> + dev = msi_desc_to_dev(irq_data_get_msi_desc(irq_data));
> + WARN_ON(iommu_msi_msg_pa_to_va(dev, msg));

What the heck is this call doing? And why is there only a WARN_ON and not a
proper error return code handling?

Thanks,

tglx


Re: [PATCH v11 06/10] genirq/msi-doorbell: msi_doorbell_safe

2016-07-22 Thread Thomas Gleixner
On Thu, 21 Jul 2016, Auger Eric wrote:
> On 20/07/2016 10:12, Thomas Gleixner wrote:
> > On Tue, 19 Jul 2016, Eric Auger wrote:
> >> +bool msi_doorbell_safe(void)
> >> +{
> >> +  struct irqchip_doorbell *db;
> >> +  bool irq_remapping = true;
> >> +
> >> +  mutex_lock(&irqchip_doorbell_mutex);
> >> +  list_for_each_entry(db, &irqchip_doorbell_list, next) {
> >> +  irq_remapping &= db->info.irq_remapping;
> > 
> > db->info.irq_remapping is set in msi_doorbell_register(). So you can keep 
> > book
> > about that there. No need to iterate here.
> Yes makes sense to store the info at registration time. Currently this
> function is not in any fast path but that's cleaner from a general
> perspective. I will need to do such an iteration at un-registration though.

Two simple counters should be sufficient:

  nr_registered_bells;
  nr_remapped_bells;
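
I.e. do the bookkeeping at (un)registration time and make the check
trivial (a sketch with the counter names above; 'add' is +1 on register
and -1 on unregister):

static unsigned int nr_registered_bells;
static unsigned int nr_remapped_bells;

/* called under irqchip_doorbell_mutex from register/unregister */
static void msi_doorbell_account(struct irqchip_doorbell *db, int add)
{
	nr_registered_bells += add;
	if (db->info.irq_remapping)
		nr_remapped_bells += add;
}

bool msi_doorbell_safe(void)
{
	/* safe iff every registered doorbell provides irq remapping */
	return nr_registered_bells == nr_remapped_bells;
}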



Thanks,

tglx


Re: [PATCH v11 09/10] genirq/msi: map/unmap the MSI doorbells on msi_domain_alloc/free_irqs

2016-07-26 Thread Thomas Gleixner
Eric,

On Mon, 25 Jul 2016, Auger Eric wrote:
> On 20/07/2016 11:04, Thomas Gleixner wrote:
> > On Tue, 19 Jul 2016, Eric Auger wrote:
> >> +  if (ret) {
> >> +  for (; i >= 0; i--) {
> >> +  struct irq_data *d = irq_get_irq_data(virq + i);
> >> +
> >> +  msi_handle_doorbell_mappings(d, false);
> >> +  }
> >> +  irq_domain_free_irqs(virq, desc->nvec_used);
> >> +  desc->irq = 0;
> >> +  goto error;
> > 
> > How is that supposed to work? You clear desc->irq and then you call
> > ops->handle_error.
>
> if I don't clear the desc->irq I enter an infinite loop in
> pci_enable_msix_range.
>
> This happens because msix_capability_init and pcie_enable_msix return 1.
> In msix_capability_init, at out_avail: we enumerate the msi_descs which
> have a non-zero irq, hence the returned value of 1.
>
> Currently the only handle_error ops I found, pci_msi_domain_handle_error,
> does not use the irq field, so it works, although that is questionable.

The logic here is: If the allocation does not succeed for the requested number
of interrupts, we tell the caller how many interrupts we were able to set up.
So the caller can decide what to do.

In your case you don't want to have a partial allocation, so instead of
playing silly games with desc->irq you should add a flag which tells the PCI
code that you are not interested in a partial allocation and that it should
return an error code instead.

Something like PCI_DEV_FLAGS_MSI_NO_PARTIAL_ALLOC should do the trick.
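
On the PCI side that would boil down to something like this (a sketch; the
flag does not exist yet):

	if (nvec_allocated < nvec &&
	    (dev->dev_flags & PCI_DEV_FLAGS_MSI_NO_PARTIAL_ALLOC)) {
		free_msi_irqs(dev);
		return -ENOSPC;		/* all or nothing for this device */
	}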

> As for the irq_domain_free_irqs, I think I can remove it since it is handled later.

Not only the free_irqs(). You should let the teardown function handle
everything including your doorbell mapping teardown. It's nothing special and
free_msi_irqs() at the end of msix_capability_init() will take care of it.

Thanks,

tglx


Re: [PATCH v11 10/10] genirq/msi: use the MSI doorbell's IOVA when requested

2016-07-26 Thread Thomas Gleixner
Eric,

On Mon, 25 Jul 2016, Auger Eric wrote:
> On 20/07/2016 11:09, Thomas Gleixner wrote:
> > On Tue, 19 Jul 2016, Eric Auger wrote:
> >> @@ -63,10 +63,18 @@ static int msi_compose(struct irq_data *irq_data,
> >>  {
> >>int ret = 0;
> >>  
> >> -  if (erase)
> >> +  if (erase) {
> >>memset(msg, 0, sizeof(*msg));
> >> -  else
> >> +  } else {
> >> +  struct device *dev;
> >> +
> >>ret = irq_chip_compose_msi_msg(irq_data, msg);
> >> +  if (ret)
> >> +  return ret;
> >> +
> >> +  dev = msi_desc_to_dev(irq_data_get_msi_desc(irq_data));
> >> +  WARN_ON(iommu_msi_msg_pa_to_va(dev, msg));
> > 
> > What the heck is this call doing? And why is there only a WARN_ON and not a
> > proper error return code handling?
> 
> iommu_msi_msg_pa_to_va is part of the new iommu-msi API introduced in PART I
> of this series. This helper function detects whether the physical address
> found in the MSI message has a corresponding allocated IOVA. This happens if
> the MSI doorbell is accessed through an IOMMU and this IOMMU does not bypass
> the MSI addresses (the ARM case). Allocation of this IOVA was performed in
> the previous patch.
>
> So, if this is the case, the physical address is swapped with the IOVA
> address. That way the PCIe device will send the MSI with this IOVA and
> the address will be translated by the IOMMU into the target MSI doorbell PA.
> 
> Hope this clarifies

No, it does not. You are explaining at great length what that function is
doing, but you are not explaining WHY you don't do proper return code
handling and instead just do a WARN_ON() and happily proceed. If that
function fails then the interrupt will not be functional, so WHY on earth
are you continuing?

Thanks,

tglx


Re: [PATCH v11 09/10] genirq/msi: map/unmap the MSI doorbells on msi_domain_alloc/free_irqs

2016-07-26 Thread Thomas Gleixner
On Tue, 26 Jul 2016, Auger Eric wrote:
> On 26/07/2016 11:00, Thomas Gleixner wrote:
> > In your case you don't want to have a partial allocation, so instead of
> > playing silly games with desc->irq you should add a flag which tells the PCI
> > code that you are not interested in a partial allocation and that it should
> > return an error code instead.
> 

> In that case can we consider we even succeeded in allocating 1 MSI? In case
> the IOMMU mapping fails, the MSI transaction will never reach the target MSI
> frame, so it is not usable. So by "partial" I understand we did not
> succeed in allocating maxvec IRQs, correct? Here we succeeded in
> allocating 0 IRQs and yet msi_capability_init returns 1.
>
> msi_capability_init doc-comment says "a positive return value indicates the
> number of interrupts which could have been allocated."
> 
> I understand allocation success currently only depends on the fact virq was
> allocated and set to desc->irq. But with that IOMMU stuff, doesn't the
> criterion change?

Right. But then you need to express it differently in a consistent way. Not by
hacking around it by setting desc->irq to 0.

Something like a flag field in msi_desc which denotes various properties would
be a possible solution. MSI_IRQ_ALLOCATED and MSI_IRQ_REMAPPED would be
sufficient for now. And the deallocation/cleanup would rely on those flags
rather than checking desc->irq.
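
Roughly like this (a sketch; neither the flags nor the field exist today):

#define MSI_IRQ_ALLOCATED	BIT(0)
#define MSI_IRQ_REMAPPED	BIT(1)

/* in struct msi_desc: unsigned int flags; */

static void msi_teardown_one(struct msi_desc *desc)
{
	if (desc->flags & MSI_IRQ_REMAPPED)
		msi_handle_doorbell_mappings(irq_get_irq_data(desc->irq), false);
	if (desc->flags & MSI_IRQ_ALLOCATED)
		irq_domain_free_irqs(desc->irq, desc->nvec_used);
	desc->flags = 0;
}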

Thanks,

tglx


Re: [PATCH v12 02/11] genirq/msi: msi_compose wrapper

2016-08-09 Thread Thomas Gleixner
On Tue, 2 Aug 2016, Eric Auger wrote:

> Currently the MSI message is composed by directly calling
> irq_chip_compose_msi_msg and erased by setting the memory to zero.
> 
> On some platforms, we will need to make this composition more complex to
> properly handle MSI emission through an IOMMU. Also we will need to track
> when the MSI message is erased.

I just can't find how you do that. After applying the series the

> + if (erase)
> + memset(msg, 0, sizeof(*msg));

branch is still just a memset(). The wrapper is fine for the compose side, but
having the extra argument just to wrap the memset() for no gain is silly.

Thanks,

tglx


Re: [PATCH] iommu/vt-d: Fix modify_irte NULL pointer

2016-08-22 Thread Thomas Gleixner
On Mon, 22 Aug 2016, Wanpeng Li wrote:

> From: Wanpeng Li 
> 
> native_smp_prepare_cpus
>   -> default_setup_apic_routing
> -> enable_IR_x2apic
>   -> irq_remapping_prepare
> -> intel_prepare_irq_remapping
>   -> parse_ioapics_under_ir   => return 0
>   -> ir_parse_ioapic_hpet_scope
> -> ir_parse_one_ioapic_scope  
>   -> intel_setup_irq_remapping  
 
> The IR table is set up even if the noapic boot parameter is added.
> index ac59692..f1cb7c6 100644
> --- a/drivers/iommu/intel_irq_remapping.c
> +++ b/drivers/iommu/intel_irq_remapping.c
> @@ -854,6 +854,9 @@ static int ir_parse_one_ioapic_scope(struct 
> acpi_dmar_device_scope *scope,
>   count = (scope->length - sizeof(struct acpi_dmar_device_scope))
>   / sizeof(struct acpi_dmar_pci_path);
>  
> + if (skip_ioapic_setup)
> + return -ENODEV;

Why are you adding this in the iommu code? We should not call any of the apic
functions when the apic is disabled.

Thanks,

tglx


Re: [PATCH v5 19/19] iommu/dma: Add support for mapping MSIs

2016-08-24 Thread Thomas Gleixner
On Tue, 23 Aug 2016, Robin Murphy wrote:
> + cookie = domain->iova_cookie;
> + iovad = &cookie->iovad;
> +
> + spin_lock(&cookie->msi_lock);
> + list_for_each_entry(msi_page, &cookie->msi_page_list, list)
> + if (msi_page->phys_hi == msg->address_hi &&
> + msi_page->phys_lo - msg->address_lo < iovad->granule)
> + goto unlock;
> +
> + ret = __iommu_dma_map_msi_page(dev, msg, domain, &msi_page);
> +unlock:
> + spin_unlock(&cookie->msi_lock);
> +
> + if (!ret) {
> + msg->address_hi = upper_32_bits(msi_page->iova);
> + msg->address_lo &= iova_mask(iovad);
> + msg->address_lo += lower_32_bits(msi_page->iova);
> + } else {
> + /*
> +  * We're called from a void callback, so the best we can do is
> +  * 'fail' by filling the message with obviously bogus values.
> +  * Since we got this far due to an IOMMU being present, it's
> +  * not like the existing address would have worked anyway...
> +  */
> + msg->address_hi = ~0U;
> + msg->address_lo = ~0U;
> + msg->data = ~0U;
> + }

The above is really horrible to parse. I had to read it five times to
understand the logic.

static struct iommu_dma_msi_page *
find_or_map_msi_page(struct device *dev, struct iommu_domain *domain,
		     struct iommu_dma_cookie *cookie, struct msi_msg *msg)
{
	struct iova_domain *iovad = &cookie->iovad;
	struct iommu_dma_msi_page *page;
	int ret;

	list_for_each_entry(page, &cookie->msi_page_list, list) {
		if (page->phys_hi == msg->address_hi &&
		    page->phys_lo - msg->address_lo < iovad->granule)
			return page;
	}

	/*
	 * FIXME: __iommu_dma_map_msi_page() should return a page or NULL.
	 * The integer return value is pretty pointless. If separate error
	 * codes are required that's what ERR_PTR() is for.
	 */
	ret = __iommu_dma_map_msi_page(dev, msg, domain, &page);
	return ret ? ERR_PTR(ret) : page;
}

So now the code in iommu_dma_map_msi_msg() becomes:

spin_lock(&cookie->msi_lock);
msi_page = find_or_map_msi_page(dev, domain, cookie, msg);
spin_unlock(&cookie->msi_lock);

if (!IS_ERR_OR_NULL(msi_page)) {
msg->address_hi = upper_32_bits(msi_page->iova);
msg->address_lo &= iova_mask(iovad);
msg->address_lo += lower_32_bits(msi_page->iova);
} else {
/*
 * We're called from a void callback, so the best we can do is
 * 'fail' by filling the message with obviously bogus values.
 * Since we got this far due to an IOMMU being present, it's
 * not like the existing address would have worked anyway...
 */
msg->address_hi = ~0U;
msg->address_lo = ~0U;
msg->data = ~0U;
}

Hmm? 

Thanks,

tglx


Re: [RFC PATCH v2 04/20] x86: Secure Memory Encryption (SME) support

2016-08-25 Thread Thomas Gleixner
On Mon, 22 Aug 2016, Tom Lendacky wrote:

> Provide support for Secure Memory Encryption (SME). This initial support
> defines the memory encryption mask as a variable for quick access and an
> accessor for retrieving the number of physical addressing bits lost if
> SME is enabled.

What is the reason that this needs to live in assembly code?
 
Thanks,

tglx


Re: [PATCH v6.1] iommu/dma: Add support for mapping MSIs

2016-09-09 Thread Thomas Gleixner
On Wed, 7 Sep 2016, Robin Murphy wrote:
> - Rework map_page() helper function plus insane lookup logic into
>   straightforward get_page() helper
> - Use phys_addr_t to further simplify address matching
> - Fix the bit where I neglected to actually round the doorbell
>   address to a page boundary (oops!)
> - Make the locking hardirq-safe to satisfy lockdep

That looks way better.

Acked-by: Thomas Gleixner 


Re: [PATCH] arm64: SMMU-v2: Workaround for Cavium ThunderX erratum 28168

2016-10-24 Thread Thomas Gleixner
On Mon, 24 Oct 2016, Marc Zyngier wrote:
> On 22/10/16 05:54, Geetha sowjanya wrote:
> > diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
> > index be3c34e..6add8da 100644
> > --- a/kernel/irq/chip.c
> > +++ b/kernel/irq/chip.c
> > @@ -585,6 +585,10 @@ void handle_fasteoi_irq(struct irq_desc *desc)
> > goto out;
> > }
> >  
> > +#ifdef CONFIG_CAVIUM_ERRATUM_28168
> > +   if (chip->irq_ack)
> > +   chip->irq_ack(&desc->irq_data);
> > +#endif
> > kstat_incr_irqs_this_cpu(desc);
> > if (desc->istate & IRQS_ONESHOT)
> > mask_irq(desc);
> > 
> 
> Overall, this workaround is not acceptable as it is.

Aside from not being acceptable, this thing is completely broken.

If that erratum is enabled, then an interrupt chip which implements both EOI
and ACK callbacks will issue irq_ack when using the fasteoi handler. While
this might work on that Cavium trainwreck, it will just make other
platforms pretty unhappy.

Platform specific hacks have no place in the core code at all. We have
enough options to handle oddball hardware, you just have to use them.
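
One of those options, sketched below: keep the quirk in a driver-supplied
flow handler that is installed only for the affected chip. (Mainline later
grew handle_fasteoi_ack_irq() for this pattern; the sketch is illustrative
kernel-style code, not that exact implementation.)

#include <linux/irq.h>

/* Chip-local flow handler: ack first as the erratum demands, then run
 * the normal fasteoi flow. Installed via irq_set_handler() for this
 * chip only, so other platforms never see the ack. */
static void thunderx_fasteoi_ack_irq(struct irq_desc *desc)
{
	struct irq_chip *chip = irq_desc_get_chip(desc);

	chip->irq_ack(irq_desc_get_irq_data(desc));
	handle_fasteoi_irq(desc);
}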

Thanks,

tglx


Re: [PATCH 04/12] x86: make dma_cache_sync a no-op

2017-08-31 Thread Thomas Gleixner
On Sun, 27 Aug 2017, Christoph Hellwig wrote:

> x86 does not implement DMA_ATTR_NON_CONSISTENT allocations, so it doesn't
> make any sense to do any work in dma_cache_sync given that it must be a
> no-op when dma_alloc_attrs returns coherent memory.
> 
> Signed-off-by: Christoph Hellwig 

> ---
>  arch/x86/include/asm/dma-mapping.h | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/dma-mapping.h 
> b/arch/x86/include/asm/dma-mapping.h
> index 398c79889f5c..04877267ad18 100644
> --- a/arch/x86/include/asm/dma-mapping.h
> +++ b/arch/x86/include/asm/dma-mapping.h
> @@ -70,7 +70,6 @@ static inline void
>  dma_cache_sync(struct device *dev, void *vaddr, size_t size,
>   enum dma_data_direction dir)
>  {
> - flush_write_buffers();
>  }
>  
>  static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
> -- 
> 2.11.0
> 
> 


[patch 45/52] iommu/vt-d: Reevaluate vector configuration on activate()

2017-09-13 Thread Thomas Gleixner
With the upcoming reservation/management scheme, early activation will
assign a special vector. The final activation at request_irq() assigns a
real vector, which needs to be updated in the tables.

Split out the reconfiguration code in set_affinity and use it for
reactivation.

Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/intel_irq_remapping.c |   38 +++-
 1 file changed, 21 insertions(+), 17 deletions(-)

--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1121,6 +1121,24 @@ struct irq_remap_ops intel_irq_remap_ops
.get_irq_domain = intel_get_irq_domain,
 };
 
+static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
+{
+   struct intel_ir_data *ir_data = irqd->chip_data;
+   struct irte *irte = &ir_data->irte_entry;
+   struct irq_cfg *cfg = irqd_cfg(irqd);
+
+   /*
+* Atomically updates the IRTE with the new destination, vector
+* and flushes the interrupt entry cache.
+*/
+   irte->vector = cfg->vector;
+   irte->dest_id = IRTE_DEST(cfg->dest_apicid);
+
+   /* Update the hardware only if the interrupt is in remapped mode. */
+   if (!force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
+   modify_irte(&ir_data->irq_2_iommu, irte);
+}
+
 /*
  * Migrate the IO-APIC irq in the presence of intr-remapping.
  *
@@ -1139,27 +1157,15 @@ static int
 intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
  bool force)
 {
-   struct intel_ir_data *ir_data = data->chip_data;
-   struct irte *irte = &ir_data->irte_entry;
-   struct irq_cfg *cfg = irqd_cfg(data);
struct irq_data *parent = data->parent_data;
+   struct irq_cfg *cfg = irqd_cfg(data);
int ret;
 
ret = parent->chip->irq_set_affinity(parent, mask, force);
if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
return ret;
 
-   /*
-* Atomically updates the IRTE with the new destination, vector
-* and flushes the interrupt entry cache.
-*/
-   irte->vector = cfg->vector;
-   irte->dest_id = IRTE_DEST(cfg->dest_apicid);
-
-   /* Update the hardware only if the interrupt is in remapped mode. */
-   if (ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
-   modify_irte(&ir_data->irq_2_iommu, irte);
-
+   intel_ir_reconfigure_irte(data, false);
/*
 * After this point, all the interrupts will start arriving
 * at the new destination. So, time to cleanup the previous
@@ -1392,9 +1398,7 @@ static void intel_irq_remapping_free(str
 static int intel_irq_remapping_activate(struct irq_domain *domain,
struct irq_data *irq_data, bool early)
 {
-   struct intel_ir_data *data = irq_data->chip_data;
-
-   modify_irte(&data->irq_2_iommu, &data->irte_entry);
+   intel_ir_reconfigure_irte(irq_data, true);
return 0;
 }
 




[patch 46/52] iommu/amd: Reevaluate vector configuration on activate()

2017-09-13 Thread Thomas Gleixner
With the upcoming reservation/management scheme, early activation will
assign a special vector. The final activation at request_irq() assigns a
real vector, which needs to be updated in the tables.

Split out the reconfiguration code in set_affinity and use it for
reactivation.

Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: iommu@lists.linux-foundation.org
---
 drivers/iommu/amd_iommu.c |   39 +--
 1 file changed, 29 insertions(+), 10 deletions(-)

--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4170,16 +4170,25 @@ static void irq_remapping_free(struct ir
irq_domain_free_irqs_common(domain, virq, nr_irqs);
 }
 
+static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
+  struct amd_ir_data *ir_data,
+  struct irq_2_irte *irte_info,
+  struct irq_cfg *cfg);
+
 static int irq_remapping_activate(struct irq_domain *domain,
  struct irq_data *irq_data, bool early)
 {
struct amd_ir_data *data = irq_data->chip_data;
struct irq_2_irte *irte_info = &data->irq_2_irte;
struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid];
+   struct irq_cfg *cfg = irqd_cfg(irq_data);
 
-   if (iommu)
-   iommu->irte_ops->activate(data->entry, irte_info->devid,
- irte_info->index);
+   if (!iommu)
+   return 0;
+
+   iommu->irte_ops->activate(data->entry, irte_info->devid,
+ irte_info->index);
+   amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
return 0;
 }
 
@@ -4267,6 +4276,22 @@ static int amd_ir_set_vcpu_affinity(stru
return modify_irte_ga(irte_info->devid, irte_info->index, irte, 
ir_data);
 }
 
+
+static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
+  struct amd_ir_data *ir_data,
+  struct irq_2_irte *irte_info,
+  struct irq_cfg *cfg)
+{
+
+   /*
+* Atomically updates the IRTE with the new destination, vector
+* and flushes the interrupt entry cache.
+*/
+   iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
+ irte_info->index, cfg->vector,
+ cfg->dest_apicid);
+}
+
 static int amd_ir_set_affinity(struct irq_data *data,
   const struct cpumask *mask, bool force)
 {
@@ -4284,13 +4309,7 @@ static int amd_ir_set_affinity(struct ir
if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
return ret;
 
-   /*
-* Atomically updates the IRTE with the new destination, vector
-* and flushes the interrupt entry cache.
-*/
-   iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid,
-   irte_info->index, cfg->vector, cfg->dest_apicid);
-
+   amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
/*
 * After this point, all the interrupts will start arriving
 * at the new destination. So, time to cleanup the previous




RE: [PATCH 02/11] x86: make dma_cache_sync a no-op

2017-10-03 Thread Thomas Gleixner
On Tue, 3 Oct 2017, David Laight wrote:

> From: Christoph Hellwig
> > Sent: 03 October 2017 11:43
> > x86 does not implement DMA_ATTR_NON_CONSISTENT allocations, so it doesn't
> > make any sense to do any work in dma_cache_sync given that it must be a
> > no-op when dma_alloc_attrs returns coherent memory.
> 
> I believe it is just about possible to require an explicit
> write flush on x86.
> ISTR this can happen with something like write combining.

As the changelog says: x86 only implements dma_alloc_coherent() which as
the name says returns coherent memory, i.e. type WB (write back), which is
not subject to dma_cache_sync() operations.

If the driver converts that memory to WC (write combine) on its own via
PAT/MTRR, then it needs to take care of flushing the write buffer on its
own. It's not covered by this interface.
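
If a driver insists on WC mappings it also owns the ordering; on x86 that
boils down to draining the write-combining buffers before notifying the
device. A minimal sketch, assuming x86-64 and GCC-style inline asm (this is
driver-side code, not part of the DMA interface):

#include <stdint.h>

/* sfence drains the CPU's write-combining buffers, so the device is
 * guaranteed to observe the WC writes before the doorbell write. */
static inline void wc_ring_doorbell(volatile uint32_t *doorbell, uint32_t val)
{
	asm volatile("sfence" ::: "memory");
	*doorbell = val;
}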

Thanks,

tglx



Re: [PATCH 1/2] iommu/amd: Add align parameter to alloc_irq_index()

2017-10-08 Thread Thomas Gleixner
On Fri, 6 Oct 2017, Joerg Roedel wrote:

> From: Joerg Roedel 
> 
> For multi-MSI IRQ ranges the IRQ index needs to be aligned
> to the power-of-two of the requested IRQ count. Extend the
> alloc_irq_index() function to allow such an allocation.
> 
> Reported-by: Thomas Gleixner 
> Fixes: 2b324506341cb ('iommu/amd: Add routines to manage irq remapping 
> tables')
> Signed-off-by: Joerg Roedel 

Reviewed-by: Thomas Gleixner 
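
For context on the quoted changelog: multi-MSI devices OR the interrupt
number into the low bits of the MSI data value, so a block of 2^n vectors
only works if its base index is 2^n-aligned. A standalone sketch of such an
aligned search (the table and function are illustrative, not the driver's
actual allocator):

#include <stdio.h>

/* Search a usage table for 'count' free consecutive slots starting at
 * a count-aligned base; count is a power of two, as for multi-MSI. */
static int find_aligned_index(const unsigned char *used, int table_size,
			      int count)
{
	for (int base = 0; base + count <= table_size; base += count) {
		int busy = 0;

		for (int i = 0; i < count; i++)
			busy |= used[base + i];
		if (!busy)
			return base;	/* naturally count-aligned */
	}
	return -1;
}

int main(void)
{
	unsigned char used[16] = { 1, 1, 1 };	/* slots 0-2 taken */

	/* 4 MSIs cannot start at slot 3; the search lands on slot 4 */
	printf("index = %d\n", find_aligned_index(used, 16, 4));
	return 0;
}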


Re: [PATCH 2/2] iommu/amd: Enforce alignment for MSI IRQs

2017-10-08 Thread Thomas Gleixner
On Fri, 6 Oct 2017, Joerg Roedel wrote:

> From: Joerg Roedel 
> 
> Make use of the new alignment capability of
> alloc_irq_index() to enforce IRQ index alignment
> for MSI.
> 
> Reported-by: Thomas Gleixner 
> Fixes: 2b324506341cb ('iommu/amd: Add routines to manage irq remapping 
> tables')
> Signed-off-by: Joerg Roedel 

Reviewed-by: Thomas Gleixner 


Re: [PATCH 03/14] x86: use dma-direct

2018-03-15 Thread Thomas Gleixner
On Wed, 14 Mar 2018, Christoph Hellwig wrote:

> The generic dma-direct implementation is now functionally equivalent to
> the x86 nommu dma_map implementation, so switch over to using it.

Can you please convert the various drivers first and then remove the
unused code?

> Note that the various iommu drivers are switched from x86_dma_supported
> to dma_direct_supported to provide identical functionality, although the
> checks look fairly questionable for at least some of them.

Can you please elaborate? From the above it's not clear which checks you
are referring to. If you convert these drivers separately then explicit
information about your concerns wants to be in the changelogs.

Thanks,

tglx


Re: [PATCH 04/14] x86: use generic swiotlb_ops

2018-03-15 Thread Thomas Gleixner
On Wed, 14 Mar 2018, Christoph Hellwig wrote:
>  #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU)
>   void *iommu; /* hook for IOMMU specific extension */
>  #endif
> +#ifdef CONFIG_STA2X11
> + bool is_sta2x11 : 1;

Huch? Please use either bool or an unsigned int based bitfield. A boolean
bitfield doesn't make sense.

Thanks,

tglx


Re: [PATCH 04/14] x86: use generic swiotlb_ops

2018-03-15 Thread Thomas Gleixner
On Thu, 15 Mar 2018, Christoph Hellwig wrote:

> On Thu, Mar 15, 2018 at 10:00:57AM +0100, Thomas Gleixner wrote:
> > On Wed, 14 Mar 2018, Christoph Hellwig wrote:
> > >  #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU)
> > >   void *iommu; /* hook for IOMMU specific extension */
> > >  #endif
> > > +#ifdef CONFIG_STA2X11
> > > + bool is_sta2x11 : 1;
> > 
> > Huch? Please use either bool or an unsigned int based bitfield. A boolean
> > bitfield doesn't make sense.
> 
> bool bitfields are perfectly valid in C99 and used in various places in
> the kernel. But if you want either bool or an unsigned bitfield let me
> know and I'll switch it over.

Yeah, I know that the standard defines it, but that doesn't mean it makes
sense. At least not to me.

Thanks,

tglx


Re: [PATCH 03/14] x86: use dma-direct

2018-03-15 Thread Thomas Gleixner
On Thu, 15 Mar 2018, Christoph Hellwig wrote:

> On Thu, Mar 15, 2018 at 09:56:13AM +0100, Thomas Gleixner wrote:
> > On Wed, 14 Mar 2018, Christoph Hellwig wrote:
> > 
> > > The generic dma-direct implementation is now functionally equivalent to
> > > the x86 nommu dma_map implementation, so switch over to using it.
> > 
> > Can you please convert the various drivers first and then remove the
> > unused code?
> 
> Which various drivers?
> 
> > > Note that the various iommu drivers are switched from x86_dma_supported

The various iommu drivers 

> > > to dma_direct_supported to provide identical functionality, although the
> > > checks look fairly questionable for at least some of them.
> > 
> > Can you please elaborate? From the above it's not clear which checks you
> > are referring to. If you convert these drivers separately then explicit
> > information about your concerns wants to be in the changelogs.
> 
> This bit:
> 
>   /* Copied from i386. Doesn't make much sense, because it will
>  only work for pci_alloc_coherent.
>  The caller just has to use GFP_DMA in this case. */
>   if (mask < DMA_BIT_MASK(24))
>   return 0;
> 
> in x86_dma_supported, or the equivalent bit in dma_direct_supported.
> Kept for bug to bug compatibility, but I guess I should reword or
> just drop the changelog bit іf it causes confusion.

Reword please.

Thanks,

tglx

Re: [PATCH 03/14] x86: use dma-direct

2018-03-15 Thread Thomas Gleixner
On Thu, 15 Mar 2018, Christoph Hellwig wrote:
> On Thu, Mar 15, 2018 at 01:53:52PM +0100, Thomas Gleixner wrote:
> > > > > The generic dma-direct implementation is now functionally equivalent 
> > > > > to
> > > > > the x86 nommu dma_map implementation, so switch over to using it.
> > > > 
> > > > Can you please convert the various drivers first and then remove the
> > > > unused code?
> > > 
> > > Which various drivers?
> > > 
> > > > > Note that the various iommu drivers are switched from 
> > > > > x86_dma_supported
> > 
> > The various iommu drivers 
> 
> It doesn't really make any sense to switch them separately.  They've
> inherited the ops from x86 which used to override it on an arch level,
> and we've now generalized exactly the same implementation to common code.

Fair enough.



Re: [PATCH 04/14] x86: use generic swiotlb_ops

2018-03-15 Thread Thomas Gleixner
On Thu, 15 Mar 2018, Christoph Hellwig wrote:

> On Thu, Mar 15, 2018 at 01:52:14PM +0100, Thomas Gleixner wrote:
> > Yeah, I know that the standard defines it, but that doesn't mean it makes
> > sense. At least not to me.
> 
> It makes sense in that it logically is a boolean but we only want
> to allocate 1 bit for it, unlike the normal ABI allocations of at
> least a byte.
>
> Either way, tell me what you want it changed to, and I'll do it.

I'd prefer either bool or a regular bitfield, but I can live with the
boolean bitfield as well.
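
For readers following the disagreement, the two spellings differ only in the
bitfield's base type; both are valid C, and on common ABIs they pack
differently (a quick standalone check, with illustrative struct names):

#include <stdbool.h>
#include <stdio.h>

struct dev_a { bool is_sta2x11 : 1; };		/* C99 boolean bitfield */
struct dev_b { unsigned int is_sta2x11 : 1; };	/* conventional form */

int main(void)
{
	/* Layout is implementation-defined; on x86-64 GCC this typically
	 * prints 1 and 4. */
	printf("%zu %zu\n", sizeof(struct dev_a), sizeof(struct dev_b));
	return 0;
}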



Re: use generic dma-direct and swiotlb code for x86 V3

2018-03-19 Thread Thomas Gleixner
On Mon, 19 Mar 2018, Christoph Hellwig wrote:

> Hi all,
> 
> this series switches the x86 code the the dma-direct implementation
> for direct (non-iommu) dma and the generic swiotlb ops.  This includes
> getting rid of the special ops for the AMD memory encryption case and
> the STA2x11 SOC.  The generic implementations are based on the x86
> code, so they provide the same functionality.

Reviewed-by: Thomas Gleixner 


Re: [PATCH] swiotlb: swiotlb_{alloc,free}_buffer should depend on CONFIG_DMA_DIRECT_OPS

2018-03-23 Thread Thomas Gleixner
On Fri, 23 Mar 2018, Konrad Rzeszutek Wilk wrote:

> On Fri, Mar 23, 2018 at 06:49:30PM +0100, Christoph Hellwig wrote:
> > Otherwise we might get unused symbol warnings for configs that built
> > swiotlb.c only for use by xen-swiotlb.c and that don't otherwise select
> > CONFIG_DMA_DIRECT_OPS, which is possible on arm.
> > 
> > Fixes: 16e73adbca76 ("dma/swiotlb: Remove swiotlb_{alloc,free}_coherent()")
> > Reported-by: Stephen Rothwell 
> > Signed-off-by: Christoph Hellwig 
> 
> 
> Alternatively could we set the Kconfig to slect DMA_DIRECT_OPS?

You only want to do that when you actually need the code. If not it's a
pointless exercise.

But Christoph's change makes sense independent of that, because the next
oddball Kconfig will come along sooner rather than later and run into the
very same problem.

Thanks,

tglx


Re: [RFC PATCH v9 02/13] x86: always set IF before oopsing from page fault

2019-04-04 Thread Thomas Gleixner
On Thu, 4 Apr 2019, Tycho Andersen wrote:
>   leaq-PTREGS_SIZE(%rax), %rsp
>   UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
>  
> + /*
> +  * If we oopsed in an interrupt handler, interrupts may be off. Let's 
> turn
> +  * them back on before going back to "normal" code.
> +  */
> + sti

That breaks the paravirt muck and tracing/lockdep.

ENABLE_INTERRUPTS() is what you want, plus TRACE_IRQS_ON to keep the tracer
and lockdep happy.

Thanks,

tglx


Re: [RFC PATCH v9 12/13] xpfo, mm: Defer TLB flushes for non-current CPUs (x86 only)

2019-04-05 Thread Thomas Gleixner
On Thu, 4 Apr 2019, Khalid Aziz wrote:
> When xpfo unmaps a page from physmap only (after mapping the page in
> userspace in response to an allocation request from userspace) on one
> processor, there is a small window of opportunity for ret2dir attack on
> other cpus until the TLB entry in physmap for the unmapped pages on
> other cpus is cleared. Forcing that to happen synchronously is the
> expensive part. A multiple of these requests can come in over a very
> short time across multiple processors resulting in every cpu asking
> every other cpu to flush TLB just to close this small window of
> vulnerability in the kernel. If each request is processed synchronously,
> each CPU will do multiple TLB flushes in short order. If we could
> consolidate these TLB flush requests instead and do one TLB flush on
> each cpu at the time of context switch, we can reduce the performance
> impact significantly. This bears out in real life measuring the system
> time when doing a parallel kernel build on a large server. Without this,
> system time on 96-core server when doing "make -j60 all" went up 26x.
> After this optimization, impact went down to 1.44x.
> 
> The trade-off with this strategy is, the kernel on a cpu is vulnerable
> for a short time if the current running processor is the malicious

The "short" time to next context switch on the other CPUs is how short
exactly? Anything from 1us to seconds (think NOHZ FULL), and even without
that, 10ms on a HZ=100 kernel is plenty of time to launch an attack.

> process. Is that an acceptable trade-off?

You are not seriously asking whether creating a user-controllable ret2dir
attack window is an acceptable trade-off? April 1st was a few days ago.

Thanks,

tglx
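
For reference, the deferral scheme being torn apart above looks roughly like
this as a standalone sketch (names invented; it shows the mechanism,
including exactly the stale window tglx objects to, between marking and the
next context switch):

#include <stdatomic.h>
#include <stdbool.h>

#define NR_CPUS 8

static atomic_bool tlb_stale[NR_CPUS];

/* stand-in for the real per-CPU TLB flush */
static void local_tlb_flush(int cpu) { (void)cpu; }

/* Called after a page is unmapped from the physmap: instead of an
 * immediate IPI storm, just mark every CPU's TLB as stale. */
static void xpfo_defer_flush(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		atomic_store(&tlb_stale[cpu], true);
}

/* Hooked into context switch: exchange-and-clear so a concurrent
 * xpfo_defer_flush() is not lost. */
static void context_switch_tlb_hook(int cpu)
{
	if (atomic_exchange(&tlb_stale[cpu], false))
		local_tlb_flush(cpu);
}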


Re: [PATCH v4 0/6] normalize IOMMU dma mode boot options

2019-04-07 Thread Thomas Gleixner
On Mon, 8 Apr 2019, Leizhen (ThunderTown) wrote:
> > 
> > This will break systems using boot options as now, and I think
> > this is unacceptable. If you want to do so, just introduce iommu.dma_mode
> > on top of those iommu boot options with dma mode boot options unchanged,
> > and iommu.dma_mode is for all archs but compatible with them.
> 
> I just changed the boot option names, but kept the function unchanged. I
> added all related maintainers/supporters in the "to=" list, maybe we can
> discuss this.

Changing the name _IS_ the problem. Think about unattended updates.

> Should I add some "obsoleted" warnings for old options and keep them for a 
> while?

No, just keep the old options around for backwards compatibility's sake. We
just do not add new arch-specific options in the future. New options need
to use the generic iommu.dma_mode namespace.
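
In kernel terms, keeping an old spelling alive while adding the generic one
can be as small as two early_param() registrations feeding one parser; the
option names and values below are illustrative, not the real ones:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>

static bool iommu_dma_strict;

static int __init iommu_dma_mode_setup(char *str)
{
	if (!str)
		return -EINVAL;
	if (!strcmp(str, "strict"))
		iommu_dma_strict = true;
	else if (!strcmp(str, "lazy"))
		iommu_dma_strict = false;
	else
		return -EINVAL;
	return 0;
}
early_param("iommu.dma_mode", iommu_dma_mode_setup);

/* Legacy arch-specific spelling, kept so existing command lines and
 * unattended updates do not break. */
static int __init iommu_legacy_mode_setup(char *str)
{
	return iommu_dma_mode_setup(str);
}
early_param("iommu.legacy_dma_mode", iommu_legacy_mode_setup);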

Thanks,

tglx


[RFC patch 28/41] dma/debug: Simplify stacktrace retrieval

2019-04-10 Thread Thomas Gleixner
Replace the indirection through struct stack_trace with an invocation of
the storage array based interface.

Signed-off-by: Thomas Gleixner 
Cc: iommu@lists.linux-foundation.org
Cc: Robin Murphy 
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
---
 kernel/dma/debug.c |   13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -89,8 +89,8 @@ struct dma_debug_entry {
int  sg_mapped_ents;
enum map_err_types  map_err_type;
 #ifdef CONFIG_STACKTRACE
-   struct   stack_trace stacktrace;
-   unsigned longst_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
+   unsigned intst_len;
+   unsigned long   st_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
 #endif
 };
 
@@ -174,7 +174,7 @@ static inline void dump_entry_trace(stru
 #ifdef CONFIG_STACKTRACE
if (entry) {
pr_warning("Mapped at:\n");
-   print_stack_trace(&entry->stacktrace, 0);
+   stack_trace_print(entry->st_entries, entry->st_len, 0);
}
 #endif
 }
@@ -704,12 +704,9 @@ static struct dma_debug_entry *dma_entry
spin_unlock_irqrestore(&free_entries_lock, flags);
 
 #ifdef CONFIG_STACKTRACE
-   entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
-   entry->stacktrace.entries = entry->st_entries;
-   entry->stacktrace.skip = 2;
-   save_stack_trace(&entry->stacktrace);
+   entry->st_len = stack_trace_save(entry->st_entries,
+ARRAY_SIZE(entry->st_entries), 2);
 #endif
-
return entry;
 }
 




Re: [RFC patch 28/41] dma/debug: Simplify stacktrace retrieval

2019-04-10 Thread Thomas Gleixner
On Wed, 10 Apr 2019, Christoph Hellwig wrote:

> On Wed, Apr 10, 2019 at 12:28:22PM +0200, Thomas Gleixner wrote:
> > Replace the indirection through struct stack_trace with an invocation of
> > the storage array based interface.
> 
> This seems to be missing some context, at least stack_trace_save does
> not actually exist in mainline.
> 
> Please always send the whole series out to everyone on the To and Cc
> list, otherwise patch series are not reviewable.
 
Bah. People complain about overly broad cc-lists and the context is on
lkml. But sure, I just bounced it to you.

Thanks,

tglx



Re: [RFC PATCH v9 03/13] mm: Add support for eXclusive Page Frame Ownership (XPFO)

2019-04-17 Thread Thomas Gleixner
On Wed, 17 Apr 2019, Nadav Amit wrote:
> > On Apr 17, 2019, at 10:26 AM, Ingo Molnar  wrote:
> >> As I was curious, I looked at the paper. Here is a quote from it:
> >> 
> >> "In x86-64, however, the permissions of physmap are not in sane state.
> >> Kernels up to v3.8.13 violate the W^X property by mapping the entire region
> >> as “readable, writeable, and executable” (RWX)—only very recent kernels
> >> (≥v3.9) use the more conservative RW mapping.”
> > 
> > But v3.8.13 is a 5+ years old kernel, it doesn't count as a "modern" 
> > kernel in any sense of the word. For any proposed patchset with 
> > significant complexity and non-trivial costs the benchmark version 
> > threshold is the "current upstream kernel".
> > 
> > So does that quote address my followup questions:
> > 
> >> Is this actually true of modern x86-64 kernels? We've locked down W^X
> >> protections in general.
> >> 
> >> I.e. this conclusion:
> >> 
> >>  "Therefore, by simply overwriting kfptr with 0x87FF9F08 and
> >>   triggering the kernel to dereference it, an attacker can directly
> >>   execute shell code with kernel privileges."
> >> 
> >> ... appears to be predicated on imperfect W^X protections on the x86-64
> >> kernel.
> >> 
> >> Do such holes exist on the latest x86-64 kernel? If yes, is there a
> >> reason to believe that these W^X holes cannot be fixed, or that any fix
> >> would be more expensive than XPFO?
> > 
> > ?
> > 
> > What you are proposing here is a XPFO patch-set against recent kernels 
> > with significant runtime overhead, so my questions about the W^X holes 
> > are warranted.
> > 
> 
> Just to clarify - I am an innocent bystander and have no part in this work.
> I was just looking (again) at the paper, as I was curious due to the recent
> patches that I sent that improve W^X protection.

It's not necessarily a W+X issue. The user space text is mapped in the
kernel as well, and even if it is mapped RX this can still happen. So any
kernel mappings of user space text need to be mapped NX!

Thanks,

tglx

Re: [RFC PATCH v9 03/13] mm: Add support for eXclusive Page Frame Ownership (XPFO)

2019-04-17 Thread Thomas Gleixner
On Wed, 17 Apr 2019, Linus Torvalds wrote:

> On Wed, Apr 17, 2019, 14:20 Thomas Gleixner  wrote:
> 
> >
> > It's not necessarily a W+X issue. The user space text is mapped in the
> > kernel as well and even if it is mapped RX then this can happen. So any
> > kernel mappings of user space text need to be mapped NX!
> 
> With SMEP, user space pages are always NX.

We talk past each other. The user space page in the ring3 valid virtual
address space (non negative) is of course protected by SMEP.

The attack utilizes the kernel linear mapping of the physical
memory. I.e. user space address 0x43210 has a kernel equivalent at
0xfxx. So if the attack manages to trick the kernel to that valid
kernel address and that is mapped X --> game over. SMEP does not help
there.

From the top of my head I'd say this is a non-issue as those kernel address
space mappings _should_ be NX, but we got bitten by _should_ in the past :)

Thanks,

tglx




Re: [RFC PATCH v9 03/13] mm: Add support for eXclusive Page Frame Ownership (XPFO)

2019-04-17 Thread Thomas Gleixner
On Wed, 17 Apr 2019, Linus Torvalds wrote:
> On Wed, Apr 17, 2019 at 4:42 PM Thomas Gleixner  wrote:
> > On Wed, 17 Apr 2019, Linus Torvalds wrote:
> > > With SMEP, user space pages are always NX.
> >
> > We talk past each other. The user space page in the ring3 valid virtual
> > address space (non negative) is of course protected by SMEP.
> >
> > The attack utilizes the kernel linear mapping of the physical
> > memory. I.e. user space address 0x43210 has a kernel equivalent at
> > 0xfxx. So if the attack manages to trick the kernel to that valid
> > kernel address and that is mapped X --> game over. SMEP does not help
> > there.
> 
> Oh, agreed.
> 
> But that would simply be a kernel bug. We should only map kernel pages
> executable when we have kernel code in them, and we should certainly
> not allow those pages to be mapped writably in user space.
> 
> That kind of "executable in kernel, writable in user" would be a
> horrendous and major bug.
> 
> So i think it's a non-issue.

Pretty much.

> > From the top of my head I'd say this is a non issue as those kernel address
> > space mappings _should_ be NX, but we got bitten by _should_ in the past:)
> 
> I do agree that bugs can happen, obviously, and we might have missed 
> something.
>
> But in the context of XPFO, I would argue (*very* strongly) that the
> likelihood of the above kind of bug is absolutely *miniscule* compared
> to the likelihood that we'd have something wrong in the software
> implementation of XPFO.
> 
> So if the argument is "we might have bugs in software", then I think
> that's an argument _against_ XPFO rather than for it.

No argument from my side. We better spend time to make sure that a bogus
kernel side X mapping is caught, like we catch other things.

Thanks,

tglx



[patch V2 02/29] stacktrace: Provide helpers for common stack trace operations

2019-04-18 Thread Thomas Gleixner
All operations with stack traces are based on struct stack_trace. That's a
horrible construct as the struct is a kitchen sink for input and
output. Quite some usage sites embed it into their own data structures
which creates weird indirections.

There is absolutely no point in doing so. For all use cases a storage array
and the number of valid stack trace entries in the array is sufficient.

Provide helper functions which avoid the struct stack_trace indirection so
the usage sites can be cleaned up.

Signed-off-by: Thomas Gleixner 
---
 include/linux/stacktrace.h |   27 +++
 kernel/stacktrace.c|  160 -
 2 files changed, 172 insertions(+), 15 deletions(-)

--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -3,11 +3,26 @@
 #define __LINUX_STACKTRACE_H
 
 #include 
+#include 
 
 struct task_struct;
 struct pt_regs;
 
 #ifdef CONFIG_STACKTRACE
+void stack_trace_print(unsigned long *trace, unsigned int nr_entries,
+  int spaces);
+int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
+   unsigned int nr_entries, int spaces);
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+ unsigned int skipnr);
+unsigned int stack_trace_save_tsk(struct task_struct *task,
+ unsigned long *store, unsigned int size,
+ unsigned int skipnr);
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+  unsigned int size, unsigned int skipnr);
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+
+/* Internal interfaces. Do not use in generic code */
 struct stack_trace {
unsigned int nr_entries, max_entries;
unsigned long *entries;
@@ -41,4 +56,16 @@ extern void save_stack_trace_user(struct
 # define save_stack_trace_tsk_reliable(tsk, trace) ({ -ENOSYS; })
 #endif /* CONFIG_STACKTRACE */
 
+#if defined(CONFIG_STACKTRACE) && defined(CONFIG_HAVE_RELIABLE_STACKTRACE)
+int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long 
*store,
+ unsigned int size);
+#else
+static inline int stack_trace_save_tsk_reliable(struct task_struct *tsk,
+   unsigned long *store,
+   unsigned int size)
+{
+   return -ENOSYS;
+}
+#endif
+
 #endif /* __LINUX_STACKTRACE_H */
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -11,35 +11,52 @@
 #include 
 #include 
 
-void print_stack_trace(struct stack_trace *trace, int spaces)
+/**
+ * stack_trace_print - Print the entries in the stack trace
+ * @entries:   Pointer to storage array
+ * @nr_entries:Number of entries in the storage array
+ * @spaces:Number of leading spaces to print
+ */
+void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
+  int spaces)
 {
-   int i;
+   unsigned int i;
 
-   if (WARN_ON(!trace->entries))
+   if (WARN_ON(!entries))
return;
 
-   for (i = 0; i < trace->nr_entries; i++)
-   printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]);
+   for (i = 0; i < nr_entries; i++)
+   printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);
+}
+EXPORT_SYMBOL_GPL(stack_trace_print);
+
+void print_stack_trace(struct stack_trace *trace, int spaces)
+{
+   stack_trace_print(trace->entries, trace->nr_entries, spaces);
 }
 EXPORT_SYMBOL_GPL(print_stack_trace);
 
-int snprint_stack_trace(char *buf, size_t size,
-   struct stack_trace *trace, int spaces)
+/**
+ * stack_trace_snprint - Print the entries in the stack trace into a buffer
+ * @buf:   Pointer to the print buffer
+ * @size:  Size of the print buffer
+ * @entries:   Pointer to storage array
+ * @nr_entries:Number of entries in the storage array
+ * @spaces:Number of leading spaces to print
+ */
+int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
+   unsigned int nr_entries, int spaces)
 {
-   int i;
-   int generated;
-   int total = 0;
+   unsigned int generated, i, total = 0;
 
-   if (WARN_ON(!trace->entries))
+   if (WARN_ON(!entries))
return 0;
 
-   for (i = 0; i < trace->nr_entries; i++) {
+   for (i = 0; i < nr_entries; i++) {
generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ',
-(void *)trace->entries[i]);
+(void *)entries[i]);
 
total += generated;
-
-   /* Assume that generated isn't a negative number */
if (generated >= size) {
 

[patch V2 03/29] lib/stackdepot: Provide functions which operate on plain storage arrays

2019-04-18 Thread Thomas Gleixner
The struct stack_trace indirection in the stack depot functions is a truly
pointless exercise which requires horrible code at the callsites.

Provide interfaces based on plain storage arrays.

Signed-off-by: Thomas Gleixner 
Acked-by: Alexander Potapenko 
---
 include/linux/stackdepot.h |4 ++
 lib/stackdepot.c   |   66 -
 2 files changed, 51 insertions(+), 19 deletions(-)

--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -26,7 +26,11 @@ typedef u32 depot_stack_handle_t;
 struct stack_trace;
 
 depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags);
+depot_stack_handle_t stack_depot_save(unsigned long *entries,
+ unsigned int nr_entries, gfp_t gfp_flags);
 
 void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace);
+unsigned int stack_depot_fetch(depot_stack_handle_t handle,
+  unsigned long **entries);
 
 #endif
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -194,40 +194,56 @@ static inline struct stack_record *find_
return NULL;
 }
 
-void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace)
+/**
+ * stack_depot_fetch - Fetch stack entries from a depot
+ *
+ * @entries:   Pointer to store the entries address
+ */
+unsigned int stack_depot_fetch(depot_stack_handle_t handle,
+  unsigned long **entries)
 {
union handle_parts parts = { .handle = handle };
void *slab = stack_slabs[parts.slabindex];
size_t offset = parts.offset << STACK_ALLOC_ALIGN;
struct stack_record *stack = slab + offset;
 
-   trace->nr_entries = trace->max_entries = stack->size;
-   trace->entries = stack->entries;
-   trace->skip = 0;
+   *entries = stack->entries;
+   return stack->size;
+}
+EXPORT_SYMBOL_GPL(stack_depot_fetch);
+
+void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace)
+{
+   unsigned int nent = stack_depot_fetch(handle, &trace->entries);
+
+   trace->max_entries = trace->nr_entries = nent;
 }
 EXPORT_SYMBOL_GPL(depot_fetch_stack);
 
 /**
- * depot_save_stack - save stack in a stack depot.
- * @trace - the stacktrace to save.
- * @alloc_flags - flags for allocating additional memory if required.
+ * stack_depot_save - Save a stack trace from an array
  *
- * Returns the handle of the stack struct stored in depot.
+ * @entries:   Pointer to storage array
+ * @nr_entries:Size of the storage array
+ * @alloc_flags:   Allocation gfp flags
+ *
+ * Returns the handle of the stack struct stored in depot
  */
-depot_stack_handle_t depot_save_stack(struct stack_trace *trace,
-   gfp_t alloc_flags)
+depot_stack_handle_t stack_depot_save(unsigned long *entries,
+ unsigned int nr_entries,
+ gfp_t alloc_flags)
 {
-   u32 hash;
-   depot_stack_handle_t retval = 0;
struct stack_record *found = NULL, **bucket;
-   unsigned long flags;
+   depot_stack_handle_t retval = 0;
struct page *page = NULL;
void *prealloc = NULL;
+   unsigned long flags;
+   u32 hash;
 
-   if (unlikely(trace->nr_entries == 0))
+   if (unlikely(nr_entries == 0))
goto fast_exit;
 
-   hash = hash_stack(trace->entries, trace->nr_entries);
+   hash = hash_stack(entries, nr_entries);
bucket = &stack_table[hash & STACK_HASH_MASK];
 
/*
@@ -235,8 +251,8 @@ depot_stack_handle_t depot_save_stack(st
 * The smp_load_acquire() here pairs with smp_store_release() to
 * |bucket| below.
 */
-   found = find_stack(smp_load_acquire(bucket), trace->entries,
-  trace->nr_entries, hash);
+   found = find_stack(smp_load_acquire(bucket), entries,
+  nr_entries, hash);
if (found)
goto exit;
 
@@ -264,10 +280,10 @@ depot_stack_handle_t depot_save_stack(st
 
spin_lock_irqsave(&depot_lock, flags);
 
-   found = find_stack(*bucket, trace->entries, trace->nr_entries, hash);
+   found = find_stack(*bucket, entries, nr_entries, hash);
if (!found) {
struct stack_record *new =
-   depot_alloc_stack(trace->entries, trace->nr_entries,
+   depot_alloc_stack(entries, nr_entries,
  hash, &prealloc, alloc_flags);
if (new) {
new->next = *bucket;
@@ -297,4 +313,16 @@ depot_stack_handle_t depot_save_stack(st
 fast_exit:
return retval;
 }
+EXPORT_SYMBOL_GPL(stack_depot_save);
+
+/**
+ * depot_save_stack - save stack in a stack depot.
+ * @trace - the stacktrace to save.
+ * @all

[patch V2 01/29] tracing: Cleanup stack trace code

2019-04-18 Thread Thomas Gleixner
- Remove the extra array member of stack_dump_trace[]. It's not required as
  the stack tracer stores at max array size - 1 entries so there is still
  an empty slot.

- Make variables which are only used in trace_stack.c static.

- Simplify the enable/disable logic.

- Rename stack_trace_print() as it's using the stack_trace_ namespace. Free
  the name up for stack trace related functions.

Signed-off-by: Thomas Gleixner 
Cc: Steven Rostedt 
---
V2: Add more cleanups and use print_max_stack() as requested by Steven.
---
 include/linux/ftrace.h |   18 --
 kernel/trace/trace_stack.c |   36 
 2 files changed, 16 insertions(+), 38 deletions(-)

--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -241,21 +241,11 @@ static inline void ftrace_free_mem(struc
 
 #ifdef CONFIG_STACK_TRACER
 
-#define STACK_TRACE_ENTRIES 500
-
-struct stack_trace;
-
-extern unsigned stack_trace_index[];
-extern struct stack_trace stack_trace_max;
-extern unsigned long stack_trace_max_size;
-extern arch_spinlock_t stack_trace_max_lock;
-
 extern int stack_tracer_enabled;
-void stack_trace_print(void);
-int
-stack_trace_sysctl(struct ctl_table *table, int write,
-  void __user *buffer, size_t *lenp,
-  loff_t *ppos);
+
+int stack_trace_sysctl(struct ctl_table *table, int write,
+  void __user *buffer, size_t *lenp,
+  loff_t *ppos);
 
 /* DO NOT MODIFY THIS VARIABLE DIRECTLY! */
 DECLARE_PER_CPU(int, disable_stack_tracer);
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,8 +18,10 @@
 
 #include "trace.h"
 
-static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES + 1];
-unsigned stack_trace_index[STACK_TRACE_ENTRIES];
+#define STACK_TRACE_ENTRIES 500
+
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
+static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
 
 /*
  * Reserve one entry for the passed in ip. This will allow
@@ -31,17 +33,16 @@ struct stack_trace stack_trace_max = {
.entries= &stack_dump_trace[0],
 };
 
-unsigned long stack_trace_max_size;
-arch_spinlock_t stack_trace_max_lock =
+static unsigned long stack_trace_max_size;
+static arch_spinlock_t stack_trace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 
 DEFINE_PER_CPU(int, disable_stack_tracer);
 static DEFINE_MUTEX(stack_sysctl_mutex);
 
 int stack_tracer_enabled;
-static int last_stack_tracer_enabled;
 
-void stack_trace_print(void)
+static void print_max_stack(void)
 {
long i;
int size;
@@ -61,16 +62,7 @@ void stack_trace_print(void)
}
 }
 
-/*
- * When arch-specific code overrides this function, the following
- * data should be filled up, assuming stack_trace_max_lock is held to
- * prevent concurrent updates.
- * stack_trace_index[]
- * stack_trace_max
- * stack_trace_max_size
- */
-void __weak
-check_stack(unsigned long ip, unsigned long *stack)
+static void check_stack(unsigned long ip, unsigned long *stack)
 {
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
@@ -179,7 +171,7 @@ check_stack(unsigned long ip, unsigned l
stack_trace_max.nr_entries = x;
 
if (task_stack_end_corrupted(current)) {
-   stack_trace_print();
+   print_max_stack();
BUG();
}
 
@@ -412,23 +404,20 @@ stack_trace_sysctl(struct ctl_table *tab
   void __user *buffer, size_t *lenp,
   loff_t *ppos)
 {
-   int ret;
+   int ret, was_enabled;
 
mutex_lock(&stack_sysctl_mutex);
+   was_enabled = !!stack_tracer_enabled;
 
ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
-   if (ret || !write ||
-   (last_stack_tracer_enabled == !!stack_tracer_enabled))
+   if (ret || !write || (was_enabled == !!stack_tracer_enabled))
goto out;
 
-   last_stack_tracer_enabled = !!stack_tracer_enabled;
-
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
else
unregister_ftrace_function(&trace_ops);
-
  out:
mutex_unlock(&stack_sysctl_mutex);
return ret;
@@ -444,7 +433,6 @@ static __init int enable_stacktrace(char
strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
 
stack_tracer_enabled = 1;
-   last_stack_tracer_enabled = 1;
return 1;
 }
 __setup("stacktrace", enable_stacktrace);




[patch V2 05/29] proc: Simplify task stack retrieval

2019-04-18 Thread Thomas Gleixner
Replace the indirection through struct stack_trace with an invocation of
the storage array based interface.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Alexey Dobriyan 
Cc: Andrew Morton 
---
 fs/proc/base.c |   14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -407,7 +407,6 @@ static void unlock_trace(struct task_str
 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
  struct pid *pid, struct task_struct *task)
 {
-   struct stack_trace trace;
unsigned long *entries;
int err;
 
@@ -430,20 +429,17 @@ static int proc_pid_stack(struct seq_fil
if (!entries)
return -ENOMEM;
 
-   trace.nr_entries= 0;
-   trace.max_entries   = MAX_STACK_TRACE_DEPTH;
-   trace.entries   = entries;
-   trace.skip  = 0;
-
err = lock_trace(task);
if (!err) {
-   unsigned int i;
+   unsigned int i, nr_entries;
 
-   save_stack_trace_tsk(task, &trace);
+   nr_entries = stack_trace_save_tsk(task, entries,
+ MAX_STACK_TRACE_DEPTH, 0);
 
-   for (i = 0; i < trace.nr_entries; i++) {
+   for (i = 0; i < nr_entries; i++) {
seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
}
+
unlock_trace(task);
}
kfree(entries);




[patch V2 07/29] mm/slub: Simplify stack trace retrieval

2019-04-18 Thread Thomas Gleixner
Replace the indirection through struct stack_trace with an invocation of
the storage array based interface.

Signed-off-by: Thomas Gleixner 
Cc: Andrew Morton 
Cc: Pekka Enberg 
Cc: linux...@kvack.org
Cc: David Rientjes 
Cc: Christoph Lameter 
---
 mm/slub.c |   12 
 1 file changed, 4 insertions(+), 8 deletions(-)

--- a/mm/slub.c
+++ b/mm/slub.c
@@ -552,18 +552,14 @@ static void set_track(struct kmem_cache
 
if (addr) {
 #ifdef CONFIG_STACKTRACE
-   struct stack_trace trace;
+   unsigned int nr_entries;
 
-   trace.nr_entries = 0;
-   trace.max_entries = TRACK_ADDRS_COUNT;
-   trace.entries = p->addrs;
-   trace.skip = 3;
metadata_access_enable();
-   save_stack_trace(&trace);
+   nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
metadata_access_disable();
 
-   if (trace.nr_entries < TRACK_ADDRS_COUNT)
-   p->addrs[trace.nr_entries] = 0;
+   if (nr_entries < TRACK_ADDRS_COUNT)
+   p->addrs[nr_entries] = 0;
 #endif
p->addr = addr;
p->cpu = smp_processor_id();




[patch V2 11/29] fault-inject: Simplify stacktrace retrieval

2019-04-18 Thread Thomas Gleixner
Replace the indirection through struct stack_trace with an invocation of
the storage array based interface.

Signed-off-by: Thomas Gleixner 
Cc: Akinobu Mita 
---
 lib/fault-inject.c |   12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -65,22 +65,16 @@ static bool fail_task(struct fault_attr
 
 static bool fail_stacktrace(struct fault_attr *attr)
 {
-   struct stack_trace trace;
int depth = attr->stacktrace_depth;
unsigned long entries[MAX_STACK_TRACE_DEPTH];
-   int n;
+   int n, nr_entries;
bool found = (attr->require_start == 0 && attr->require_end == 
ULONG_MAX);
 
if (depth == 0)
return found;
 
-   trace.nr_entries = 0;
-   trace.entries = entries;
-   trace.max_entries = depth;
-   trace.skip = 1;
-
-   save_stack_trace(&trace);
-   for (n = 0; n < trace.nr_entries; n++) {
+   nr_entries = stack_trace_save(entries, depth, 1);
+   for (n = 0; n < nr_entries; n++) {
if (attr->reject_start <= entries[n] &&
   entries[n] < attr->reject_end)
return false;



