Re: [PATCH v2] arm64: Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC

2018-02-21 Thread Will Deacon
On Wed, Feb 21, 2018 at 07:10:34AM -0600, Shanker Donthineni wrote:
> On 02/21/2018 05:12 AM, Catalin Marinas wrote:
> > However, my worry is that in an implementation with DIC set, we also
> > skip the DSB/ISB sequence in the invalidate_icache_by_line macro. For
> > example, in an implementation with transparent PoU, we could have:
> > 
> > str , [addr]
> > // no cache maintenance or barrier
> > br  
> > 
> 
> Thanks for pointing out the missing barriers. I think it make sense to follow
> the existing barrier semantics in order to avoid the unknown things.
>  
> > Is an ISB required between the instruction store and execution? I would
> > say yes but maybe Will has a better opinion here.
> > 
> Agree, an ISB is required especially for self-modifying code. I'll include in 
> v3 patch. 

I'd have thought you'd need a DSB too, before the ISB.

Will
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2] arm64: Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC

2018-02-21 Thread Shanker Donthineni
Hi Catalin,

On 02/21/2018 05:12 AM, Catalin Marinas wrote:
> On Mon, Feb 19, 2018 at 08:59:06PM -0600, Shanker Donthineni wrote:
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index f55fe5b..4061210 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -1095,6 +1095,27 @@ config ARM64_RAS_EXTN
>>and access the new registers if the system supports the extension.
>>Platform RAS features may additionally depend on firmware support.
>>  
>> +config ARM64_CACHE_IDC
>> +bool "Enable support for DCache clean PoU optimization"
>> +default y
>> +help
>> +  The data cache clean to the point of unification is not required
>> +  for instruction to be data coherence if CTR_EL0.IDC has value 1.
>> +
>> +  Selecting this feature will allow the kernel to optimize the POU
>> +  cache maintaince operations where it requires 'DC CVAU'.
>> +
>> +config ARM64_CACHE_DIC
>> +bool "Enable support for ICache invalidation PoU optimization"
>> +default y
>> +help
>> +  Instruction cache invalidation to the point of unification is not
>> +  required for instruction to be data coherence if CTR_EL0.DIC has
>> +  value 1.
>> +
>> +  Selecting this feature will allow the kernel to optimize the POU
>> +  cache maintaince operations where it requires 'IC IVAU'.
> 
> A single Kconfig entry is sufficient for both features.
> 

I'll do in v3 patch.

>> @@ -864,6 +864,22 @@ static bool has_no_fpsimd(const struct 
>> arm64_cpu_capabilities *entry, int __unus
>>  ID_AA64PFR0_FP_SHIFT) < 0;
>>  }
>>  
>> +#ifdef CONFIG_ARM64_CACHE_IDC
>> +static bool has_cache_idc(const struct arm64_cpu_capabilities *entry,
>> +  int __unused)
>> +{
>> +return !!(read_sanitised_ftr_reg(SYS_CTR_EL0) & (1UL << CTR_IDC_SHIFT));
>> +}
>> +#endif
>> +
>> +#ifdef CONFIG_ARM64_CACHE_DIC
>> +static bool has_cache_dic(const struct arm64_cpu_capabilities *entry,
>> +  int __unused)
>> +{
>> +return !!(read_sanitised_ftr_reg(SYS_CTR_EL0) & (1UL << CTR_DIC_SHIFT));
>> +}
>> +#endif
> 
> Nitpick: no need for !! since the function type is bool already.
> 

Sure, I'll remove '!!'.

>> diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
>> index 758bde7..7d37d71 100644
>> --- a/arch/arm64/mm/cache.S
>> +++ b/arch/arm64/mm/cache.S
>> @@ -50,6 +50,9 @@ ENTRY(flush_icache_range)
>>   */
>>  ENTRY(__flush_cache_user_range)
>>  uaccess_ttbr0_enable x2, x3, x4
>> +alternative_if ARM64_HAS_CACHE_IDC
>> +b   8f
>> +alternative_else_nop_endif
>>  dcache_line_size x2, x3
>>  sub x3, x2, #1
>>  bic x4, x0, x3
>> @@ -60,6 +63,11 @@ user_alt 9f, "dc cvau, x4",  "dc civac, x4",  
>> ARM64_WORKAROUND_CLEAN_CACHE
>>  b.lo1b
>>  dsb ish
>>  
>> +8:
>> +alternative_if ARM64_HAS_CACHE_DIC
>> +mov x0, #0
>> +b   1f
>> +alternative_else_nop_endif
>>  invalidate_icache_by_line x0, x1, x2, x3, 9f
>>  mov x0, #0
>>  1:
> 
> You can add another label at mov x0, #0 below this hunk and keep a
> single instruction in the alternative path.
> 
> However, my worry is that in an implementation with DIC set, we also
> skip the DSB/ISB sequence in the invalidate_icache_by_line macro. For
> example, in an implementation with transparent PoU, we could have:
> 
>   str , [addr]
>   // no cache maintenance or barrier
>   br  
> 

Thanks for pointing out the missing barriers. I think it make sense to follow
the existing barrier semantics in order to avoid the unknown things.
 
> Is an ISB required between the instruction store and execution? I would
> say yes but maybe Will has a better opinion here.
> 
Agree, an ISB is required especially for self-modifying code. I'll include in 
v3 patch. 

-- 
Shanker Donthineni
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux 
Foundation Collaborative Project.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2] arm64: Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC

2018-02-21 Thread Catalin Marinas
On Mon, Feb 19, 2018 at 08:59:06PM -0600, Shanker Donthineni wrote:
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index f55fe5b..4061210 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1095,6 +1095,27 @@ config ARM64_RAS_EXTN
> and access the new registers if the system supports the extension.
> Platform RAS features may additionally depend on firmware support.
>  
> +config ARM64_CACHE_IDC
> + bool "Enable support for DCache clean PoU optimization"
> + default y
> + help
> +   The data cache clean to the point of unification is not required
> +   for instruction to be data coherence if CTR_EL0.IDC has value 1.
> +
> +   Selecting this feature will allow the kernel to optimize the POU
> +   cache maintaince operations where it requires 'DC CVAU'.
> +
> +config ARM64_CACHE_DIC
> + bool "Enable support for ICache invalidation PoU optimization"
> + default y
> + help
> +   Instruction cache invalidation to the point of unification is not
> +   required for instruction to be data coherence if CTR_EL0.DIC has
> +   value 1.
> +
> +   Selecting this feature will allow the kernel to optimize the POU
> +   cache maintaince operations where it requires 'IC IVAU'.

A single Kconfig entry is sufficient for both features.

> @@ -864,6 +864,22 @@ static bool has_no_fpsimd(const struct 
> arm64_cpu_capabilities *entry, int __unus
>   ID_AA64PFR0_FP_SHIFT) < 0;
>  }
>  
> +#ifdef CONFIG_ARM64_CACHE_IDC
> +static bool has_cache_idc(const struct arm64_cpu_capabilities *entry,
> +   int __unused)
> +{
> + return !!(read_sanitised_ftr_reg(SYS_CTR_EL0) & (1UL << CTR_IDC_SHIFT));
> +}
> +#endif
> +
> +#ifdef CONFIG_ARM64_CACHE_DIC
> +static bool has_cache_dic(const struct arm64_cpu_capabilities *entry,
> +   int __unused)
> +{
> + return !!(read_sanitised_ftr_reg(SYS_CTR_EL0) & (1UL << CTR_DIC_SHIFT));
> +}
> +#endif

Nitpick: no need for !! since the function type is bool already.

> diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
> index 758bde7..7d37d71 100644
> --- a/arch/arm64/mm/cache.S
> +++ b/arch/arm64/mm/cache.S
> @@ -50,6 +50,9 @@ ENTRY(flush_icache_range)
>   */
>  ENTRY(__flush_cache_user_range)
>   uaccess_ttbr0_enable x2, x3, x4
> +alternative_if ARM64_HAS_CACHE_IDC
> + b   8f
> +alternative_else_nop_endif
>   dcache_line_size x2, x3
>   sub x3, x2, #1
>   bic x4, x0, x3
> @@ -60,6 +63,11 @@ user_alt 9f, "dc cvau, x4",  "dc civac, x4",  
> ARM64_WORKAROUND_CLEAN_CACHE
>   b.lo1b
>   dsb ish
>  
> +8:
> +alternative_if ARM64_HAS_CACHE_DIC
> + mov x0, #0
> + b   1f
> +alternative_else_nop_endif
>   invalidate_icache_by_line x0, x1, x2, x3, 9f
>   mov x0, #0
>  1:

You can add another label at mov x0, #0 below this hunk and keep a
single instruction in the alternative path.

However, my worry is that in an implementation with DIC set, we also
skip the DSB/ISB sequence in the invalidate_icache_by_line macro. For
example, in an implementation with transparent PoU, we could have:

str , [addr]
// no cache maintenance or barrier
br  

Is an ISB required between the instruction store and execution? I would
say yes but maybe Will has a better opinion here.

-- 
Catalin
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v2] arm64: Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC

2018-02-19 Thread Shanker Donthineni
The DCache clean & ICache invalidation requirements for instructions
to be data coherence are discoverable through new fields in CTR_EL0.
The following two control bits DIC and IDC were defined for this
purpose. No need to perform point of unification cache maintenance
operations from software on systems where CPU caches are transparent.

This patch optimize the three functions __flush_cache_user_range(),
clean_dcache_area_pou() and invalidate_icache_range() if the hardware
reports CTR_EL0.IDC and/or CTR_EL0.IDC. Basically it skips the two
instructions 'DC CVAU' and 'IC IVAU', and the associated loop logic
in order to avoid the unnecessary overhead.

CTR_EL0.DIC: Instruction cache invalidation requirements for
 instruction to data coherence. The meaning of this bit[29].
  0: Instruction cache invalidation to the point of unification
 is required for instruction to data coherence.
  1: Instruction cache cleaning to the point of unification is
  not required for instruction to data coherence.

CTR_EL0.IDC: Data cache clean requirements for instruction to data
 coherence. The meaning of this bit[28].
  0: Data cache clean to the point of unification is required for
 instruction to data coherence, unless CLIDR_EL1.LoC == 0b000
 or (CLIDR_EL1.LoUIS == 0b000 && CLIDR_EL1.LoUU == 0b000).
  1: Data cache clean to the point of unification is not required
 for instruction to data coherence.

Signed-off-by: Philip Elcan 
Signed-off-by: Shanker Donthineni 
---
Changes since v1:
  -Reworded commit text.
  -Used the alternatives framework as Catalin suggested.
  -Rebased on top of https://patchwork.kernel.org/patch/10227927/

 arch/arm64/Kconfig   | 21 +++
 arch/arm64/include/asm/cache.h   |  5 +
 arch/arm64/include/asm/cpucaps.h |  4 +++-
 arch/arm64/kernel/cpufeature.c   | 44 ++--
 arch/arm64/mm/cache.S| 15 ++
 5 files changed, 82 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f55fe5b..4061210 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1095,6 +1095,27 @@ config ARM64_RAS_EXTN
  and access the new registers if the system supports the extension.
  Platform RAS features may additionally depend on firmware support.
 
+config ARM64_CACHE_IDC
+   bool "Enable support for DCache clean PoU optimization"
+   default y
+   help
+ The data cache clean to the point of unification is not required
+ for instruction to be data coherence if CTR_EL0.IDC has value 1.
+
+ Selecting this feature will allow the kernel to optimize the POU
+ cache maintaince operations where it requires 'DC CVAU'.
+
+config ARM64_CACHE_DIC
+   bool "Enable support for ICache invalidation PoU optimization"
+   default y
+   help
+ Instruction cache invalidation to the point of unification is not
+ required for instruction to be data coherence if CTR_EL0.DIC has
+ value 1.
+
+ Selecting this feature will allow the kernel to optimize the POU
+ cache maintaince operations where it requires 'IC IVAU'.
+
 endmenu
 
 config ARM64_SVE
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index ea9bb4e..e22178b 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -20,8 +20,13 @@
 
 #define CTR_L1IP_SHIFT 14
 #define CTR_L1IP_MASK  3
+#define CTR_DMLINE_SHIFT   16
+#define CTR_ERG_SHIFT  20
 #define CTR_CWG_SHIFT  24
 #define CTR_CWG_MASK   15
+#define CTR_IDC_SHIFT  28
+#define CTR_DIC_SHIFT  29
+#define CTR_B31_SHIFT  31
 
 #define CTR_L1IP(ctr)  (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)
 
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index bb26382..8dd42ae 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -45,7 +45,9 @@
 #define ARM64_HARDEN_BRANCH_PREDICTOR  24
 #define ARM64_HARDEN_BP_POST_GUEST_EXIT25
 #define ARM64_HAS_RAS_EXTN 26
+#define ARM64_HAS_CACHE_IDC27
+#define ARM64_HAS_CACHE_DIC28
 
-#define ARM64_NCAPS27
+#define ARM64_NCAPS29
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index ff8a6e9..53a7266 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -199,12 +199,12 @@ static int __init register_cpu_hwcaps_dumper(void)
 };
 
 static const struct arm64_ftr_bits ftr_ctr[] = {
-   ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1),   
/* RES1 */
-   ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 29, 1, 1),  
/* DIC */
-   ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 28, 1, 1),  
/* IDC */