[PATCH] x86/efi: Do not release sub-1MB memory regions when the crashkernel option is specified

2021-04-07 Thread Lianbo Jiang
Some sub-1MB memory regions may be reserved by EFI boot services, and
these memory regions will be released later in efi_free_boot_services().

Currently, the kernel is supposed to reserve all sub-1MB memory regions
when the crashkernel option is specified. Unfortunately, EFI boot services
may have already reserved some sub-1MB memory regions before the
crash_reserve_low_1M() is called, so the crash_reserve_low_1M() ends up
owning only the remaining sub-1MB memory regions, not all of them, and
EFI boot services will subsequently free the sub-1MB regions it reserved.
Eventually, DMA is able to allocate memory from the sub-1MB area, which
causes the following error:

crash> kmem -s |grep invalid
kmem: dma-kmalloc-512: slab: ffffd52c40001900 invalid freepointer: ffff9403c0067300
kmem: dma-kmalloc-512: slab: ffffd52c40001900 invalid freepointer: ffff9403c0067300
crash> vtop ffff9403c0067300
VIRTUAL           PHYSICAL
ffff9403c0067300  67300   ---> The physical address falls into this range [0x00063000-0x0008efff]

kernel debugging log:
...
[0.008927] memblock_reserve: [0x00010000-0x00013fff] efi_reserve_boot_services+0x85/0xd0
[0.008930] memblock_reserve: [0x00063000-0x0008efff] efi_reserve_boot_services+0x85/0xd0
...
[0.009425] memblock_reserve: [0x00000000-0x000fffff] crash_reserve_low_1M+0x2c/0x49
...
[0.010586] Zone ranges:
[0.010587]   DMA      [mem 0x0000000000001000-0x0000000000ffffff]
[0.010589]   DMA32    [mem 0x0000000001000000-0x00000000ffffffff]
[0.010591]   Normal   [mem 0x0001-0x000c7fff]
[0.010593]   Device   empty
...
[8.814894] __memblock_free_late: [0x00063000-0x0008efff] efi_free_boot_services+0x14b/0x23b
[8.815793] __memblock_free_late: [0x00010000-0x00013fff] efi_free_boot_services+0x14b/0x23b

Do not release sub-1MB memory regions even though they are reserved by
EFI boot services, so that all sub-1MB memory regions always stay
reserved when the crashkernel option is specified.
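
For context, the crash_reserve_low_1M() mentioned above reserves the whole
low 1M via memblock when the crashkernel option is present. A minimal
sketch from memory, not copied verbatim from the tree:

	void __init crash_reserve_low_1M(void)
	{
		if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0)
			return;

		memblock_reserve(0, 1 << 20);
		pr_info("Reserving the low 1M of memory for crashkernel\n");
	}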

Signed-off-by: Lianbo Jiang 
---
 arch/x86/platform/efi/quirks.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 67d93a243c35..637f932c4fd4 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include <asm/cmdline.h>
 
 #define EFI_MIN_RESERVE 5120
 
@@ -303,6 +304,19 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
  */
 static __init bool can_free_region(u64 start, u64 size)
 {
+   /*
+    * Some sub-1MB memory regions may be reserved by EFI boot
+    * services, and these memory regions will be released later
+    * in efi_free_boot_services().
+    *
+    * Do not release sub-1MB memory regions even though they are
+    * reserved by EFI boot services, because all sub-1MB memory
+    * must stay reserved when the crashkernel option is specified.
+    */
+   if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) > 0 &&
+       (start + size < (1<<20)))
+       return false;
+
if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end))
return false;
 
-- 
2.17.1



[PATCH v3 2/2] iommu: use the __iommu_attach_device() directly for deferred attach

2021-01-26 Thread Lianbo Jiang
Currently, domain attach can be deferred from the iommu driver to the
device driver, and when the iommu initializes, the devices on the bus
are scanned and the default groups are allocated.

Due to the above, some devices can be added to the same group, as below:

[3.859417] pci 0000:01:00.0: Adding to iommu group 16
[3.864572] pci 0000:01:00.1: Adding to iommu group 16
[3.869738] pci 0000:02:00.0: Adding to iommu group 17
[3.874892] pci 0000:02:00.1: Adding to iommu group 17

But attaching these devices is not allowed when a group has more than
one device; an error is returned instead. This conflicts with deferred
attaching. Unfortunately, there are two devices in the same group on my
machine, for example:

[9.627014] iommu_group_device_count(): device name[0]:0000:01:00.0
[9.633545] iommu_group_device_count(): device name[1]:0000:01:00.1
...
[   10.255609] iommu_group_device_count(): device name[0]:0000:02:00.0
[   10.262144] iommu_group_device_count(): device name[1]:0000:02:00.1

This finally causes the tg3 driver to fail when it calls the
dma_alloc_coherent() to allocate coherent memory in the tg3_test_dma().

[9.660310] tg3 0000:01:00.0: DMA engine test failed, aborting
[9.754085] tg3: probe of 0000:01:00.0 failed with error -12
[9.997512] tg3 0000:01:00.1: DMA engine test failed, aborting
[   10.043053] tg3: probe of 0000:01:00.1 failed with error -12
[   10.288905] tg3 0000:02:00.0: DMA engine test failed, aborting
[   10.334070] tg3: probe of 0000:02:00.0 failed with error -12
[   10.578303] tg3 0000:02:00.1: DMA engine test failed, aborting
[   10.622629] tg3: probe of 0000:02:00.1 failed with error -12

In addition, similar situations also occur in other drivers, such as
the bnxt_en driver. This can be reproduced easily in the kdump kernel
when SME is active.

Let's move the handling currently in iommu_dma_deferred_attach() into
the iommu core code so that it can call the __iommu_attach_device()
directly instead of the iommu_attach_device(). The external interface
iommu_attach_device() is not suitable for handling this situation.
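
The include/linux/iommu.h hunk is not shown below; given the one-line
diffstat and the v3 changelog note about dropping "extern", the added
declaration presumably reads:

	int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain);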

Signed-off-by: Lianbo Jiang 
---
 drivers/iommu/dma-iommu.c | 18 +++---
 drivers/iommu/iommu.c | 10 ++
 include/linux/iommu.h |  1 +
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c80056f6c9f9..f659395e7959 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -380,18 +380,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
return iova_reserve_iommu_regions(dev, domain);
 }
 
-static int iommu_dma_deferred_attach(struct device *dev,
-   struct iommu_domain *domain)
-{
-   const struct iommu_ops *ops = domain->ops;
-
-   if (unlikely(ops->is_attach_deferred &&
-   ops->is_attach_deferred(domain, dev)))
-   return iommu_attach_device(domain, dev);
-
-   return 0;
-}
-
 /**
  * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
  *page flags.
@@ -535,7 +523,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
dma_addr_t iova;
 
if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-   iommu_dma_deferred_attach(dev, domain))
+   iommu_deferred_attach(dev, domain))
return DMA_MAPPING_ERROR;
 
size = iova_align(iovad, size + iova_off);
@@ -694,7 +682,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
*dma_handle = DMA_MAPPING_ERROR;
 
if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-   iommu_dma_deferred_attach(dev, domain))
+   iommu_deferred_attach(dev, domain))
return NULL;
 
min_size = alloc_sizes & -alloc_sizes;
@@ -978,7 +966,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
int i;
 
if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-   iommu_dma_deferred_attach(dev, domain))
+   iommu_deferred_attach(dev, domain))
return 0;
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ffeebda8d6de..15b5fd6bd554 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1980,6 +1980,16 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device);
 
+int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
+{
+   const struct iommu_ops *ops = domain->ops;
+
+   if (ops->is_attach_deferred && ops->is_attach_deferred(domain, dev))
+   return __iommu_attach_device(domain, dev);
+
+   return 0;
+}
+
 /*
  * Check flags and other user provided data for valid comb

[PATCH v3 1/2] dma-iommu: use static-key to minimize the impact in the fast-path

2021-01-26 Thread Lianbo Jiang
Let's move the is_kdump_kernel() check out of iommu_dma_deferred_attach()
and into iommu_dma_init(), and use a static key in the fast path to
minimize the impact in the normal case.
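
The static-key pattern keeps this check out of the hot path entirely: the
branch compiles to a NOP until the key is flipped at boot. A self-contained
sketch of the pattern (example_key and do_rare_work() are made-up names,
not part of this patch):

	#include <linux/jump_label.h>
	#include <linux/crash_dump.h>

	static DEFINE_STATIC_KEY_FALSE(example_key);

	static void do_rare_work(void) { }	/* hypothetical slow path */

	static void fast_path(void)
	{
		/* No load/compare here in the common case; the branch is
		 * patched in at runtime when the key is enabled. */
		if (static_branch_unlikely(&example_key))
			do_rare_work();
	}

	static int __init example_init(void)
	{
		if (is_kdump_kernel())
			static_branch_enable(&example_key);
		return 0;
	}
	arch_initcall(example_init);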

Signed-off-by: Lianbo Jiang 
Co-developed-by: Robin Murphy 
Signed-off-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 4078358ed66e..c80056f6c9f9 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -51,6 +51,8 @@ struct iommu_dma_cookie {
struct iommu_domain *fq_domain;
 };
 
+static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled);
+
 void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
struct iommu_domain *domain)
 {
@@ -383,9 +385,6 @@ static int iommu_dma_deferred_attach(struct device *dev,
 {
const struct iommu_ops *ops = domain->ops;
 
-   if (!is_kdump_kernel())
-   return 0;
-
if (unlikely(ops->is_attach_deferred &&
ops->is_attach_deferred(domain, dev)))
return iommu_attach_device(domain, dev);
@@ -535,7 +534,8 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
size_t iova_off = iova_offset(iovad, phys);
dma_addr_t iova;
 
-   if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+   if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
+   iommu_dma_deferred_attach(dev, domain))
return DMA_MAPPING_ERROR;
 
size = iova_align(iovad, size + iova_off);
@@ -693,7 +693,8 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
 
*dma_handle = DMA_MAPPING_ERROR;
 
-   if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+   if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
+   iommu_dma_deferred_attach(dev, domain))
return NULL;
 
min_size = alloc_sizes & -alloc_sizes;
@@ -976,7 +977,8 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
unsigned long mask = dma_get_seg_boundary(dev);
int i;
 
-   if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+   if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
+   iommu_dma_deferred_attach(dev, domain))
return 0;
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
@@ -1424,6 +1426,9 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc,
 
 static int iommu_dma_init(void)
 {
+   if (is_kdump_kernel())
+   static_branch_enable(&iommu_deferred_attach_enabled);
+
return iova_cache_get();
 }
 arch_initcall(iommu_dma_init);
-- 
2.17.1



[PATCH v3 0/2] iommu: fix the failure of deferred attach for iommu attach device

2021-01-26 Thread Lianbo Jiang
This patch set fixes the failure of deferred attach for iommu attach
device; it includes the following two patches:

[1] [PATCH 1/2] dma-iommu: use static-key to minimize the impact in the fast-path
This is a preparatory patch for the second one: move the is_kdump_kernel()
check out of iommu_dma_deferred_attach() and into iommu_dma_init(), and use
a static key in the fast path to minimize the impact in the normal case.

[2] [PATCH 2/2] iommu: use the __iommu_attach_device() directly for deferred attach
Move the handling currently in iommu_dma_deferred_attach() into the
iommu core code so that it can call the __iommu_attach_device()
directly instead of the iommu_attach_device(). The external interface
iommu_attach_device() is not suitable for handling this situation.

Changes since v1:
[1] use the __iommu_attach_device() directly for deferred attach
[2] use static-key to minimize the impact in the fast-path

Changes since v2:
[1] remove the underscores for the variable "__deferred_attach", and change
its name to iommu_deferred_attach_enabled [Suggested by Christoph Hellwig]
[2] remove the "do_" from the iommu_do_deferred_attach(), and change its
name to iommu_deferred_attach()
[3] remove the "extern" from the definition of iommu_deferred_attach() in
include/linux/iommu.h

Lianbo Jiang (2):
  dma-iommu: use static-key to minimize the impact in the fast-path
  iommu: use the __iommu_attach_device() directly for deferred attach

 drivers/iommu/dma-iommu.c | 29 +++--
 drivers/iommu/iommu.c | 10 ++
 include/linux/iommu.h |  1 +
 3 files changed, 22 insertions(+), 18 deletions(-)

-- 
2.17.1



[PATCH 0/2 v2] iommu: fix the failure of deferred attach for iommu attach device

2021-01-19 Thread Lianbo Jiang
This patch set fixes the failure of deferred attach for iommu attach
device; it includes the following two patches:

[1] [PATCH 1/2] dma-iommu: use static-key to minimize the impact in the fast-path
This is a preparatory patch for the second one: move the is_kdump_kernel()
check out of iommu_dma_deferred_attach() and into iommu_dma_init(), and use
a static key in the fast path to minimize the impact in the normal case.

[2] [PATCH 2/2] iommu: use the __iommu_attach_device() directly for deferred attach
Move the handling currently in iommu_dma_deferred_attach() into the
iommu core code so that it can call the __iommu_attach_device()
directly instead of the iommu_attach_device(). The external interface
iommu_attach_device() is not suitable for handling this situation.

Changes since v1:
[1] use the __iommu_attach_device() directly for deferred attach
[2] use static-key to minimize the impact in the fast-path

Lianbo Jiang (2):
  dma-iommu: use static-key to minimize the impact in the fast-path
  iommu: use the __iommu_attach_device() directly for deferred attach

 drivers/iommu/dma-iommu.c | 29 +++--
 drivers/iommu/iommu.c | 12 
 include/linux/iommu.h |  2 ++
 3 files changed, 25 insertions(+), 18 deletions(-)

-- 
2.17.1



[PATCH 2/2 v2] iommu: use the __iommu_attach_device() directly for deferred attach

2021-01-19 Thread Lianbo Jiang
Currently, domain attach can be deferred from the iommu driver to the
device driver, and when the iommu initializes, the devices on the bus
are scanned and the default groups are allocated.

Due to the above, some devices can be added to the same group, as below:

[3.859417] pci 0000:01:00.0: Adding to iommu group 16
[3.864572] pci 0000:01:00.1: Adding to iommu group 16
[3.869738] pci 0000:02:00.0: Adding to iommu group 17
[3.874892] pci 0000:02:00.1: Adding to iommu group 17

But attaching these devices is not allowed when a group has more than
one device; an error is returned instead. This conflicts with deferred
attaching. Unfortunately, there are two devices in the same group on my
machine, for example:

[9.627014] iommu_group_device_count(): device name[0]:0000:01:00.0
[9.633545] iommu_group_device_count(): device name[1]:0000:01:00.1
...
[   10.255609] iommu_group_device_count(): device name[0]:0000:02:00.0
[   10.262144] iommu_group_device_count(): device name[1]:0000:02:00.1

This finally causes the tg3 driver to fail when it calls the
dma_alloc_coherent() to allocate coherent memory in the tg3_test_dma().

[9.660310] tg3 0000:01:00.0: DMA engine test failed, aborting
[9.754085] tg3: probe of 0000:01:00.0 failed with error -12
[9.997512] tg3 0000:01:00.1: DMA engine test failed, aborting
[   10.043053] tg3: probe of 0000:01:00.1 failed with error -12
[   10.288905] tg3 0000:02:00.0: DMA engine test failed, aborting
[   10.334070] tg3: probe of 0000:02:00.0 failed with error -12
[   10.578303] tg3 0000:02:00.1: DMA engine test failed, aborting
[   10.622629] tg3: probe of 0000:02:00.1 failed with error -12

In addition, similar situations also occur in other drivers, such as
the bnxt_en driver. This can be reproduced easily in the kdump kernel
when SME is active.

Let's move the handling currently in iommu_dma_deferred_attach() into
the iommu core code so that it can call the __iommu_attach_device()
directly instead of the iommu_attach_device(). The external interface
iommu_attach_device() is not suitable for handling this situation.

Signed-off-by: Lianbo Jiang 
---
 drivers/iommu/dma-iommu.c | 18 +++---
 drivers/iommu/iommu.c | 12 
 include/linux/iommu.h |  2 ++
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 3711b4a6e4f9..fa6f9098e77d 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -380,18 +380,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
return iova_reserve_iommu_regions(dev, domain);
 }
 
-static int iommu_dma_deferred_attach(struct device *dev,
-   struct iommu_domain *domain)
-{
-   const struct iommu_ops *ops = domain->ops;
-
-   if (unlikely(ops->is_attach_deferred &&
-   ops->is_attach_deferred(domain, dev)))
-   return iommu_attach_device(domain, dev);
-
-   return 0;
-}
-
 /**
  * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
  *page flags.
@@ -535,7 +523,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
dma_addr_t iova;
 
if (static_branch_unlikely(&__deferred_attach) &&
-   iommu_dma_deferred_attach(dev, domain))
+   iommu_do_deferred_attach(dev, domain))
return DMA_MAPPING_ERROR;
 
size = iova_align(iovad, size + iova_off);
@@ -694,7 +682,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
*dma_handle = DMA_MAPPING_ERROR;
 
if (static_branch_unlikely(&__deferred_attach) &&
-   iommu_dma_deferred_attach(dev, domain))
+   iommu_do_deferred_attach(dev, domain))
return NULL;
 
min_size = alloc_sizes & -alloc_sizes;
@@ -1005,7 +993,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
int i;
 
if (static_branch_unlikely(&__deferred_attach) &&
-   iommu_dma_deferred_attach(dev, domain))
+   iommu_do_deferred_attach(dev, domain))
return 0;
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ffeebda8d6de..32164d355d2e 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1980,6 +1980,18 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device);
 
+int iommu_do_deferred_attach(struct device *dev,
+struct iommu_domain *domain)
+{
+   const struct iommu_ops *ops = domain->ops;
+
+   if (unlikely(ops->is_attach_deferred &&
+ops->is_attach_deferred(domain, dev)))
+   return __iommu_attach_device(domain, dev);
+
+   return 0;
+}

[PATCH 1/2 v2] dma-iommu: use static-key to minimize the impact in the fast-path

2021-01-19 Thread Lianbo Jiang
Let's move the is_kdump_kernel() check out of iommu_dma_deferred_attach()
and into iommu_dma_init(), and use a static key in the fast path to
minimize the impact in the normal case.

Signed-off-by: Lianbo Jiang 
Co-developed-by: Robin Murphy 
Signed-off-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f0305e6aac1b..3711b4a6e4f9 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -51,6 +51,8 @@ struct iommu_dma_cookie {
struct iommu_domain *fq_domain;
 };
 
+static DEFINE_STATIC_KEY_FALSE(__deferred_attach);
+
 void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
struct iommu_domain *domain)
 {
@@ -383,9 +385,6 @@ static int iommu_dma_deferred_attach(struct device *dev,
 {
const struct iommu_ops *ops = domain->ops;
 
-   if (!is_kdump_kernel())
-   return 0;
-
if (unlikely(ops->is_attach_deferred &&
ops->is_attach_deferred(domain, dev)))
return iommu_attach_device(domain, dev);
@@ -535,7 +534,8 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
size_t iova_off = iova_offset(iovad, phys);
dma_addr_t iova;
 
-   if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+   if (static_branch_unlikely(&__deferred_attach) &&
+   iommu_dma_deferred_attach(dev, domain))
return DMA_MAPPING_ERROR;
 
size = iova_align(iovad, size + iova_off);
@@ -693,7 +693,8 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
 
*dma_handle = DMA_MAPPING_ERROR;
 
-   if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+   if (static_branch_unlikely(&__deferred_attach) &&
+   iommu_dma_deferred_attach(dev, domain))
return NULL;
 
min_size = alloc_sizes & -alloc_sizes;
@@ -1003,7 +1004,8 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
unsigned long mask = dma_get_seg_boundary(dev);
int i;
 
-   if (unlikely(iommu_dma_deferred_attach(dev, domain)))
+   if (static_branch_unlikely(&__deferred_attach) &&
+   iommu_dma_deferred_attach(dev, domain))
return 0;
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
@@ -1451,6 +1453,9 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc,
 
 static int iommu_dma_init(void)
 {
+   if (is_kdump_kernel())
+   static_branch_enable(&__deferred_attach);
+
return iova_cache_get();
 }
 arch_initcall(iommu_dma_init);
-- 
2.17.1



[PATCH] iommu: check for the deferred attach when attaching a device

2020-12-25 Thread Lianbo Jiang
Currently, domain attach can be deferred from the iommu driver to the
device driver, and when the iommu initializes, the devices on the bus
are scanned and the default groups are allocated.

Due to the above, some devices can be added to the same group, as below:

[3.859417] pci 0000:01:00.0: Adding to iommu group 16
[3.864572] pci 0000:01:00.1: Adding to iommu group 16
[3.869738] pci 0000:02:00.0: Adding to iommu group 17
[3.874892] pci 0000:02:00.1: Adding to iommu group 17

But attaching these devices is not allowed when a group has more than
one device; an error is returned instead. This conflicts with deferred
attaching. Unfortunately, there are two devices in the same group on my
machine, for example:

[9.627014] iommu_group_device_count(): device name[0]:0000:01:00.0
[9.633545] iommu_group_device_count(): device name[1]:0000:01:00.1
...
[   10.255609] iommu_group_device_count(): device name[0]:0000:02:00.0
[   10.262144] iommu_group_device_count(): device name[1]:0000:02:00.1

This finally causes the tg3 driver to fail when it calls the
dma_alloc_coherent() to allocate coherent memory in the tg3_test_dma().

[9.660310] tg3 0000:01:00.0: DMA engine test failed, aborting
[9.754085] tg3: probe of 0000:01:00.0 failed with error -12
[9.997512] tg3 0000:01:00.1: DMA engine test failed, aborting
[   10.043053] tg3: probe of 0000:01:00.1 failed with error -12
[   10.288905] tg3 0000:02:00.0: DMA engine test failed, aborting
[   10.334070] tg3: probe of 0000:02:00.0 failed with error -12
[   10.578303] tg3 0000:02:00.1: DMA engine test failed, aborting
[   10.622629] tg3: probe of 0000:02:00.1 failed with error -12

In addition, similar situations also occur in other drivers, such as
the bnxt_en driver. This can be reproduced easily in the kdump kernel
when SME is active.

Add a check for the deferred attach in the iommu_attach_device() and
allow attaching the deferred device regardless of how many devices
are in its group.
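
The iommu_is_attach_deferred() used below is an existing helper in
drivers/iommu/iommu.c; from memory, it looks roughly like:

	static bool iommu_is_attach_deferred(struct iommu_domain *domain,
					     struct device *dev)
	{
		if (domain->ops->is_attach_deferred)
			return domain->ops->is_attach_deferred(domain, dev);

		return false;
	}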

Signed-off-by: Lianbo Jiang 
---
 drivers/iommu/iommu.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ffeebda8d6de..dccab7b133fb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1967,8 +1967,11 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev)
 */
mutex_lock(&group->mutex);
ret = -EINVAL;
-   if (iommu_group_device_count(group) != 1)
+   if (!iommu_is_attach_deferred(domain, dev) &&
+   iommu_group_device_count(group) != 1) {
+   dev_err_ratelimited(dev, "Group has more than one device\n");
goto out_unlock;
+   }
 
ret = __iommu_attach_group(domain, group);
 
-- 
2.17.1



[PATCH] docs: admin-guide: update kdump documentation due to change of crash URL

2020-09-18 Thread Lianbo Jiang
Since the crash utility has moved to GitHub, the original URL is no
longer available. Let's update it accordingly.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 Documentation/admin-guide/kdump/kdump.rst | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index 2da65fef2a1c..75a9dd98e76e 100644
--- a/Documentation/admin-guide/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -509,9 +509,12 @@ ELF32-format headers using the --elf32-core-headers kernel option on the
 dump kernel.
 
 You can also use the Crash utility to analyze dump files in Kdump
-format. Crash is available on Dave Anderson's site at the following URL:
+format. Crash is available at the following URL:
 
-   http://people.redhat.com/~anderson/
+   https://github.com/crash-utility/crash
+
+Crash documentation can be found at:
+   https://crash-utility.github.io/
 
 Trigger Kdump on WARN()
=======================
-- 
2.17.1



[tip: x86/urgent] kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges

2020-08-06 Thread tip-bot2 for Lianbo Jiang
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: a2e9a95d2190ef55bf0724ecdf8a466d393a86b6
Gitweb:        https://git.kernel.org/tip/a2e9a95d2190ef55bf0724ecdf8a466d393a86b6
Author:        Lianbo Jiang 
AuthorDate:    Tue, 04 Aug 2020 12:49:32 +08:00
Committer:     Ingo Molnar 
CommitterDate: Fri, 07 Aug 2020 01:32:00 +02:00

kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges

The crash_exclude_mem_range() function can only handle one memory region
at a time.

It will fail in the case in which the passed-in area covers several memory
regions. In this case, it will only exclude the first region, then return,
leaving the later regions untouched.

E.g. in a NEC system with two usable RAM regions inside the low 1M:

  ...
  BIOS-e820: [mem 0x0000000000000000-0x000000000003efff] usable
  BIOS-e820: [mem 0x000000000003f000-0x000000000003ffff] reserved
  BIOS-e820: [mem 0x0000000000040000-0x000000000009ffff] usable

It will only exclude the memory region [0, 0x3efff]; the memory region
[0x40000, 0x9ffff] will still be added into /proc/vmcore, which may cause
the following failure when dumping vmcore:

 ioremap on RAM at 0x00040000 - 0x00040fff
 WARNING: CPU: 0 PID: 665 at arch/x86/mm/ioremap.c:186 __ioremap_caller+0x2c7/0x2e0
 ...
 RIP: 0010:__ioremap_caller+0x2c7/0x2e0
 ...
 cp: error reading '/proc/vmcore': Cannot allocate memory
 kdump: saving vmcore failed

In order to fix this bug, let's extend the crash_exclude_mem_range()
to handle the overlapping ranges.
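
Worked through on the e820 map above (illustration only): excluding
[0x0 - 0xfffff] from the ranges { [0x0 - 0x3efff], [0x40000 - 0x9ffff] },
the old code removes [0x0 - 0x3efff] and returns immediately, leaving
[0x40000 - 0x9ffff] behind; the extended code keeps scanning from the
shifted position and removes both, leaving the range list empty.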

[ mingo: Amended the changelog. ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Ingo Molnar 
Acked-by: Dave Young 
Link: https://lore.kernel.org/r/20200804044933.1973-3-liji...@redhat.com
---
 kernel/kexec_file.c | 35 +++
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 94661d2..97fa682 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1157,24 +1157,26 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 {
int i, j;
-   unsigned long long start, end;
+   unsigned long long start, end, p_start, p_end;
struct crash_mem_range temp_range = {0, 0};
 
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
end = mem->ranges[i].end;
+   p_start = mstart;
+   p_end = mend;
 
if (mstart > end || mend < start)
continue;
 
/* Truncate any area outside of range */
if (mstart < start)
-   mstart = start;
+   p_start = start;
if (mend > end)
-   mend = end;
+   p_end = end;
 
/* Found completely overlapping range */
-   if (mstart == start && mend == end) {
+   if (p_start == start && p_end == end) {
mem->ranges[i].start = 0;
mem->ranges[i].end = 0;
if (i < mem->nr_ranges - 1) {
@@ -1185,20 +1187,29 @@ int crash_exclude_mem_range(struct crash_mem *mem,
mem->ranges[j].end =
mem->ranges[j+1].end;
}
+
+   /*
+    * Continue to check if there are other overlapping ranges
+    * from the current position, because the above mem ranges
+    * have been shifted.
+    */
+   i--;
+   mem->nr_ranges--;
+   continue;
}
mem->nr_ranges--;
return 0;
}
 
-   if (mstart > start && mend < end) {
+   if (p_start > start && p_end < end) {
/* Split original range */
-   mem->ranges[i].end = mstart - 1;
-   temp_range.start = mend + 1;
+   mem->ranges[i].end = p_start - 1;
+   temp_range.start = p_end + 1;
temp_range.end = end;
-   } else if (mstart != start)
-   mem->ranges[i].end = mstart - 1;
+   } else if (p_start != start)
+   mem->ranges[i].end = p_start - 1;
else
-   mem->ranges[i].start = mend + 1;
+   mem->ranges[i].start = p_end + 1;
break;
}
 
@@ -1243,7 +1254,7 @@ int crash_prepare_elf64_headers(st

[tip: x86/urgent] x86/crash: Correct the address boundary of function parameters

2020-08-06 Thread tip-bot2 for Lianbo Jiang
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: a3e1c3bb24e2ff2927af5e30c2bebe669bb84196
Gitweb:        https://git.kernel.org/tip/a3e1c3bb24e2ff2927af5e30c2bebe669bb84196
Author:        Lianbo Jiang 
AuthorDate:    Tue, 04 Aug 2020 12:49:31 +08:00
Committer:     Ingo Molnar 
CommitterDate: Fri, 07 Aug 2020 01:32:00 +02:00

x86/crash: Correct the address boundary of function parameters

Let's carefully handle the boundary of the function parameters to make
sure that the addresses passed in don't exceed the intended address range.
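
For illustration: crash_exclude_mem_range() treats its end argument as
inclusive, so the low 1M is the range [0, 0xfffff] and the correct
argument is (1<<20)-1 = 0xfffff; passing 1<<20 = 0x100000 asks to
exclude one byte beyond the low 1M.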

Signed-off-by: Lianbo Jiang 
Signed-off-by: Ingo Molnar 
Acked-by: Dave Young 
Link: https://lore.kernel.org/r/20200804044933.1973-2-liji...@redhat.com
---
 arch/x86/kernel/crash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index fd87b59..a8f3af2 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -230,7 +230,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
int ret = 0;
 
/* Exclude the low 1M because it is always reserved */
-   ret = crash_exclude_mem_range(cmem, 0, 1<<20);
+   ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1);
if (ret)
return ret;
 


[tip: x86/urgent] kexec_file: Correctly output debugging information for the PT_LOAD ELF header

2020-08-06 Thread tip-bot2 for Lianbo Jiang
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 475f63ae63b5102ae6423d1712333929d04d6ecc
Gitweb:        https://git.kernel.org/tip/475f63ae63b5102ae6423d1712333929d04d6ecc
Author:        Lianbo Jiang 
AuthorDate:    Tue, 04 Aug 2020 12:49:33 +08:00
Committer:     Ingo Molnar 
CommitterDate: Fri, 07 Aug 2020 01:32:00 +02:00

kexec_file: Correctly output debugging information for the PT_LOAD ELF header

Currently, when we enable the debugging switch to debug kexec_file,
we always get the following incorrect results:

  kexec_file: Crash PT_LOAD elf header. phdr=c988639b vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=51 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=3cca69a0 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=52 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=c584cb9f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=53 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=cf85d57f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=54 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=a4a8f847 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=55 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=272ec49f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=56 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=ea0b65de vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=57 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=1f5e490c vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=58 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=dfe4109e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=59 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=480ed2b6 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=60 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=80b65151 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=61 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=24e31c5e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=62 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=332e0385 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=63 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=2754d5da vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=64 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=783320dd vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=65 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=76fe5b64 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=66 p_offset=0x0

The reason is that the kernel always prints the values of the next PT_LOAD
instead of the current PT_LOAD: phdr is advanced before the pr_debug() that
dereferences it. Change the order to ensure that we get the correct
debugging information.
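
A self-contained user-space illustration of this bug class (made-up names,
not kernel code): advancing the cursor before logging makes every log line
describe the wrong entry.

	#include <stdio.h>

	struct entry { unsigned long vaddr; };

	int main(void)
	{
		struct entry table[3] = { { 0x1000 }, { 0x2000 }, { 0 } };
		struct entry *e = table;
		int i;

		for (i = 0; i < 2; i++) {
			e++;	/* buggy: advance first ... */
			/* ... then log; this reads the *next* slot */
			printf("entry %d: vaddr=0x%lx\n", i, e->vaddr);
		}
		/* Prints 0x2000 and 0x0 instead of 0x1000 and 0x2000 --
		 * exactly the "values of the next PT_LOAD" effect above. */
		return 0;
	}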

[ mingo: Amended changelog, capitalized "ELF". ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Ingo Molnar 
Acked-by: Dave Young 
Link: https://lore.kernel.org/r/20200804044933.1973-4-liji...@redhat.com
---
 kernel/kexec_file.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 97fa682..3f7867c 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1246,7 +1246,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
unsigned long long notes_addr;
unsigned long mstart, mend;
 
-   /* extra phdr for vmcoreinfo elf note */
+   /* extra phdr for vmcoreinfo ELF note */
nr_phdr = nr_cpus + 1;
nr_phdr += mem->nr_ranges;
 
@@ -1254,7 +1254,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
 * I think this is required by tools like gdb. So same physical
-* memory will be mapped in two elf  headers. One will contain kernel
+* memory will be mapped in two ELF headers. One will contain kernel
 * text virtual addresses and other will have __va(physical) addresses.
 */
 
@@ -1323,10 +1323,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
-   phdr++;
-   pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+   pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
ehdr->e_phnum, phdr->p_offset);
+   phdr++;
}
 
*addr = buf;


[tip: x86/urgent] kexec_file: Correctly output debugging information for the PT_LOAD ELF header

2020-08-06 Thread tip-bot2 for Lianbo Jiang
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 8ca346039f70cf92dbada6c06048efde165b191f
Gitweb:        https://git.kernel.org/tip/8ca346039f70cf92dbada6c06048efde165b191f
Author:        Lianbo Jiang 
AuthorDate:    Tue, 04 Aug 2020 12:49:33 +08:00
Committer:     Ingo Molnar 
CommitterDate: Thu, 06 Aug 2020 15:26:09 +02:00

kexec_file: Correctly output debugging information for the PT_LOAD ELF header

Currently, when we enable the debugging switch to debug kexec_file,
we always get the following incorrect results:

  kexec_file: Crash PT_LOAD elf header. phdr=c988639b vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=51 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=3cca69a0 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=52 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=c584cb9f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=53 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=cf85d57f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=54 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=a4a8f847 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=55 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=272ec49f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=56 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=ea0b65de vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=57 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=1f5e490c vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=58 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=dfe4109e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=59 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=480ed2b6 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=60 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=80b65151 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=61 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=24e31c5e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=62 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=332e0385 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=63 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=2754d5da vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=64 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=783320dd vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=65 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=76fe5b64 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=66 p_offset=0x0

The reason is that the kernel always prints the values of the next PT_LOAD
instead of the current PT_LOAD: phdr is advanced before the pr_debug() that
dereferences it. Change the order to ensure that we get the correct
debugging information.

[ mingo: Amended changelog, capitalized "ELF". ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Ingo Molnar 
Acked-by: Dave Young 
Link: https://lore.kernel.org/r/20200804044933.1973-4-liji...@redhat.com
---
 kernel/kexec_file.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 5cc2c47..f1f4009 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1246,7 +1246,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
unsigned long long notes_addr;
unsigned long mstart, mend;
 
-   /* extra phdr for vmcoreinfo elf note */
+   /* extra phdr for vmcoreinfo ELF note */
nr_phdr = nr_cpus + 1;
nr_phdr += mem->nr_ranges;
 
@@ -1254,7 +1254,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
 * I think this is required by tools like gdb. So same physical
-* memory will be mapped in two elf  headers. One will contain kernel
+* memory will be mapped in two ELF headers. One will contain kernel
 * text virtual addresses and other will have __va(physical) addresses.
 */
 
@@ -1323,10 +1323,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
-   phdr++;
-   pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+   pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
ehdr->e_phnum, phdr->p_offset);
+   phdr++;
}
 
*addr = buf;


[tip: x86/urgent] x86/crash: Correct the address boundary of function parameters

2020-08-06 Thread tip-bot2 for Lianbo Jiang
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 5b89a35f8c11a7846b06ac729d7de72044f7fc60
Gitweb:        https://git.kernel.org/tip/5b89a35f8c11a7846b06ac729d7de72044f7fc60
Author:        Lianbo Jiang 
AuthorDate:    Tue, 04 Aug 2020 12:49:31 +08:00
Committer:     Ingo Molnar 
CommitterDate: Thu, 06 Aug 2020 15:25:58 +02:00

x86/crash: Correct the address boundary of function parameters

Let's carefully handle the boundary of the function parameters to make
sure that the addresses passed in don't exceed the intended address range.

Signed-off-by: Lianbo Jiang 
Signed-off-by: Ingo Molnar 
Acked-by: Dave Young 
Link: https://lore.kernel.org/r/20200804044933.1973-2-liji...@redhat.com
---
 arch/x86/kernel/crash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index fd87b59..a8f3af2 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -230,7 +230,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
int ret = 0;
 
/* Exclude the low 1M because it is always reserved */
-   ret = crash_exclude_mem_range(cmem, 0, 1<<20);
+   ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1);
if (ret)
return ret;
 


[tip: x86/urgent] kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges

2020-08-06 Thread tip-bot2 for Lianbo Jiang
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 12e4e432ac4d65020ba85037da06f2c886188e4f
Gitweb:        https://git.kernel.org/tip/12e4e432ac4d65020ba85037da06f2c886188e4f
Author:        Lianbo Jiang 
AuthorDate:    Tue, 04 Aug 2020 12:49:32 +08:00
Committer:     Ingo Molnar 
CommitterDate: Thu, 06 Aug 2020 15:25:58 +02:00

kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges

The crash_exclude_mem_range() function can only handle one memory region at a time.

It will fail in the case in which the passed-in area covers several memory
regions. In this case, it will only exclude the first region, then return,
leaving the later regions untouched.

E.g. in a NEC system with two usable RAM regions inside the low 1M:

  ...
  BIOS-e820: [mem 0x0000000000000000-0x000000000003efff] usable
  BIOS-e820: [mem 0x000000000003f000-0x000000000003ffff] reserved
  BIOS-e820: [mem 0x0000000000040000-0x000000000009ffff] usable

It will only exclude the memory region [0, 0x3efff]; the memory region
[0x40000, 0x9ffff] will still be added into /proc/vmcore, which may cause
the following failure when dumping vmcore:

 ioremap on RAM at 0x00040000 - 0x00040fff
 WARNING: CPU: 0 PID: 665 at arch/x86/mm/ioremap.c:186 __ioremap_caller+0x2c7/0x2e0
 ...
 RIP: 0010:__ioremap_caller+0x2c7/0x2e0
 ...
 cp: error reading '/proc/vmcore': Cannot allocate memory
 kdump: saving vmcore failed

In order to fix this bug, let's extend the crash_exclude_mem_range()
to handle the overlapping ranges.

[ mingo: Amended the changelog. ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Ingo Molnar 
Acked-by: Dave Young 
Link: https://lore.kernel.org/r/20200804044933.1973-3-liji...@redhat.com
---
 kernel/kexec_file.c | 35 +++
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 09cc78d..5cc2c47 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1157,24 +1157,26 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 {
int i, j;
-   unsigned long long start, end;
+   unsigned long long start, end, p_start, p_end;
struct crash_mem_range temp_range = {0, 0};
 
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
end = mem->ranges[i].end;
+   p_start = mstart;
+   p_end = mend;
 
if (mstart > end || mend < start)
continue;
 
/* Truncate any area outside of range */
if (mstart < start)
-   mstart = start;
+   p_start = start;
if (mend > end)
-   mend = end;
+   p_end = end;
 
/* Found completely overlapping range */
-   if (mstart == start && mend == end) {
+   if (p_start == start && p_end == end) {
mem->ranges[i].start = 0;
mem->ranges[i].end = 0;
if (i < mem->nr_ranges - 1) {
@@ -1185,20 +1187,29 @@ int crash_exclude_mem_range(struct crash_mem *mem,
mem->ranges[j].end =
mem->ranges[j+1].end;
}
+
+   /*
+    * Continue to check if there are other overlapping ranges
+    * from the current position, because the above mem ranges
+    * have been shifted.
+    */
+   i--;
+   mem->nr_ranges--;
+   continue;
}
mem->nr_ranges--;
return 0;
}
 
-   if (mstart > start && mend < end) {
+   if (p_start > start && p_end < end) {
/* Split original range */
-   mem->ranges[i].end = mstart - 1;
-   temp_range.start = mend + 1;
+   mem->ranges[i].end = p_start - 1;
+   temp_range.start = p_end + 1;
temp_range.end = end;
-   } else if (mstart != start)
-   mem->ranges[i].end = mstart - 1;
+   } else if (p_start != start)
+   mem->ranges[i].end = p_start - 1;
else
-   mem->ranges[i].start = mend + 1;
+   mem->ranges[i].start = p_end + 1;
break;
}
 
@@ -1243,7 +1254,7 @@ int crash_prepare_elf64_headers(st

[PATCH 2/3] kexec: Improve the crash_exclude_mem_range() to handle the overlapping ranges

2020-08-03 Thread Lianbo Jiang
The crash_exclude_mem_range() can only handle one memory region at a time.
It will fail in the case in which the passed-in area covers several memory
regions. In that case, it will only exclude the first region, then return,
leaving the later regions untouched.

E.g in a NEC system with two usable RAM regions inside the low 1M:
...
BIOS-e820: [mem 0x0000000000000000-0x000000000003efff] usable
BIOS-e820: [mem 0x000000000003f000-0x000000000003ffff] reserved
BIOS-e820: [mem 0x0000000000040000-0x000000000009ffff] usable

It will only exclude the memory region [0, 0x3efff]; the memory region
[0x40000, 0x9ffff] will still be added into /proc/vmcore, which may cause
the following failure when dumping the vmcore:

ioremap on RAM at 0x00040000 - 0x00040fff
WARNING: CPU: 0 PID: 665 at arch/x86/mm/ioremap.c:186 __ioremap_caller+0x2c7/0x2e0
...
RIP: 0010:__ioremap_caller+0x2c7/0x2e0
Code: 05 20 47 1c 01 48 09 c5 e9 93 fe ff ff 48 8d 54 24 28 48 8d 74 24 18 48 c7 c7 85 e7 09 82 c6 05 b4 10 36 01 01 e8 32 91 04 00 <0f> 0b 45 31 ff e9 f3 fe ff ff e8 2a 8e 04 00 66 2e 0f 1f 84 00 00
RSP: 0018:c971fd60 EFLAGS: 00010286
RAX:  RBX: 0004 RCX: 
RDX: 8880620268c0 RSI: 888062016a08 RDI: 888062016a08
RBP:  R08: 0441 R09: 0048
R10:  R11: c971fc08 R12: 7f794c343000
R13: 1000 R14:  R15: 
FS:  7f794c352800() GS:88806200() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 7f794c35 CR3: 5df9c005 CR4: 001606b0
Call Trace:
? __copy_oldmem_page.part.0+0x9c/0xb0
__copy_oldmem_page.part.0+0x9c/0xb0
read_from_oldmem.part.2+0xe2/0x140
read_vmcore+0xd8/0x2f0
proc_reg_read+0x39/0x60
vfs_read+0x91/0x140
ksys_read+0x4f/0xb0
do_syscall_64+0x5b/0x1a0
entry_SYSCALL_64_after_hwframe+0x65/0xca
cp: error reading '/proc/vmcore': Cannot allocate memory
kdump: saving vmcore failed

In order to solve this issue, let's extend the crash_exclude_mem_range()
to handle the overlapping ranges.

Signed-off-by: Lianbo Jiang 
---
 kernel/kexec_file.c | 31 +--
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 09cc78df53c6..41616b6a80ad 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1157,24 +1157,26 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 {
int i, j;
-   unsigned long long start, end;
+   unsigned long long start, end, p_start, p_end;
struct crash_mem_range temp_range = {0, 0};
 
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
end = mem->ranges[i].end;
+   p_start = mstart;
+   p_end = mend;
 
if (mstart > end || mend < start)
continue;
 
/* Truncate any area outside of range */
if (mstart < start)
-   mstart = start;
+   p_start = start;
if (mend > end)
-   mend = end;
+   p_end = end;
 
/* Found completely overlapping range */
-   if (mstart == start && mend == end) {
+   if (p_start == start && p_end == end) {
mem->ranges[i].start = 0;
mem->ranges[i].end = 0;
if (i < mem->nr_ranges - 1) {
@@ -1185,20 +1187,29 @@ int crash_exclude_mem_range(struct crash_mem *mem,
mem->ranges[j].end =
mem->ranges[j+1].end;
}
+
+   /*
+    * Continue to check if there are other overlapping ranges
+    * from the current position, because the above mem ranges
+    * have been shifted.
+    */
+   i--;
+   mem->nr_ranges--;
+   continue;
}
mem->nr_ranges--;
return 0;
}
 
-   if (mstart > start && mend < end) {
+   if (p_start > start && p_end < end) {
/* Split original range */
-   mem->ranges[i].end = mstart - 1;
-   temp_range.start = mend + 1;
+   mem->ranges[i].end = p_start - 1;
+   temp_range.start = p_end + 1;
temp_range.end = end;
-   } e

[PATCH 3/3] kexec_file: correctly output debugging information for the PT_LOAD elf header

2020-08-03 Thread Lianbo Jiang
Currently, when we enable the debugging switch to debug kexec_file, we
always get the following wrong results:

kexec_file: Crash PT_LOAD elf header. phdr=c988639b vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=51 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=3cca69a0 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=52 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=c584cb9f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=53 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=cf85d57f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=54 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=a4a8f847 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=55 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=272ec49f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=56 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=ea0b65de vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=57 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=1f5e490c vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=58 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=dfe4109e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=59 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=480ed2b6 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=60 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=80b65151 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=61 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=24e31c5e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=62 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=332e0385 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=63 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=2754d5da vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=64 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=783320dd vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=65 p_offset=0x0
kexec_file: Crash PT_LOAD elf header. phdr=76fe5b64 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=66 p_offset=0x0

The reason is that the kernel always prints the values of the next PT_LOAD
instead of the current PT_LOAD: phdr is advanced before the pr_debug() that
dereferences it. Change the order to ensure that we get the correct
debugging information.

Signed-off-by: Lianbo Jiang 
---
 kernel/kexec_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 41616b6a80ad..e2c03b4ce31b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1323,10 +1323,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
-   phdr++;
pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
ehdr->e_phnum, phdr->p_offset);
+   phdr++;
}
 
*addr = buf;
-- 
2.17.1



[PATCH 0/3] x86/kexec_file: Fix some corners bugs and improve the crash_exclude_mem_range()

2020-08-03 Thread Lianbo Jiang
This series includes the following patches; it fixes some corner-case
bugs and improves the crash_exclude_mem_range().

[1] [PATCH 1/3] x86/crash: Correct the address boundary of function
parameters
[2] [PATCH 2/3] kexec: Improve the crash_exclude_mem_range() to handle
the overlapping ranges
[3] [PATCH 3/3] kexec_file: correctly output debugging information for
the PT_LOAD elf header

Lianbo Jiang (3):
  x86/crash: Correct the address boundary of function parameters
  kexec: Improve the crash_exclude_mem_range() to handle the overlapping
ranges
  kexec_file: correctly output debugging information for the PT_LOAD elf
header

 arch/x86/kernel/crash.c |  2 +-
 kernel/kexec_file.c | 33 ++---
 2 files changed, 23 insertions(+), 12 deletions(-)

-- 
2.17.1



[PATCH 1/3] x86/crash: Correct the address boundary of function parameters

2020-08-03 Thread Lianbo Jiang
Let's carefully handle the boundary of the function parameters to make
sure that the addresses passed in don't exceed the intended address range.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index fd87b59452a3..a8f3af257e26 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -230,7 +230,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
int ret = 0;
 
/* Exclude the low 1M because it is always reserved */
-   ret = crash_exclude_mem_range(cmem, 0, 1<<20);
+   ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1);
if (ret)
return ret;
 
-- 
2.17.1



[PATCH v2] kexec: Do not verify the signature without the lockdown or mandatory signature

2020-06-01 Thread Lianbo Jiang
Signature verification is an important security feature that protects
the system from being attacked with a kernel of unknown origin. Kexec
rebooting is a way to replace the running kernel, hence it needs to be
secured carefully.

In the current code that handles signature verification of the kexec
kernel, the logic is very twisted: it mixes signature verification, IMA
signature appraisal and kexec lockdown.

Without KEXEC_SIG_FORCE, a kexec kernel image that lacks a signature,
the supported crypto or the key is not considered wrong, unless kexec
lockdown is in effect. IMA is considered another kind of signature
appraisal method.

If the kexec kernel image has a signature/crypto/key, it has to go
through signature verification and pass; otherwise it is treated as a
verification failure and won't be loaded.

It seems a kexec kernel image with an unqualified signature is treated
as even worse than one without any signature at all, which is very
unreasonable. E.g. if people get an unsigned kernel to load, or a kernel
signed with an expired key, which one is more dangerous?

So let's simplify the logic to improve code readability: if
KEXEC_SIG_FORCE is enabled or kexec lockdown is in effect, signature
verification is mandated; otherwise, we lift the bar for any kernel
image.
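
In table form, the resulting policy (derived from the diff below) is:

  verification result | KEXEC_SIG_FORCE | locked down, no IMA appraisal | outcome
  --------------------+-----------------+-------------------------------+----------------
  pass                | any             | any                           | load
  fail                | enabled         | any                           | reject (ret)
  fail                | disabled        | yes                           | reject (-EPERM)
  fail                | disabled        | no                            | load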

Signed-off-by: Lianbo Jiang 
---
Changes since v1:
[1] Modify the log level(suggested by Jiri Bohac)

 kernel/kexec_file.c | 34 ++
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index faa74d5f6941..fae496958a68 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 static int
 kimage_validate_signature(struct kimage *image)
 {
-   const char *reason;
int ret;
 
ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
   image->kernel_buf_len);
-   switch (ret) {
-   case 0:
-   break;
+   if (ret) {
 
-   /* Certain verification errors are non-fatal if we're not
-* checking errors, provided we aren't mandating that there
-* must be a valid signature.
-*/
-   case -ENODATA:
-   reason = "kexec of unsigned image";
-   goto decide;
-   case -ENOPKG:
-   reason = "kexec of image with unsupported crypto";
-   goto decide;
-   case -ENOKEY:
-   reason = "kexec of image with unavailable key";
-   decide:
if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
-   pr_notice("%s rejected\n", reason);
+   pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
return ret;
}
 
-   /* If IMA is guaranteed to appraise a signature on the kexec
+   /*
+* If IMA is guaranteed to appraise a signature on the kexec
 * image, permit it even if the kernel is otherwise locked
 * down.
 */
@@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image)
security_locked_down(LOCKDOWN_KEXEC))
return -EPERM;
 
-   return 0;
-
-   /* All other errors are fatal, including nomem, unparseable
-* signatures and signature check failures - even if signatures
-* aren't required.
-*/
-   default:
-   pr_notice("kernel signature verification failed (%d).\n", ret);
+   pr_debug("kernel signature verification failed (%d).\n", ret);
}
 
-   return ret;
+   return 0;
 }
 #endif
 
-- 
2.17.1



[PATCH] kexec: Do not verify the signature without the lockdown or mandatory signature

2020-05-24 Thread Lianbo Jiang
Signature verification is an important security feature that protects
the system from being attacked with a kernel of unknown origin. Kexec
rebooting is a way to replace the running kernel, hence it needs to be
secured carefully.

In the current code that handles signature verification of the kexec
kernel, the logic is very twisted: it mixes signature verification, IMA
signature appraisal and kexec lockdown.

Without KEXEC_SIG_FORCE, a kexec kernel image that lacks a signature,
the supported crypto or the key is not considered wrong, unless kexec
lockdown is in effect. IMA is considered another kind of signature
appraisal method.

If the kexec kernel image has a signature/crypto/key, it has to go
through signature verification and pass; otherwise it is treated as a
verification failure and won't be loaded.

It seems a kexec kernel image with an unqualified signature is treated
as even worse than one without any signature at all, which is very
unreasonable. E.g. if people get an unsigned kernel to load, or a kernel
signed with an expired key, which one is more dangerous?

So let's simplify the logic to improve code readability: if
KEXEC_SIG_FORCE is enabled or kexec lockdown is in effect, signature
verification is mandated; otherwise, we lift the bar for any kernel
image.

Signed-off-by: Lianbo Jiang 
---
 kernel/kexec_file.c | 37 ++---
 1 file changed, 6 insertions(+), 31 deletions(-)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index faa74d5f6941..e4bdf0c42f35 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -181,52 +181,27 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 static int
 kimage_validate_signature(struct kimage *image)
 {
-   const char *reason;
int ret;
 
ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
   image->kernel_buf_len);
-   switch (ret) {
-   case 0:
-   break;
+   if (ret) {
+   pr_debug("kernel signature verification failed (%d).\n", ret);
 
-   /* Certain verification errors are non-fatal if we're not
-* checking errors, provided we aren't mandating that there
-* must be a valid signature.
-*/
-   case -ENODATA:
-   reason = "kexec of unsigned image";
-   goto decide;
-   case -ENOPKG:
-   reason = "kexec of image with unsupported crypto";
-   goto decide;
-   case -ENOKEY:
-   reason = "kexec of image with unavailable key";
-   decide:
-   if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
-   pr_notice("%s rejected\n", reason);
+   if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE))
return ret;
-   }
 
-   /* If IMA is guaranteed to appraise a signature on the kexec
+   /*
+* If IMA is guaranteed to appraise a signature on the kexec
 * image, permit it even if the kernel is otherwise locked
 * down.
 */
if (!ima_appraise_signature(READING_KEXEC_IMAGE) &&
security_locked_down(LOCKDOWN_KEXEC))
return -EPERM;
-
-   return 0;
-
-   /* All other errors are fatal, including nomem, unparseable
-* signatures and signature check failures - even if signatures
-* aren't required.
-*/
-   default:
-   pr_notice("kernel signature verification failed (%d).\n", ret);
}
 
-   return ret;
+   return 0;
 }
 #endif
 
-- 
2.17.1



[PATCH 0/3 v4] x86/kdump: Fix 'kmem -s' reported an invalid freepointer when SME was active

2019-10-17 Thread Lianbo Jiang
In purgatory(), the main tasks are as follows:

[1] verify sha256 hashes for various segments.
Let's keep this code and not touch the logic.

[2] copy the first 640k content to a backup region.
Let's safely remove it and clean up all the code related to the backup region.

This patch series will remove the backup region, because the current
handling of copying the first 640k runs into problems when SME is
active (https://bugzilla.kernel.org/show_bug.cgi?id=204793).

The low 1MiB region will always be reserved when the crashkernel kernel
command line option is specified. This makes it unnecessary to do
anything else with the low 1MiB region, because memory allocated later
won't fall into the low 1MiB area.
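
A minimal sketch of what such an unconditional reservation could look
like (the helper name and exact body are an illustration, assuming the
crashkernel option is found on the kernel command line):

    void __init crash_reserve_low_1M(void)
    {
            if (cmdline_find_option(boot_command_line, "crashkernel",
                                    NULL, 0) < 0)
                    return;

            /* Keep the low 1MiB away from the memblock allocator. */
            memblock_reserve(0, 1 << 20);
            pr_info("Reserving the low 1M of memory for crashkernel\n");
    }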

This series includes three patches:
[1] x86/kdump: always reserve the low 1MiB when the crashkernel option
is specified
The low 1MiB region will always be reserved when the crashkernel
kernel command line option is specified, which ensures that the
memory allocated later won't fall into the low 1MiB area.

[2] x86/kdump: remove the unused crash_copy_backup_region()
The crash_copy_backup_region() has never been used, so clean
up the redundant code.

[3] x86/kdump: clean up all the code related to the backup region
Remove the backup region and clean up.

Changes since v1:
[1] Add extra checking condition: when the crashkernel option is
specified, reserve the low 640k area.

Changes since v2:
[1] Reserve the low 1MiB region only when the crashkernel option is
specified. (Suggested by Eric)

[2] Remove the unused crash_copy_backup_region()

[3] Remove the backup region and clean up

[4] Split them into three patches

Changes since v3:
[1] Improve the first patch's log
[2] Improve the third patch based on Eric's suggestions

Lianbo Jiang (3):
  x86/kdump: always reserve the low 1MiB when the crashkernel option is
specified
  x86/kdump: remove the unused crash_copy_backup_region()
  x86/kdump: clean up all the code related to the backup region

 arch/x86/include/asm/crash.h   |  1 -
 arch/x86/include/asm/kexec.h   | 10 
 arch/x86/include/asm/purgatory.h   | 10 
 arch/x86/kernel/crash.c| 87 --
 arch/x86/kernel/machine_kexec_64.c | 47 
 arch/x86/purgatory/purgatory.c | 19 ---
 arch/x86/realmode/init.c   | 11 
 7 files changed, 22 insertions(+), 163 deletions(-)

-- 
2.17.1



[PATCH v2] x86/kdump: Fix 'kmem -s' reported an invalid freepointer when SME was active

2019-10-07 Thread Lianbo Jiang
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204793

The kdump kernel will reuse the first 640k region for several reasons;
for example, the trampoline and the conventional PC system BIOS region
may require memory allocations in this area. Obviously, the kdump kernel
will also overwrite the first 640k region; therefore, the kernel has to
copy the contents of the first 640k area to a backup area, which is done
in purgatory(), because vmcore may need the old memory. When vmcore is
dumped, the kdump kernel reads the old memory from the backup area of
the first 640k area.

Basically, the main reason should be clear: the kernel does not
correctly handle the first 640k region when SME is active, so it does
not properly copy this old memory to the backup area in purgatory().
Therefore, the kdump kernel reads out incorrect contents from the backup
area when dumping vmcore. Finally, the phenomenon is as follows:

[root linux]$ crash vmlinux /var/crash/127.0.0.1-2019-09-19-08\:31\:27/vmcore
WARNING: kernel relocated [240MB]: patching 97110 gdb minimal_symbol values

  KERNEL: /var/crash/127.0.0.1-2019-09-19-08:31:27/vmlinux
DUMPFILE: /var/crash/127.0.0.1-2019-09-19-08:31:27/vmcore  [PARTIAL DUMP]
CPUS: 128
DATE: Thu Sep 19 08:31:18 2019
  UPTIME: 00:01:21
LOAD AVERAGE: 0.16, 0.07, 0.02
   TASKS: 1343
NODENAME: amd-ethanol
 RELEASE: 5.3.0-rc7+
 VERSION: #4 SMP Thu Sep 19 08:14:00 EDT 2019
 MACHINE: x86_64  (2195 Mhz)
  MEMORY: 127.9 GB
   PANIC: "Kernel panic - not syncing: sysrq triggered crash"
 PID: 9789
 COMMAND: "bash"
TASK: "89711894ae80  [THREAD_INFO: 89711894ae80]"
 CPU: 83
   STATE: TASK_RUNNING (PANIC)

crash> kmem -s|grep -i invalid
kmem: dma-kmalloc-512: slab:d77680001c00 invalid freepointer:a6086ac099f0c5a4
kmem: dma-kmalloc-512: slab:d77680001c00 invalid freepointer:a6086ac099f0c5a4
crash>

BTW: I also tried to fix the above problem in purgatory(), but there
are too many restrictions in the purgatory() context; for example, I
can't allocate new memory to create the identity-mapping page table for
the SME case.

Currently, there are two places where the first 640k area is needed:
one is in find_trampoline_placement(), the other is in
reserve_real_mode(), and their content doesn't matter. To avoid the
above error, let's occupy the remaining memory of the first 640k region
(except for the trampoline and real mode) so that the allocated memory
does not fall into the first 640k area when SME is active, which frees
us from worrying about whether the kernel can correctly copy the
contents of the first 640k area to a backup region in purgatory().

Signed-off-by: Lianbo Jiang 
---
Changes since v1:
1. Improve patch log
2. Change the checking condition from sme_active() to sme_active()
   && strstr(boot_command_line, "crashkernel=")

 arch/x86/kernel/setup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 77ea96b794bd..bdb1a02a84fd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1148,6 +1148,9 @@ void __init setup_arch(char **cmdline_p)
 
reserve_real_mode();
 
+   if (sme_active() && strstr(boot_command_line, "crashkernel="))
+   memblock_reserve(0, 640*1024);
+
trim_platform_memory_ranges();
trim_low_memory_range();
 
-- 
2.17.1



[PATCH] x86/kdump: Fix 'kmem -s' reported an invalid freepointer when SME was active

2019-09-19 Thread Lianbo Jiang
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204793

The kdump kernel will reuse the first 640k region for several reasons;
for example, the trampoline and the conventional PC system BIOS region
may require memory allocations in this area. Obviously, the kdump kernel
will also overwrite the first 640k region; therefore, the kernel has to
copy the contents of the first 640k area to a backup area, which is done
in purgatory(), because vmcore may need the old memory. When vmcore is
dumped, the kdump kernel reads the old memory from the backup area of
the first 640k area.

Basically, the main reason should be clear: the kernel does not
correctly handle the first 640k region when SME is active, so it does
not properly copy this old memory to the backup area in purgatory().
Therefore, the kdump kernel reads out incorrect contents from the backup
area when dumping vmcore. Finally, the phenomenon is as follows:

[root linux]$ crash vmlinux /var/crash/127.0.0.1-2019-09-19-08\:31\:27/vmcore
WARNING: kernel relocated [240MB]: patching 97110 gdb minimal_symbol values

  KERNEL: /var/crash/127.0.0.1-2019-09-19-08:31:27/vmlinux
DUMPFILE: /var/crash/127.0.0.1-2019-09-19-08:31:27/vmcore  [PARTIAL DUMP]
CPUS: 128
DATE: Thu Sep 19 08:31:18 2019
  UPTIME: 00:01:21
LOAD AVERAGE: 0.16, 0.07, 0.02
   TASKS: 1343
NODENAME: amd-ethanol
 RELEASE: 5.3.0-rc7+
 VERSION: #4 SMP Thu Sep 19 08:14:00 EDT 2019
 MACHINE: x86_64  (2195 Mhz)
  MEMORY: 127.9 GB
   PANIC: "Kernel panic - not syncing: sysrq triggered crash"
 PID: 9789
 COMMAND: "bash"
TASK: "89711894ae80  [THREAD_INFO: 89711894ae80]"
 CPU: 83
   STATE: TASK_RUNNING (PANIC)

crash> kmem -s|grep -i invalid
kmem: dma-kmalloc-512: slab:d77680001c00 invalid 
freepointer:a6086ac099f0c5a4
kmem: dma-kmalloc-512: slab:d77680001c00 invalid 
freepointer:a6086ac099f0c5a4
crash>

In order to avoid such a problem, let's occupy the first 640k region
when SME is active, which will ensure that the allocated memory does not
fall into the first 640k area. Then there is no need to worry about
whether the kernel can correctly copy the contents of the first 640k
area to a backup region in purgatory().

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/setup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 77ea96b794bd..5bfb2c83bb6c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1148,6 +1148,9 @@ void __init setup_arch(char **cmdline_p)
 
reserve_real_mode();
 
+   if (sme_active())
+   memblock_reserve(0, 640*1024);
+
trim_platform_memory_ranges();
trim_low_memory_range();
 
-- 
2.17.1



[tip:x86/kdump] fs/proc/vmcore: Enable dumping of encrypted memory when SEV was active

2019-06-20 Thread tip-bot for Lianbo Jiang
Commit-ID:  4eb5fec31e613105668a1472d5876f3d0558e5d8
Gitweb: https://git.kernel.org/tip/4eb5fec31e613105668a1472d5876f3d0558e5d8
Author: Lianbo Jiang 
AuthorDate: Tue, 30 Apr 2019 15:44:21 +0800
Committer:  Borislav Petkov 
CommitDate: Thu, 20 Jun 2019 10:07:49 +0200

fs/proc/vmcore: Enable dumping of encrypted memory when SEV was active

In the kdump kernel, the memory of the first kernel gets to be dumped
into a vmcore file.

Similarly to SME kdump, if SEV was enabled in the first kernel, the old
memory has to be remapped encrypted in order to access it properly.

Commit

  992b649a3f01 ("kdump, proc/vmcore: Enable kdumping encrypted memory with SME 
enabled")

took care of the SME case but it uses sme_active() which checks for SME
only. Use mem_encrypt_active() instead, which returns true when either
SME or SEV is active.
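
For reference, the semantics assumed above are roughly (a simplified
sketch of these helpers as of this era, not their exact definitions):

    bool sme_active(void) { return sme_me_mask && !sev_enabled; } /* SME host  */
    bool sev_active(void) { return sme_me_mask &&  sev_enabled; } /* SEV guest */
    bool mem_encrypt_active(void) { return sme_me_mask; }         /* either    */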

Unlike SME, the second kernel images (kernel and initrd) are loaded into
encrypted memory when SEV is active, hence the kernel elf header must be
remapped as encrypted in order to access it properly.

 [ bp: Massage commit message. ]

Co-developed-by: Brijesh Singh 
Signed-off-by: Brijesh Singh 
Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Cc: Alexey Dobriyan 
Cc: Andrew Morton 
Cc: Arnd Bergmann 
Cc: b...@redhat.com
Cc: dyo...@redhat.com
Cc: Ganesh Goudar 
Cc: H. Peter Anvin 
Cc: ke...@lists.infradead.org
Cc: linux-fsde...@vger.kernel.org
Cc: Matthew Wilcox 
Cc: Mike Rapoport 
Cc: mi...@redhat.com
Cc: Rahul Lakkireddy 
Cc: Souptick Joarder 
Cc: Thomas Gleixner 
Cc: Tom Lendacky 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190430074421.7852-4-liji...@redhat.com
---
 fs/proc/vmcore.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7bb96fdd38ad..57957c91c6df 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -166,7 +166,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
  */
 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-   return read_from_oldmem(buf, count, ppos, 0, false);
+   return read_from_oldmem(buf, count, ppos, 0, sev_active());
 }
 
 /*
@@ -174,7 +174,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
  */
 ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
 {
-   return read_from_oldmem(buf, count, ppos, 0, sme_active());
+   return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active());
 }
 
 /*
@@ -374,7 +374,7 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start,
-  userbuf, sme_active());
+  userbuf, mem_encrypt_active());
if (tmp < 0)
return tmp;
buflen -= tsz;


[tip:x86/kdump] x86/kexec: Set the C-bit in the identity map page table when SEV is active

2019-06-20 Thread tip-bot for Lianbo Jiang
Commit-ID:  85784d16c2cf172cf1ebaf2390d6b7c4045d659c
Gitweb: https://git.kernel.org/tip/85784d16c2cf172cf1ebaf2390d6b7c4045d659c
Author: Lianbo Jiang 
AuthorDate: Tue, 30 Apr 2019 15:44:20 +0800
Committer:  Borislav Petkov 
CommitDate: Thu, 20 Jun 2019 10:07:12 +0200

x86/kexec: Set the C-bit in the identity map page table when SEV is active

When SEV is active, the second kernel image is loaded into encrypted
memory. For that, make sure that when kexec builds the identity mapping
page table, the memory is encrypted (i.e., _PAGE_ENC is set).
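
Concretely, the effect described above amounts to choosing the page
protection based on sev_active() (a sketch mirroring the diff below):

    pgprot_t prot = sev_active() ? PAGE_KERNEL_EXEC          /* keep the C-bit  */
                                 : PAGE_KERNEL_EXEC_NOENC;   /* clear the C-bit */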

 [ bp: Sort local args and OR in _PAGE_ENC for more clarity. ]

Co-developed-by: Brijesh Singh 
Signed-off-by: Brijesh Singh 
Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Cc: Andrew Morton 
Cc: b...@redhat.com
Cc: dyo...@redhat.com
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: ke...@lists.infradead.org
Cc: "Kirill A. Shutemov" 
Cc: Thomas Gleixner 
Cc: Tom Lendacky 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190430074421.7852-3-liji...@redhat.com
---
 arch/x86/kernel/machine_kexec_64.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 3b38449028e0..16c37fe489bc 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -50,12 +50,13 @@ static void free_transition_pgtable(struct kimage *image)
 
 static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 {
+   pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+   unsigned long vaddr, paddr;
+   int result = -ENOMEM;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
-   unsigned long vaddr, paddr;
-   int result = -ENOMEM;
 
vaddr = (unsigned long)relocate_kernel;
paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
@@ -92,7 +93,11 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
pte = pte_offset_kernel(pmd, vaddr);
-   set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
+
+   if (sev_active())
+   prot = PAGE_KERNEL_EXEC;
+
+   set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
return 0;
 err:
return result;
@@ -129,6 +134,11 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
 
+   if (sev_active()) {
+   info.page_flag   |= _PAGE_ENC;
+   info.kernpg_flag |= _PAGE_ENC;
+   }
+
if (direct_gbpages)
info.direct_gbpages = true;
 


[tip:x86/kdump] x86/kexec: Do not map kexec area as decrypted when SEV is active

2019-06-20 Thread tip-bot for Lianbo Jiang
Commit-ID:  1a79c1b8a04153c4c387518967ce851f89e22733
Gitweb: https://git.kernel.org/tip/1a79c1b8a04153c4c387518967ce851f89e22733
Author: Lianbo Jiang 
AuthorDate: Tue, 30 Apr 2019 15:44:19 +0800
Committer:  Borislav Petkov 
CommitDate: Thu, 20 Jun 2019 10:06:46 +0200

x86/kexec: Do not map kexec area as decrypted when SEV is active

When a virtual machine panics, its memory needs to be dumped for
analysis. With memory encryption in the picture, special care must be
taken when loading a kexec/kdump kernel in a SEV guest.

A SEV guest starts and runs fully encrypted. In order to load a kexec
kernel and initrd, arch_kexec_post_{alloc,free}_pages() need to not map
areas as decrypted unconditionally but differentiate whether the kernel
is running as a SEV guest and if so, leave kexec area encrypted.

 [ bp: Reduce commit message to the relevant information pertaining to
   this commit only. ]

Co-developed-by: Brijesh Singh 
Signed-off-by: Brijesh Singh 
Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Cc: Andrew Morton 
Cc: b...@redhat.com
Cc: Brijesh Singh 
Cc: dyo...@redhat.com
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: ke...@lists.infradead.org
Cc: "Kirill A. Shutemov" 
Cc: Thomas Gleixner 
Cc: Tom Lendacky 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190430074421.7852-2-liji...@redhat.com
---
 arch/x86/kernel/machine_kexec_64.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index ceba408ea982..3b38449028e0 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -559,8 +559,20 @@ void arch_kexec_unprotect_crashkres(void)
kexec_mark_crashkres(false);
 }
 
+/*
+ * During a traditional boot under SME, SME will encrypt the kernel,
+ * so the SME kexec kernel also needs to be un-encrypted in order to
+ * replicate a normal SME boot.
+ *
+ * During a traditional boot under SEV, the kernel has already been
+ * loaded encrypted, so the SEV kexec kernel needs to be encrypted in
+ * order to replicate a normal SEV boot.
+ */
 int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
 {
+   if (sev_active())
+   return 0;
+
/*
 * If SME is active we need to be sure that kexec pages are
 * not encrypted because when we boot to the new kernel the
@@ -571,6 +583,9 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
 
 void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
 {
+   if (sev_active())
+   return;
+
/*
 * If SME is active we need to reset the pages back to being
 * an encrypted mapping before freeing them.


[tip:x86/kdump] x86/crash: Add e820 reserved ranges to kdump kernel's e820 table

2019-06-20 Thread tip-bot for Lianbo Jiang
Commit-ID:  980621daf368f2b9aa69c7ea01baa654edb7577b
Gitweb: https://git.kernel.org/tip/980621daf368f2b9aa69c7ea01baa654edb7577b
Author: Lianbo Jiang 
AuthorDate: Tue, 23 Apr 2019 09:30:07 +0800
Committer:  Borislav Petkov 
CommitDate: Thu, 20 Jun 2019 10:05:06 +0200

x86/crash: Add e820 reserved ranges to kdump kernel's e820 table

At present, when using the kexec_file_load() syscall to load the kernel
image and initramfs, for example:

  kexec -s -p xxx

the kernel does not pass the e820 reserved ranges to the second kernel,
which might cause two problems:

 1. MMCONFIG: A device in PCI segment 1 cannot be discovered by the
kernel PCI probing without all the e820 I/O reservations being present
in the e820 table. Which is the case currently, because the kdump kernel
does not have those reservations because the kexec command does not pass
the I/O reservation via the "memmap=xxx" command line option.

Further details courtesy of Bjorn Helgaas¹: I think you should regard
correct MCFG/ECAM usage in the kdump kernel as a requirement. MMCONFIG
(aka ECAM) space is described in the ACPI MCFG table. If you don't have
ECAM:

  (a) PCI devices won't work at all on non-x86 systems that use only
   ECAM for config access,

  (b) you won't be able to access devices on non-0 segments (granted,
  there aren't very many of these yet, but there will be more in the
  future), and

  (c) you won't be able to access extended config space (addresses
  0x100-0xfff), which means none of the Extended Capabilities will be
  available (AER, ACS, ATS, etc).

 2. The second issue is that the SME kdump kernel doesn't work without
the e820 reserved ranges. When SME is active in the kdump kernel, those
reserved regions are still decrypted, but because those reserved ranges
are not present at all in kdump kernel's e820 table, they are accessed
as encrypted. Which is obviously wrong.

 [1]: https://lkml.kernel.org/r/cabhmzuuscs3juzusm5y6eyjk6weo7mjj5-eakgvbw0qee%2b3...@mail.gmail.com

 [ bp: Heavily massage commit message. ]

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Cc: Andrew Morton 
Cc: Andy Lutomirski 
Cc: Baoquan He 
Cc: Bjorn Helgaas 
Cc: dave.han...@linux.intel.com
Cc: Dave Young 
Cc: "Gustavo A. R. Silva" 
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: ke...@lists.infradead.org
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Tom Lendacky 
Cc: x86-ml 
Cc: Yi Wang 
Link: https://lkml.kernel.org/r/20190423013007.17838-4-liji...@redhat.com
---
 arch/x86/kernel/crash.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 576b2e1bfc12..32c956705b8e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -381,6 +381,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, ,
memmap_entry_callback);
 
+   /* Add e820 reserved ranges */
+   cmd.type = E820_TYPE_RESERVED;
+   flags = IORESOURCE_MEM;
+   walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, ,
+  memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;


[tip:x86/kdump] x86/mm: Rework ioremap resource mapping determination

2019-06-20 Thread tip-bot for Lianbo Jiang
Commit-ID:  5da04cc86d1215fd9fe0e5c88ead6e8428a75e56
Gitweb: https://git.kernel.org/tip/5da04cc86d1215fd9fe0e5c88ead6e8428a75e56
Author: Lianbo Jiang 
AuthorDate: Tue, 23 Apr 2019 09:30:06 +0800
Committer:  Borislav Petkov 
CommitDate: Thu, 20 Jun 2019 09:58:07 +0200

x86/mm: Rework ioremap resource mapping determination

On ioremap(), __ioremap_check_mem() does a couple of checks on the
supplied memory range to determine how the range should be mapped and in
particular what protection flags should be used.

Generalize the procedure by introducing IORES_MAP_* flags which control
different aspects of the ioremapping and use them in the respective
helpers which determine which descriptor flags should be set per range.
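
The new flags are plain bits that are OR-ed into a per-range descriptor
(a sketch of the additions to include/linux/ioport.h, assuming BIT()
encoding; the ioport.h hunk itself is truncated in this excerpt):

    /* Flags controlling ioremap() behavior. */
    enum {
            IORES_MAP_SYSTEM_RAM = BIT(0),
            IORES_MAP_ENCRYPTED  = BIT(1),
    };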

 [ bp:
   - Rewrite commit message.
   - Add/improve comments.
   - Reflow __ioremap_caller()'s args.
   - s/__ioremap_check_desc/__ioremap_check_encrypted/g;
   - s/__ioremap_res_check/__ioremap_collect_map_flags/g;
   - clarify __ioremap_check_ram()'s purpose. ]

Signed-off-by: Lianbo Jiang 
Co-developed-by: Borislav Petkov 
Signed-off-by: Borislav Petkov 
Cc: Andrew Morton 
Cc: Andy Lutomirski 
Cc: b...@redhat.com
Cc: Dave Hansen 
Cc: dyo...@redhat.com
Cc: "H. Peter Anvin" 
Cc: Ingo Molnar 
Cc: ke...@lists.infradead.org
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Tom Lendacky 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190423013007.17838-3-liji...@redhat.com
---
 arch/x86/mm/ioremap.c  | 71 --
 include/linux/ioport.h |  9 +++
 2 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4b6423e7bd21..e500f1df1140 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -28,9 +28,11 @@
 
 #include "physaddr.h"
 
-struct ioremap_mem_flags {
-   bool system_ram;
-   bool desc_other;
+/*
+ * Descriptor controlling ioremap() behavior.
+ */
+struct ioremap_desc {
+   unsigned int flags;
 };
 
 /*
@@ -62,13 +64,14 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
 }
 
-static bool __ioremap_check_ram(struct resource *res)
+/* Does the range (or a subset of) contain normal RAM? */
+static unsigned int __ioremap_check_ram(struct resource *res)
 {
unsigned long start_pfn, stop_pfn;
unsigned long i;
 
if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
-   return false;
+   return 0;
 
start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
stop_pfn = (res->end + 1) >> PAGE_SHIFT;
@@ -76,28 +79,44 @@ static bool __ioremap_check_ram(struct resource *res)
for (i = 0; i < (stop_pfn - start_pfn); ++i)
if (pfn_valid(start_pfn + i) &&
!PageReserved(pfn_to_page(start_pfn + i)))
-   return true;
+   return IORES_MAP_SYSTEM_RAM;
}
 
-   return false;
+   return 0;
 }
 
-static int __ioremap_check_desc_other(struct resource *res)
+/*
+ * In a SEV guest, NONE and RESERVED should not be mapped encrypted because
+ * there the whole memory is already encrypted.
+ */
+static unsigned int __ioremap_check_encrypted(struct resource *res)
 {
-   return (res->desc != IORES_DESC_NONE);
+   if (!sev_active())
+   return 0;
+
+   switch (res->desc) {
+   case IORES_DESC_NONE:
+   case IORES_DESC_RESERVED:
+   break;
+   default:
+   return IORES_MAP_ENCRYPTED;
+   }
+
+   return 0;
 }
 
-static int __ioremap_res_check(struct resource *res, void *arg)
+static int __ioremap_collect_map_flags(struct resource *res, void *arg)
 {
-   struct ioremap_mem_flags *flags = arg;
+   struct ioremap_desc *desc = arg;
 
-   if (!flags->system_ram)
-   flags->system_ram = __ioremap_check_ram(res);
+   if (!(desc->flags & IORES_MAP_SYSTEM_RAM))
+   desc->flags |= __ioremap_check_ram(res);
 
-   if (!flags->desc_other)
-   flags->desc_other = __ioremap_check_desc_other(res);
+   if (!(desc->flags & IORES_MAP_ENCRYPTED))
+   desc->flags |= __ioremap_check_encrypted(res);
 
-   return flags->system_ram && flags->desc_other;
+   return ((desc->flags & (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)) ==
+  (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED));
 }
 
 /*
@@ -106,15 +125,15 @@ static int __ioremap_res_check(struct resource *res, void *arg)
  * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
  */
 static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
-   struct ioremap_mem_flags *flags)
+   struct ioremap_desc *desc)
 {
u64 start, end;
 
start = 

[tip:x86/kdump] x86/e820, ioport: Add a new I/O resource descriptor IORES_DESC_RESERVED

2019-06-20 Thread tip-bot for Lianbo Jiang
Commit-ID:  ae9e13d621d6795ec1ad6bf10bd2549c6c3feca4
Gitweb: https://git.kernel.org/tip/ae9e13d621d6795ec1ad6bf10bd2549c6c3feca4
Author: Lianbo Jiang 
AuthorDate: Tue, 23 Apr 2019 09:30:05 +0800
Committer:  Borislav Petkov 
CommitDate: Thu, 20 Jun 2019 09:54:31 +0200

x86/e820, ioport: Add a new I/O resource descriptor IORES_DESC_RESERVED

When executing the kexec_file_load() syscall, the first kernel needs to
pass the e820 reserved ranges to the second kernel because some devices
(PCI, for example) need them present in the kdump kernel for proper
initialization.

But the kernel can not exactly match the e820 reserved ranges when
walking through the iomem resources using the default IORES_DESC_NONE
descriptor, because there are several types of e820 ranges which are
marked IORES_DESC_NONE, see e820_type_to_iores_desc().

Therefore, add a new I/O resource descriptor called IORES_DESC_RESERVED
to mark exactly those ranges. It will be used to match the reserved
resource ranges when walking through iomem resources.
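
Once tagged, such ranges can be matched with the generic iomem walker,
for example (a hypothetical caller for illustration, not part of this
patch):

    static int __init dump_reserved_range(struct resource *res, void *arg)
    {
            pr_info("e820 reserved: %pR\n", res);
            return 0;
    }

    /* Visit every IORES_DESC_RESERVED memory range. */
    walk_iomem_res_desc(IORES_DESC_RESERVED, IORESOURCE_MEM, 0, -1,
                        NULL, dump_reserved_range);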

 [ bp: Massage commit message. ]

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Cc: Andrew Morton 
Cc: Andy Lutomirski 
Cc: b...@redhat.com
Cc: dave.han...@linux.intel.com
Cc: dyo...@redhat.com
Cc: "H. Peter Anvin" 
Cc: Huang Zijiang 
Cc: Ingo Molnar 
Cc: Joe Perches 
Cc: Juergen Gross 
Cc: ke...@lists.infradead.org
Cc: Masayoshi Mizuma 
Cc: Michal Hocko 
Cc: Mike Rapoport 
Cc: Naoya Horiguchi 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Tom Lendacky 
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190423013007.17838-2-liji...@redhat.com
---
 arch/x86/kernel/e820.c | 2 +-
 include/linux/ioport.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 8f32e705a980..e69408bf664b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1063,10 +1063,10 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
case E820_TYPE_PMEM:return IORES_DESC_PERSISTENT_MEMORY;
case E820_TYPE_PRAM:return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+   case E820_TYPE_RESERVED:return IORES_DESC_RESERVED;
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE:/* Fall-through: */
-   case E820_TYPE_RESERVED:/* Fall-through: */
default:return IORES_DESC_NONE;
}
 }
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..6ed59de48bd5 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -133,6 +133,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
+   IORES_DESC_RESERVED = 8,
 };
 
 /* helpers to define resources */


[PATCH v2] scsi: smartpqi: properly set both the DMA mask and the coherent DMA mask in pqi_pci_init()

2019-05-26 Thread Lianbo Jiang
When SME is enabled, the smartpqi driver won't work on the HP DL385
G10 machine, causing kernel boot to fail because the driver cannot
allocate the PQI error buffer. Please refer to the kernel log:

[9.431749] usbcore: registered new interface driver uas
[9.441524] Microsemi PQI Driver (v1.1.4-130)
[9.442956] i40e :04:00.0: fw 6.70.48768 api 1.7 nvm 10.2.5
[9.447237] smartpqi :23:00.0: Microsemi Smart Family Controller found
 Starting dracut initqueue hook...
[  OK  ] Started Show Plymouth Boot Scre[9.471654] Broadcom NetXtreme-C/E driver bnxt_en v1.9.1
en.
[  OK  ] Started Forward Password Requests to Plymouth Directory Watch.
[[0;[9.487108] smartpqi :23:00.0: failed to allocate PQI error buffer

[  139.050544] dracut-initqueue[949]: Warning: dracut-initqueue timeout - starting timeout scripts
[  139.589779] dracut-initqueue[949]: Warning: dracut-initqueue timeout - starting timeout scripts

Basically, the fact that the coherent DMA mask value wasn't set caused
the driver to fall back to SWIOTLB when SME is active.

For correct operation, let's call dma_set_mask_and_coherent() to
properly set the mask for both streaming and coherent DMA, in order to
inform the kernel about the device's DMA addressing capabilities.
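
The common pattern (a generic sketch, not specific to smartpqi) is to
negotiate the widest mask the device supports so that streaming and
coherent allocations share the same addressing limits:

    /* Try 64-bit DMA first, then fall back to 32-bit. */
    if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
        dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
            return -EIO;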

Signed-off-by: Lianbo Jiang 
Acked-by: Don Brace 
Tested-by: Don Brace 
---
Changes since v1:
1. Add the extra description suggested by Tom to patch log.
2. Add Don's Acked-by and Tested-by to the commit. 

 drivers/scsi/smartpqi/smartpqi_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index c26cac819f9e..8b1fde6c7dab 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -7282,7 +7282,7 @@ static int pqi_pci_init(struct pqi_ctrl_info *ctrl_info)
else
mask = DMA_BIT_MASK(32);
 
-   rc = dma_set_mask(&ctrl_info->pci_dev->dev, mask);
+   rc = dma_set_mask_and_coherent(&ctrl_info->pci_dev->dev, mask);
if (rc) {
dev_err(&ctrl_info->pci_dev->dev, "failed to set DMA mask\n");
goto disable_device;
-- 
2.17.1



[PATCH] scsi: smartpqi: properly set both the DMA mask and the coherent DMA mask in pqi_pci_init()

2019-05-22 Thread Lianbo Jiang
When SME is enabled, the smartpqi driver won't work on the HP DL385
G10 machine, causing kernel boot to fail because the driver cannot
allocate the PQI error buffer. Please refer to the kernel log:

[9.431749] usbcore: registered new interface driver uas
[9.441524] Microsemi PQI Driver (v1.1.4-130)
[9.442956] i40e :04:00.0: fw 6.70.48768 api 1.7 nvm 10.2.5
[9.447237] smartpqi :23:00.0: Microsemi Smart Family Controller found
 Starting dracut initqueue hook...
[  OK  ] Started Show Plymouth Boot Scre[9.471654] Broadcom NetXtreme-C/E driver bnxt_en v1.9.1
en.
[  OK  ] Started Forward Password Requests to Plymouth Directory Watch.
[[0;[9.487108] smartpqi :23:00.0: failed to allocate PQI error buffer

[  139.050544] dracut-initqueue[949]: Warning: dracut-initqueue timeout - starting timeout scripts
[  139.589779] dracut-initqueue[949]: Warning: dracut-initqueue timeout - starting timeout scripts

For correct operation, let's call dma_set_mask_and_coherent() to
properly set the mask for both streaming and coherent DMA, in order to
inform the kernel about the device's DMA addressing capabilities.

Signed-off-by: Lianbo Jiang 
---
 drivers/scsi/smartpqi/smartpqi_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index c26cac819f9e..8b1fde6c7dab 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -7282,7 +7282,7 @@ static int pqi_pci_init(struct pqi_ctrl_info *ctrl_info)
else
mask = DMA_BIT_MASK(32);
 
-   rc = dma_set_mask(&ctrl_info->pci_dev->dev, mask);
+   rc = dma_set_mask_and_coherent(&ctrl_info->pci_dev->dev, mask);
if (rc) {
dev_err(&ctrl_info->pci_dev->dev, "failed to set DMA mask\n");
goto disable_device;
-- 
2.17.1



[PATCH 3/3 v9] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2019-03-21 Thread Lianbo Jiang
At present, when using the kexec_file_load syscall to load the kernel image
and initramfs (for example: kexec -s -p xxx), the kernel does not pass the
e820 reserved ranges to the second kernel, which might cause two problems:

The first one is the MMCONFIG issue. The basic problem is that this device
is in PCI segment 1 and the kernel PCI probing can not find it without all
the e820 I/O reservations being present in the e820 table. And the kdump
kernel does not have those reservations because the kexec command does not
pass the I/O reservation via the "memmap=xxx" command line option. (This
problem does not show up for other vendors, as SGI is apparently the only
one with devices in a non-zero segment; the probing actually fails for
everyone, but devices in segment 0 are then found by some legacy lookup
method.) The workaround for this is to pass the I/O reserved regions to
the kdump kernel.

MMCONFIG (aka ECAM) space is described in the ACPI MCFG table. If you don't
have ECAM: (a) PCI devices won't work at all on non-x86 systems that use
only ECAM for config access, (b) you won't be able to access devices on
non-0 segments, (c) you won't be able to access extended config space
(addresses 0x100-0xfff), which means none of the Extended Capabilities will
be available (AER, ACS, ATS, etc). [Bjorn's comment]

The second issue is that the SME kdump kernel doesn't work without the
e820 reserved ranges. When SME is active in the kdump kernel, those
reserved regions are actually still decrypted, but because they are not
present at all in the kdump kernel's e820 table, they are treated as
encrypted, which goes wrong.

The e820 reserved ranges are useful in the kdump kernel, so it is
necessary to pass them to the kdump kernel.
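
For context, the callback used with walk_iomem_res_desc() turns each
walked resource into an e820 entry, roughly like this (a sketch of
memmap_entry_callback() as used by this code, simplified):

    static int memmap_entry_callback(struct resource *res, void *arg)
    {
            struct crash_memmap_data *cmd = arg;
            struct boot_params *params = cmd->params;
            struct e820_entry ei;

            ei.addr = res->start;
            ei.size = resource_size(res);
            ei.type = cmd->type;    /* E820_TYPE_RESERVED in the hunk below */
            add_e820_entry(params, &ei);

            return 0;
    }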

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 17ffc869cab8..1db2754df9e9 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -381,6 +381,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, ,
memmap_entry_callback);
 
+   /* Add e820 reserved ranges */
+   cmd.type = E820_TYPE_RESERVED;
+   flags = IORESOURCE_MEM;
+   walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, ,
+  memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1



[PATCH 1/3 v9] x86/mm: Change the examination condition to avoid confusion

2019-03-21 Thread Lianbo Jiang
Following commit 0e4c12b45aa8 ("x86/mm, resource: Use PAGE_KERNEL
protection for ioremap of memory pages"), __ioremap_check_desc_other()
is really checking for the IORES_DESC_ACPI_* values. Therefore, change
the check condition to make this explicit and avoid confusion.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/mm/ioremap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0029604af8a4..0e3ba620612d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -83,7 +83,8 @@ static bool __ioremap_check_ram(struct resource *res)
 
 static int __ioremap_check_desc_other(struct resource *res)
 {
-   return (res->desc != IORES_DESC_NONE);
+   return ((res->desc == IORES_DESC_ACPI_TABLES) ||
+   (res->desc == IORES_DESC_ACPI_NV_STORAGE));
 }
 
 static int __ioremap_res_check(struct resource *res, void *arg)
-- 
2.17.1



[tip:x86/kdump] kdump: Document kernel data exported in the vmcoreinfo note

2019-01-15 Thread tip-bot for Lianbo Jiang
Commit-ID:  f263245a0ce2c4e23b89a58fa5f7dfc048e11929
Gitweb: https://git.kernel.org/tip/f263245a0ce2c4e23b89a58fa5f7dfc048e11929
Author: Lianbo Jiang 
AuthorDate: Thu, 10 Jan 2019 20:19:43 +0800
Committer:  Borislav Petkov 
CommitDate: Tue, 15 Jan 2019 11:05:28 +0100

kdump: Document kernel data exported in the vmcoreinfo note

Document data exported in vmcoreinfo and briefly describe its use by
userspace tools.

 [ bp: heavily massage and redact the text. ]

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Cc: Andrew Morton 
Cc: Baoquan He 
Cc: Dave Young 
Cc: Jonathan Corbet 
Cc: Thomas Gleixner 
Cc: Vivek Goyal 
Cc: ander...@redhat.com
Cc: k-ha...@ab.jp.nec.com
Cc: ke...@lists.infradead.org
Cc: linux-...@vger.kernel.org
Cc: mi...@redhat.com
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190110121944.6050-2-liji...@redhat.com
---
 Documentation/kdump/vmcoreinfo.txt | 495 +
 1 file changed, 495 insertions(+)

diff --git a/Documentation/kdump/vmcoreinfo.txt b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..bb94a4bd597a
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,495 @@
+
+   VMCOREINFO
+
+
+===
+What is it?
+===
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built. For example, crash uses it to
+find the corresponding vmlinux in order to process vmcore.
+
+PAGE_SIZE
+-
+
+The size of a page. It is the smallest unit of data used by the memory
+management facilities. It is usually 4096 bytes in size and a page is
+aligned on 4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+---
+
+The UTS namespace which is used to isolate two specific elements of the
+system that relate to the uname(2) system call. It is named after the
+data structure used to store information returned by the uname(2) system
+call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---
+
+An array node_states[N_ONLINE] which represents the set of online nodes
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+-
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+--
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--
+
+Stores the virtual area list. makedumpfile gets the vmalloc start value
+from this variable and its value is necessary for vmalloc translation.
+
+mem_map
+---
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+
+
+Makedumpfile gets the pglist_data structure from this symbol, which is
+used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them are used to translate an
+address.
+
+page
+
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute contiguous memory.
+
+pglist_data
+---
+
+The size of a pglist_data structure. This value is used to check if the
+pglist_data structure is valid. It is also used for checking the memory
+type.
+
+zone
+
+
+The size of a zone structure. This value is used to check if the zone
+structure has been found. It is also used for excluding free pages.
+
+free_area
+-
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful when excluding

[tip:x86/kdump] x86/kdump: Export the SME mask to vmcoreinfo

2019-01-11 Thread tip-bot for Lianbo Jiang
Commit-ID:  65f750e5457aef9a8085a99d613fea0430303e93
Gitweb: https://git.kernel.org/tip/65f750e5457aef9a8085a99d613fea0430303e93
Author: Lianbo Jiang 
AuthorDate: Thu, 10 Jan 2019 20:19:44 +0800
Committer:  Borislav Petkov 
CommitDate: Fri, 11 Jan 2019 16:09:25 +0100

x86/kdump: Export the SME mask to vmcoreinfo

On AMD SME machines, makedumpfile tools need to know whether the crashed
kernel was encrypted.

If SME is enabled in the first kernel, the crashed kernel's page table
entries (pgd/pud/pmd/pte) contain the memory encryption mask which
makedumpfile needs to remove in order to obtain the true physical
address.

Export that mask in a vmcoreinfo variable.

 [ bp: Massage commit message and move define at the end of the
   function. ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Andrew Morton 
Cc: Baoquan He 
Cc: Dave Young 
Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: Tom Lendacky 
Cc: ander...@redhat.com
Cc: k-ha...@ab.jp.nec.com
Cc: ke...@lists.infradead.org
Cc: linux-...@vger.kernel.org
Cc: x86-ml 
Link: https://lkml.kernel.org/r/20190110121944.6050-3-liji...@redhat.com
---
 arch/x86/kernel/machine_kexec_64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..ceba408ea982 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,6 +352,8 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
@@ -364,6 +366,7 @@ void arch_crash_save_vmcoreinfo(void)
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
  kaslr_offset());
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+   VMCOREINFO_NUMBER(sme_mask);
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */


[PATCH 1/2 v6] kdump: add the vmcoreinfo documentation

2019-01-10 Thread Lianbo Jiang
This document lists some variables that are exported to vmcoreinfo, and
briefly describes what these variables indicate. It should be instructive
for many people who do not know vmcoreinfo.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 500 +
 1 file changed, 500 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..8e444586b87b
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,500 @@
+
+   VMCOREINFO
+
+
+===
+What is the VMCOREINFO?
+===
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built.
+
+PAGE_SIZE
+-
+
+The size of a page. It is the smallest unit of data for memory
+management in the kernel. It is usually 4096 bytes and a page is aligned
+on 4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+---
+
+This is the UTS namespace, which is used to isolate two specific
+elements of the system that relate to the uname(2) system call. The UTS
+namespace is named after the data structure used to store information
+returned by the uname(2) system call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---
+
+An array node_states[N_ONLINE] which represents the set of online nodes
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+-
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+--
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--
+
+Stores the virtual area list. makedumpfile can get the vmalloc start
+value from this variable. This value is necessary for vmalloc translation.
+
+mem_map
+---
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+
+
+Makedumpfile can get the pglist_data structure from this symbol, which
+is used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them are used to translate an
+address.
+
+page
+
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute the contiguous memory.
+
+pglist_data
+---
+
+The size of a pglist_data structure. This value will be used to check
+if the pglist_data structure is valid. It is also used for checking the
+memory type.
+
+zone
+
+
+The size of a zone structure. This value is often used to check if the
+zone structure has been found. It is also used for excluding free pages.
+
+free_area
+-
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful for excluding free pages.
+
+list_head
+-
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+--
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+   compound_order|compound_head)
+---
+
+User-space tools can compute their values based on the offset of these
+variables. The variables are helpful to exclude unnecessary pages.
+
+(pglist_data, node_zones|nr_zones|node_mem_map

[PATCH 2/2 v6] kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

2019-01-10 Thread Lianbo Jiang
For AMD machines with the SME feature, makedumpfile tools need to know
whether the crashed kernel was encrypted. If SME is enabled in the first
kernel, the crashed kernel's page table (pgd/pud/pmd/pte) contains the
memory encryption mask, so makedumpfile needs to remove the SME mask to
obtain the true physical address.
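
A user-space consumer would then strip the mask along these lines (a
hypothetical makedumpfile-style helper; the name and exact masking are
illustrative only):

    /* sme_mask is read from the vmcoreinfo note: NUMBER(sme_mask)=... */
    static unsigned long long strip_sme_mask(unsigned long long pte,
                                             unsigned long long sme_mask)
    {
            return pte & ~sme_mask; /* clear the C-bit: true physical address */
    }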

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..bc4108096b18 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,13 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1



[PATCH 0/2 v6] kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

2019-01-10 Thread Lianbo Jiang
This patchset does two things:
a. add a new document for vmcoreinfo

This document lists some variables that are exported to vmcoreinfo, and
briefly describes what these variables indicate. It should be instructive
for many people who do not know vmcoreinfo.

b. export the value of sme mask to vmcoreinfo

For AMD machines with the SME feature, makedumpfile tools need to know
whether the crashed kernel was encrypted. If SME is enabled in the first
kernel, the crashed kernel's page table (pgd/pud/pmd/pte) contains the
memory encryption mask, so makedumpfile needs to remove the SME mask to
obtain the true physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Changes since v2:
1. Improve the vmcoreinfo document, add more descriptions for the
exported variables.
2. Fix spelling errors in the document.

Changes since v3:
1. Still improve the vmcoreinfo document, and make it become more
clear and easy to read.
2. Move sme_mask comments in the code to the vmcoreinfo document.
3. Improve patch log.

Changes since v4:
1. Remove a command that dumping the VMCOREINFO contents from this
   document.
2. Merge the 'PG_buddy' and 'PG_offline' into the PG_* flag in this
   document.
3. Correct some of the mistakes in this document.

Changes since v5:
1. Improve patch log.

Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 500 +
 arch/x86/kernel/machine_kexec_64.c |   3 +
 2 files changed, 503 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1



[PATCH 1/2 v5] kdump: add the vmcoreinfo documentation

2019-01-06 Thread Lianbo Jiang
This document lists some variables that are exported to vmcoreinfo, and
briefly describes what these variables indicate. It should be instructive
for many people who do not know vmcoreinfo, and it also normalizes the
exported variables as a convention between kernel and user-space.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 500 +
 1 file changed, 500 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..8e444586b87b
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,500 @@
+
+   VMCOREINFO
+
+
+===
+What is the VMCOREINFO?
+===
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built.
+
+PAGE_SIZE
+-
+
+The size of a page. It is the smallest unit of data for memory
+management in the kernel. It is usually 4096 bytes and a page is aligned
+on 4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+---
+
+This is the UTS namespace, which is used to isolate two specific
+elements of the system that relate to the uname(2) system call. The UTS
+namespace is named after the data structure used to store information
+returned by the uname(2) system call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---
+
+An array node_states[N_ONLINE] which represents the set of online nodes
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+-
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+--
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--
+
+Stores the virtual area list. makedumpfile can get the vmalloc start
+value from this variable. This value is necessary for vmalloc translation.
+
+mem_map
+---
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+
+
+Makedumpfile can get the pglist_data structure from this symbol, which
+is used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them are used to translate an
+address.
+
+page
+
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute the contiguous memory.
+
+pglist_data
+---
+
+The size of a pglist_data structure. This value will be used to check
+if the pglist_data structure is valid. It is also used for checking the
+memory type.
+
+zone
+
+
+The size of a zone structure. This value is often used to check if the
+zone structure has been found. It is also used for excluding free pages.
+
+free_area
+-
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful for excluding free pages.
+
+list_head
+-
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+--
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+   compound_order|compound_head)
+---
+
+User-space tools can compute their values based on the offset of these
+variables. The variables are helpful to exclude unnecessary pages.

[PATCH 0/2 v5] kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

2019-01-06 Thread Lianbo Jiang
This patchset does two things:
a. add a new document for vmcoreinfo

This document lists some variables that are exported to vmcoreinfo, and
briefly describes what these variables indicate. It should be instructive
for many people who do not know vmcoreinfo, and it also normalizes the
exported variables as a convention between kernel and user-space.

b. export the value of sme mask to vmcoreinfo

For AMD machines with the SME feature, makedumpfile tools need to know
whether the crashed kernel was encrypted. If SME is enabled in the first
kernel, the crashed kernel's page table (pgd/pud/pmd/pte) contains the
memory encryption mask, so the SME mask needs to be removed to obtain the
true physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Changes since v2:
1. Improve the vmcoreinfo document, add more descriptions for the
exported variables.
2. Fix spelling errors in the document.

Changes since v3:
1. Still improve the vmcoreinfo document, and make it clearer and
easier to read.
2. Move sme_mask comments in the code to the vmcoreinfo document.
3. Improve patch log.

Changes since v4:
1. Remove a command that dumping the VMCOREINFO contents from this
   document.
2. Merge the 'PG_buddy' and 'PG_offline' into the PG_* flag in this
   document.
3. Correct some of the mistakes in this document.


Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 500 +
 arch/x86/kernel/machine_kexec_64.c |   3 +
 2 files changed, 503 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1
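
As a worked illustration of the mask removal described above, here is a
minimal stand-alone C sketch; the mask value, the PTE value and the
pte_to_phys() helper are examples for illustration, not makedumpfile's
or crash's actual code:

#include <stdint.h>
#include <stdio.h>

#define PTE_PFN_MASK 0x000ffffffffff000ULL  /* assumed x86-64 PTE layout */

/* strip the SME encryption mask, then keep only the PFN bits */
static uint64_t pte_to_phys(uint64_t pte, uint64_t sme_mask)
{
        return (pte & ~sme_mask) & PTE_PFN_MASK;
}

int main(void)
{
        uint64_t sme_mask = 1ULL << 47;            /* example: enc bit 47 */
        uint64_t pte = (1ULL << 47) | 0x67000ULL;  /* example encrypted PTE */

        printf("true phys = %#llx\n",
               (unsigned long long)pte_to_phys(pte, sme_mask));
        return 0;
}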



[PATCH 2/2 v5] kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

2019-01-06 Thread Lianbo Jiang
For AMD machines with the SME feature, the makedumpfile tool needs to
know whether the crashed kernel's memory was encrypted. If SME is enabled
in the first kernel, the crashed kernel's page tables (pgd/pud/pmd/pte)
contain the memory encryption mask, which must be removed to obtain the
true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..bc4108096b18 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,13 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1
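
A hedged sketch of how a user-space consumer might pick up the value
exported above; the vmcoreinfo note is plain "NAME=value" text, and
parse_sme_mask() is a hypothetical helper, not part of makedumpfile or
crash:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static uint64_t parse_sme_mask(const char *vmcoreinfo)
{
        const char *p = strstr(vmcoreinfo, "NUMBER(sme_mask)=");

        if (!p)
                return 0;  /* not advertised: SME off or mask not exported */
        return strtoull(p + strlen("NUMBER(sme_mask)="), NULL, 0);
}

int main(void)
{
        /* made-up note contents; 140737488355328 is 1 << 47 */
        const char *note = "PAGE_SIZE=4096\nNUMBER(sme_mask)=140737488355328\n";

        printf("sme_mask = %#llx\n",
               (unsigned long long)parse_sme_mask(note));
        return 0;
}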



[PATCH 1/2 v4] kdump: add the vmcoreinfo documentation

2018-12-19 Thread Lianbo Jiang
This document lists the variables exported to vmcoreinfo and briefly
describes what each of them indicates. It should be instructive for
people who are unfamiliar with vmcoreinfo, and it also normalizes the
exported variables as a convention between kernel and user space.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 513 +
 1 file changed, 513 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt 
b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..1f1f69143600
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,513 @@
+
+   VMCOREINFO
+
+
+===
+What is the VMCOREINFO?
+===
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+To dump the VMCOREINFO contents, one can do:
+
+# makedumpfile -g VMCOREINFO -x vmlinux
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built.
+
+PAGE_SIZE
+-
+
+The size of a page. It is the smallest unit of data for memory
+management in kernel. It is usually 4096 bytes and a page is aligned on
+4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+---
+
+This is the UTS namespace, which is used to isolate two specific
+elements of the system that relate to the uname(2) system call. The UTS
+namespace is named after the data structure used to store information
+returned by the uname(2) system call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---
+
+An array node_states[N_ONLINE] which represents the set of online nodes
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+-
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+--
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--
+
+Stores the virtual area list. makedumpfile can get the vmalloc start
+value from this variable. This value is necessary for vmalloc translation.
+
+mem_map
+---
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+
+
+Makedumpfile can get the pglist_data structure from this symbol, which
+is used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable; both of them are used to translate an
+address.
+
+page
+
+
+The size of a page structure. struct page is an important data structure
+and is widely used when computing contiguous memory.
+
+pglist_data
+---
+
+The size of a pglist_data structure. This value will be used to check
+if the pglist_data structure is valid. It is also used for checking the
+memory type.
+
+zone
+
+
+The size of a zone structure. This value is often used to check if the
+zone structure has been found. It is also used for excluding free pages.
+
+free_area
+-
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful for excluding free pages.
+
+list_head
+-
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+--
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+   compound_order|compound_head)
+---
+
+User-space tools
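
The mem_map description above can be condensed into a tiny stand-alone
sketch of the flat-model translation; PAGE_SHIFT and the stub struct
page below are assumptions for illustration, not the kernel's
definitions:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumed 4 KiB pages */

struct page { uint64_t flags; };        /* stub; the real struct is larger */

/* flat model: shift the physical address down to a PFN, index mem_map */
static struct page *phys_to_page(struct page *mem_map, uint64_t paddr)
{
        return mem_map + (paddr >> PAGE_SHIFT);
}

int main(void)
{
        static struct page fake_mem_map[32];
        struct page *pg = phys_to_page(fake_mem_map, 0x5000);

        printf("page index = %td\n", pg - fake_mem_map);  /* prints 5 */
        return 0;
}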

[PATCH 2/2 v4] kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-19 Thread Lianbo Jiang
For AMD machines with the SME feature, the makedumpfile tool needs to
know whether the crashed kernel's memory was encrypted. If SME is enabled
in the first kernel, the crashed kernel's page tables (pgd/pud/pmd/pte)
contain the memory encryption mask, which must be removed to obtain the
true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..bc4108096b18 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,13 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1



[PATCH 0/2 v4] kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-19 Thread Lianbo Jiang
This patchset does two things:
a. add a new document for vmcoreinfo

This document lists the variables exported to vmcoreinfo and briefly
describes what each of them indicates. It should be instructive for
people who are unfamiliar with vmcoreinfo, and it also normalizes the
exported variables as a convention between kernel and user space.

b. export the value of sme mask to vmcoreinfo

For AMD machines with the SME feature, the makedumpfile tool needs to
know whether the crashed kernel's memory was encrypted. If SME is enabled
in the first kernel, the crashed kernel's page tables (pgd/pud/pmd/pte)
contain the memory encryption mask, which must be removed to obtain the
true physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Changes since v2:
1. Improve the vmcoreinfo document, adding more descriptions for the
exported variables.
2. Fix spelling errors in the document.

Changes since v3:
1. Further improve the vmcoreinfo document, making it clearer and
easier to read.
2. Move sme_mask comments in the code to the vmcoreinfo document.
3. Improve patch log.

Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 513 +
 arch/x86/kernel/machine_kexec_64.c |   3 +
 2 files changed, 516 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1



[PATCH 2/2 v3] kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-16 Thread Lianbo Jiang
For AMD machines with the SME feature, the makedumpfile tool needs to
know whether the crashed kernel's memory was encrypted. If SME is enabled
in the first kernel, the crashed kernel's page tables (pgd/pud/pmd/pte)
contain the memory encryption mask, which must be removed to obtain the
true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..1860fe24117d 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,24 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   /*
+* Currently, the local variable 'sme_mask' stores the value of
+* sme_me_mask(bit 47), and also write the value of sme_mask to
+* the vmcoreinfo.
+* If need, the bit(sme_mask) might be redefined in the future,
+* but the 'bit63' will be reserved.
+* For example:
+* [ misc  ][ enc bit  ][ other misc SME info   ]
+* ____1000______..._
+* 63   59   55   51   47   43   39   35   31   27   ... 3
+*/
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1



[PATCH 0/2 v3] kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-16 Thread Lianbo Jiang
This patchset does two things:
a. add a new document for vmcoreinfo

This document lists the variables exported to vmcoreinfo and briefly
describes what each of them indicates. It should be instructive for
people who are unfamiliar with vmcoreinfo, and it would normalize the
exported variables as a standard ABI between kernel and user space.

b. export the value of sme mask to vmcoreinfo

For AMD machines with the SME feature, the makedumpfile tool needs to
know whether the crashed kernel's memory was encrypted. If SME is enabled
in the first kernel, the crashed kernel's page tables (pgd/pud/pmd/pte)
contain the memory encryption mask, which must be removed to obtain the
true physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Changes since v2:
1. Improve the vmcoreinfo document, adding more descriptions for the
exported variables.
2. Fix spelling errors in the document.

Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 456 +
 arch/x86/kernel/machine_kexec_64.c |  14 +
 2 files changed, 470 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1



[PATCH 1/2 v3] kdump: add the vmcoreinfo documentation

2018-12-16 Thread Lianbo Jiang
This document lists the variables exported to vmcoreinfo and briefly
describes what each of them indicates. It should be instructive for
people who are unfamiliar with vmcoreinfo, and it would normalize the
exported variables as a standard ABI between kernel and user space.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 456 +
 1 file changed, 456 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt 
b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..d71260bf383a
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,456 @@
+
+   Documentation for VMCOREINFO
+
+
+===
+What is the VMCOREINFO?
+===
+It is a special ELF note section. VMCOREINFO contains various information
+about the first kernel, for example, structure sizes, the page size,
+symbol values and field offsets. These data are packed into an ELF note
+section and help user-space tools (e.g. crash, makedumpfile) analyze the
+first kernel's memory usage.
+
+In general, makedumpfile can dump the VMCOREINFO contents from vmlinux
+in the first kernel. For example:
+# makedumpfile -g VMCOREINFO -x vmlinux
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+The OS release number. Based on this version number, people can find
+the source code for the corresponding version. When analyzing the vmcore,
+people must read the source code to find out why the kernel crashed.
+
+PAGE_SIZE
+=
+The size of a page. It is the smallest unit of data for memory management
+in the kernel. It is usually 4096 bytes and a page is aligned on 4096
+bytes, which is important for computing addresses.
+
+init_uts_ns
+===
+This is the UTS namespace, which is used to isolate two specific elements
+of the system that relate to the uname system call. The UTS namespace is
+named after the data structure used to store information returned by the
+uname system call.
+
+User-space tools can get the kernel name, host name, kernel release number,
+kernel version, architecture name and OS type from the 'init_uts_ns'.
+
+node_online_map
+===
+It is a macro definition; it actually refers to the array
+node_states[N_ONLINE], which represents the set of online nodes in a
+system, one bit position per node number.
+
+This is used to keep track of which nodes are in the system and online.
+
+swapper_pg_dir
+=
+It generally indicates the kernel's pgd. When the MMU is enabled in the
+kernel configuration, 'swapper_pg_dir' is valid.
+
+The 'swapper_pg_dir' helps to translate the virtual address to a physical
+address.
+
+_stext
+==
+It is an assembler symbol that defines the beginning of the text section.
+In general, '_stext' indicates the kernel start address. This is used to
+convert a virtual address to a physical address when the virtual address
+does not belong to the 'vmalloc' area.
+
+vmap_area_list
+==
+It stores the virtual area list; makedumpfile can get the vmalloc start
+value from this variable. This value is necessary for vmalloc translation.
+
+mem_map
+===
+Physical addresses are translated to struct pages by treating them as an
+index into the mem_map array. Shifting a physical address PAGE_SHIFT bits
+to the right will treat it as a PFN from physical address 0, which is also
+an index within the mem_map array.
+
+In short, it can map the address to struct page.
+
+contig_page_data
+
+Makedumpfile can get the pglist_data structure from this symbol
+'contig_page_data'. The pglist_data structure is used to describe the
+memory layout.
+
+User-space tools can use this symbol for excluding free pages.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+==
+Export the address of the 'mem_section' array, its length, the structure
+size, and the 'section_mem_map' offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable; both of them help to translate
+addresses.
+
+page
+
+The size of a 'page' structure. In the kernel, the page is an important
+data structure, widely used when computing contiguous memory.
+
+pglist_data
+===
+The size of a 'pglist_data' structure. This value will be used to check if
+the 'pglist_data' structure is valid. It is also one of the conditions for
+checking the memory type.
+
+zone
+
+The size of a 'zone' structure. This value is often used to check if the
+'zone' structure has been found. It is a necessary structure for excluding
+free pages.
+
+free_area

[PATCH 0/2 v7] add reserved e820 ranges to the kdump kernel e820 table

2018-11-15 Thread Lianbo Jiang
These patches add the new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces and also pass the e820 reserved
ranges to kdump kernel.

At present, when using the kexec_file_load syscall to load the kernel image
and initramfs (for example: kexec -s -p xxx), the upstream kernel does not
pass the e820 reserved ranges to the second kernel, which may cause two
problems:

The first is the MMCONFIG issue. Although it does not make the system
crash or hang, it is still a potential risk and may prevent hot-plugged
devices from being recognized in the kdump kernel, because PCI MMCONFIG
(extended mode) requires the reserved region and otherwise falls back to
legacy mode. For example, the kdump kernel outputs the following log.

Example:
..
[   19.798354] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[   19.800653] [Firmware Info]: PCI: MMCONFIG at [mem 0x8000-0x8fff] 
not reserved in ACPI motherboard resources
[   19.800995] PCI: not using MMCONFIG
..

The correct kernel log is like this:
..
[0.082649] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[0.083610] PCI: MMCONFIG at [mem 0x8000-0x8fff] reserved in E820
..

The second issue is that the e820 reserved ranges are not set up in the
kdump kernel, which causes some functions that rely on the e820 reserved
ranges to misbehave. For example:

early_memremap()->
early_memremap_pgprot_adjust()->
memremap_should_map_decrypted()->
e820__get_entry_type()

Please focus on these functions, early_memremap_pgprot_adjust() and
memremap_should_map_decrypted().

In the first kernel, these ranges sit in the e820 reserved ranges, so
memremap_should_map_decrypted() will return true, that is to say, the
reserved memory is decrypted, and then early_memremap_pgprot_adjust()
will call pgprot_decrypted() to clear the memory encryption mask.

In the second kernel, because the e820 reserved ranges are not passed
along, these ranges no longer sit in the e820 reserved ranges, so
memremap_should_map_decrypted() will return false, that is to say, the
reserved memory is treated as encrypted, and then
early_memremap_pgprot_adjust() will call pgprot_encrypted() to set the
memory encryption mask.

In fact, in the second kernel the e820 reserved memory is still
decrypted, so something has clearly gone wrong. This issue must be
fixed, otherwise kdump won't work in this case.

The e820 reserved ranges are useful in the kdump kernel, so it is
necessary to pass them to the kdump kernel.

Changes since v1:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Fixed the invalid SOB chain issue.

Changes since v3:
1. Dropped [PATCH 1/3 v3] resource: fix an error which walks through iomem
   resources. Please refer to this commit <010a93bf97c7> "resource: Fix
   find_next_iomem_res() iteration issue"

Changes since v4:
1. Improve the patch log, and add kernel log.

Changes since v5:
1. Rewrote the patch logs.

Changes since v6:
1. Modify the [PATCH 1/2], and add the new I/O resource descriptor
   'IORES_DESC_RESERVED' for the iomem resources search interfaces.
2. Modify the [PATCH 2/2], and walk through io resource based on the
   new descriptor 'IORES_DESC_RESERVED'.

Lianbo Jiang (2):
  resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/x86/kernel/crash.c | 6 ++
 arch/x86/kernel/e820.c  | 2 +-
 include/linux/ioport.h  | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

-- 
2.17.1
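
The call chain above reduces to one predicate; here is a simplified
stand-alone sketch of the decision the fix restores, not the kernel's
actual memremap_should_map_decrypted() body, with e820_type_of() as a
toy stand-in for e820__get_entry_type():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum e820_type { E820_TYPE_RAM = 1, E820_TYPE_RESERVED = 2 /* ... */ };

/* toy stand-in: treat the MMCONFIG window from the log as reserved */
static enum e820_type e820_type_of(uint64_t start, uint64_t end)
{
        return (start >= 0x80000000ULL && end <= 0x8fffffffULL)
                ? E820_TYPE_RESERVED : E820_TYPE_RAM;
}

/* reserved firmware ranges must be mapped decrypted in both kernels */
static bool should_map_decrypted(uint64_t start, uint64_t size)
{
        return e820_type_of(start, start + size - 1) == E820_TYPE_RESERVED;
}

int main(void)
{
        printf("%d\n", should_map_decrypted(0x80000000ULL, 0x1000));  /* 1 */
        return 0;
}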



[tip:x86/mm] kdump, proc/vmcore: Enable kdumping encrypted memory with SME enabled

2018-10-06 Thread tip-bot for Lianbo Jiang
Commit-ID:  992b649a3f013465d8128da02e5449def662a4c3
Gitweb: https://git.kernel.org/tip/992b649a3f013465d8128da02e5449def662a4c3
Author: Lianbo Jiang 
AuthorDate: Sun, 30 Sep 2018 16:37:41 +0800
Committer:  Borislav Petkov 
CommitDate: Sat, 6 Oct 2018 12:09:26 +0200

kdump, proc/vmcore: Enable kdumping encrypted memory with SME enabled

In the kdump kernel, the memory of the first kernel needs to be dumped
into the vmcore file.

If SME is enabled in the first kernel, the old memory has to be remapped
with the memory encryption mask in order to access it properly.

Split copy_oldmem_page() functionality to handle encrypted memory
properly.

 [ bp: Heavily massage everything. ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Cc: ke...@lists.infradead.org
Cc: t...@linutronix.de
Cc: mi...@redhat.com
Cc: h...@zytor.com
Cc: a...@linux-foundation.org
Cc: dan.j.willi...@intel.com
Cc: bhelg...@google.com
Cc: baiyao...@cmss.chinamobile.com
Cc: ti...@suse.de
Cc: brijesh.si...@amd.com
Cc: dyo...@redhat.com
Cc: b...@redhat.com
Cc: jroe...@suse.de
Link: https://lkml.kernel.org/r/be7b47f9-6be6-e0d1-2c2a-9125bc74b...@redhat.com
---
 arch/x86/kernel/crash_dump_64.c | 60 -
 fs/proc/vmcore.c| 24 -
 include/linux/crash_dump.h  |  4 +++
 3 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e0778feac..eb8ab3915268 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -11,40 +11,62 @@
 #include 
 #include 
 
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-   size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf,
+ bool encrypted)
 {
void  *vaddr;
 
if (!csize)
return 0;
 
-   vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+   if (encrypted)
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, 
PAGE_SIZE);
+   else
+   vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, 
PAGE_SIZE);
+
if (!vaddr)
return -ENOMEM;
 
if (userbuf) {
-   if (copy_to_user(buf, vaddr + offset, csize)) {
-   iounmap(vaddr);
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
 
set_iounmap_nonlazy();
-   iounmap(vaddr);
+   iounmap((void __iomem *)vaddr);
return csize;
 }
+
+/**
+ * copy_oldmem_page - copy one page of memory
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from the old kernel's memory. For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+unsigned long offset, int userbuf)
+{
+   return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
+}
+
+/**
+ * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap 
the
+ * memory with the encryption mask set to accommodate kdump on SME-enabled
+ * machines.
+ */
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+  unsigned long offset, int userbuf)
+{
+   return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cbde728f8ac6..42c32d06f7da 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -24,6 +24,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #incl
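
The fs/proc/vmcore.c hunk above breaks off mid-include; per the commit
message, the oldmem read path ends up choosing between the two helpers
roughly as in this hedged fragment (the 'encrypted' flag and the other
variable names are illustrative, not the verbatim kernel code):

        /* pick the mapping helper according to the encryption state */
        if (encrypted)
                tmp = copy_oldmem_page_encrypted(pfn, buf, nr_bytes,
                                                 offset, userbuf);
        else
                tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf);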

[tip:x86/mm] iommu/amd: Remap the IOMMU device table with the memory encryption mask for kdump

2018-10-06 Thread tip-bot for Lianbo Jiang
Commit-ID:  8780158cf977ea5f9912931a30b3d575b36dba22
Gitweb: https://git.kernel.org/tip/8780158cf977ea5f9912931a30b3d575b36dba22
Author: Lianbo Jiang 
AuthorDate: Sun, 30 Sep 2018 11:10:32 +0800
Committer:  Borislav Petkov 
CommitDate: Sat, 6 Oct 2018 12:08:24 +0200

iommu/amd: Remap the IOMMU device table with the memory encryption mask for 
kdump

The kdump kernel copies the IOMMU device table from the old device table
which is encrypted when SME is enabled in the first kernel. So remap the
old device table with the memory encryption mask in the kdump kernel.

 [ bp: Massage commit message. ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Reviewed-by: Tom Lendacky 
Acked-by: Joerg Roedel 
Cc: ke...@lists.infradead.org
Cc: t...@linutronix.de
Cc: mi...@redhat.com
Cc: h...@zytor.com
Cc: a...@linux-foundation.org
Cc: dan.j.willi...@intel.com
Cc: bhelg...@google.com
Cc: baiyao...@cmss.chinamobile.com
Cc: ti...@suse.de
Cc: brijesh.si...@amd.com
Cc: dyo...@redhat.com
Cc: b...@redhat.com
Link: https://lkml.kernel.org/r/20180930031033.22110-4-liji...@redhat.com
---
 drivers/iommu/amd_iommu_init.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 84b3e4445d46..3931c7de7c69 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -902,12 +902,22 @@ static bool copy_device_table(void)
}
}
 
-   old_devtb_phys = entry & PAGE_MASK;
+   /*
+* When SME is enabled in the first kernel, the entry includes the
+* memory encryption mask(sme_me_mask), we must remove the memory
+* encryption mask to obtain the true physical address in kdump kernel.
+*/
+   old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
if (old_devtb_phys >= 0x1ULL) {
pr_err("The address of old device table is above 4G, not 
trustworthy!\n");
return false;
}
-   old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+   old_devtb = (sme_active() && is_kdump_kernel())
+   ? (__force void *)ioremap_encrypted(old_devtb_phys,
+   dev_table_size)
+   : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
if (!old_devtb)
return false;
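
For reference, the __sme_clr() used above amounts to masking off the
encryption bit(s); a stand-alone sketch with an example mask value (in
the kernel the mask is the global sme_me_mask, set up at boot):

#include <stdint.h>
#include <stdio.h>

static const uint64_t sme_me_mask = 1ULL << 47;  /* example: enc bit 47 */

/* sketch of what __sme_clr(x) boils down to: (x) & ~sme_me_mask */
static uint64_t sme_clr(uint64_t val)
{
        return val & ~sme_me_mask;
}

int main(void)
{
        uint64_t entry = (1ULL << 47) | 0x12345000ULL;  /* encrypted entry */

        printf("%#llx\n", (unsigned long long)sme_clr(entry));
        return 0;
}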
 


[tip:x86/mm] kexec: Allocate decrypted control pages for kdump if SME is enabled

2018-10-06 Thread tip-bot for Lianbo Jiang
Commit-ID:  9cf38d5559e813cccdba8b44c82cc46ba48d0896
Gitweb: https://git.kernel.org/tip/9cf38d5559e813cccdba8b44c82cc46ba48d0896
Author: Lianbo Jiang 
AuthorDate: Sun, 30 Sep 2018 11:10:31 +0800
Committer:  Borislav Petkov 
CommitDate: Sat, 6 Oct 2018 12:01:51 +0200

kexec: Allocate decrypted control pages for kdump if SME is enabled

When SME is enabled in the first kernel, it needs to allocate decrypted
pages for kdump because when the kdump kernel boots, these pages need to
be accessed decrypted in the initial boot stage, before SME is enabled.

 [ bp: clean up text. ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Reviewed-by: Tom Lendacky 
Cc: ke...@lists.infradead.org
Cc: t...@linutronix.de
Cc: mi...@redhat.com
Cc: h...@zytor.com
Cc: a...@linux-foundation.org
Cc: dan.j.willi...@intel.com
Cc: bhelg...@google.com
Cc: baiyao...@cmss.chinamobile.com
Cc: ti...@suse.de
Cc: brijesh.si...@amd.com
Cc: dyo...@redhat.com
Cc: b...@redhat.com
Cc: jroe...@suse.de
Link: https://lkml.kernel.org/r/20180930031033.22110-3-liji...@redhat.com
---
 kernel/kexec_core.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4da38a..86ef06d3dbe3 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,10 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   /* Ensure that these pages are decrypted if SME is enabled. */
+   if (pages)
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+
return pages;
 }
 
@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
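
On x86, the arch_kexec_post_alloc_pages()/arch_kexec_pre_free_pages()
hooks used above come down to flipping the page encryption attribute; a
sketch of that shape (the exact bodies live in machine_kexec_64.c and
may differ in detail):

/* decrypt crash control pages after allocation ... */
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
        return set_memory_decrypted((unsigned long)vaddr, pages);
}

/* ... and re-encrypt them before they are freed */
void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
        set_memory_encrypted((unsigned long)vaddr, pages);
}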


[tip:x86/mm] x86/ioremap: Add an ioremap_encrypted() helper

2018-10-06 Thread tip-bot for Lianbo Jiang
Commit-ID:  c3a7a61c192ec350330128edb13db33a9bc0ace1
Gitweb: https://git.kernel.org/tip/c3a7a61c192ec350330128edb13db33a9bc0ace1
Author: Lianbo Jiang 
AuthorDate: Thu, 27 Sep 2018 15:19:51 +0800
Committer:  Borislav Petkov 
CommitDate: Sat, 6 Oct 2018 11:57:51 +0200

x86/ioremap: Add an ioremap_encrypted() helper

When SME is enabled, the memory is encrypted in the first kernel. In
this case, SME also needs to be enabled in the kdump kernel, and we have
to remap the old memory with the memory encryption mask.

The case of concern here is if SME is active in the first kernel,
and it is active too in the kdump kernel. There are four cases to be
considered:

a. dump vmcore
   It is encrypted in the first kernel, and needs to be read out in the
   kdump kernel.

b. crash notes
   When dumping the vmcore, people usually need to read useful
   information from the notes, and the notes are also encrypted.

c. iommu device table
   It's encrypted in the first kernel; the kdump kernel needs to access
   its content to analyze it and get the information it needs.

d. mmio of AMD iommu
   not encrypted in either kernel

Add a new bool parameter @encrypted to __ioremap_caller(). If set,
memory will be remapped with the SME mask.

Add a new function ioremap_encrypted() to explicitly pass in a true
value for @encrypted. Use ioremap_encrypted() for the above a, b, c
cases.

 [ bp: cleanup commit message, extern defs in io.h and drop forgotten
   include. ]

Signed-off-by: Lianbo Jiang 
Signed-off-by: Borislav Petkov 
Reviewed-by: Tom Lendacky 
Cc: ke...@lists.infradead.org
Cc: t...@linutronix.de
Cc: mi...@redhat.com
Cc: h...@zytor.com
Cc: a...@linux-foundation.org
Cc: dan.j.willi...@intel.com
Cc: bhelg...@google.com
Cc: baiyao...@cmss.chinamobile.com
Cc: ti...@suse.de
Cc: brijesh.si...@amd.com
Cc: dyo...@redhat.com
Cc: b...@redhat.com
Cc: jroe...@suse.de
Link: https://lkml.kernel.org/r/20180927071954.29615-2-liji...@redhat.com
---
 arch/x86/include/asm/io.h |  3 ++-
 arch/x86/mm/ioremap.c | 24 
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..6df53efcecfd 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t 
offset, unsigned long size)
 #define ioremap_nocache ioremap_nocache
 extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 #define ioremap_uc ioremap_uc
-
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned 
long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..24e0920a9b25 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_

[PATCH v7 RESEND 1/4] x86/ioremap: add a function ioremap_encrypted() to remap kdump old memory

2018-09-27 Thread Lianbo Jiang
When SME is enabled on an AMD machine, the memory is encrypted in the
first kernel. In this case, SME also needs to be enabled in the kdump
kernel, and we have to remap the old memory with the memory encryption
mask.

Here we only discuss the case where SME is active in the first kernel
and also active in the kdump kernel. There are four cases that need to
be considered.

a. dump vmcore
   It is encrypted in the first kernel, and needs to be read out in the
   kdump kernel.

b. crash notes
   When dumping the vmcore, people usually need to read useful
   information from the notes, and the notes are also encrypted.

c. iommu device table
   It is allocated by the kernel and its pointer is filled into the MMIO
   of the AMD IOMMU. It is encrypted in the first kernel; the old content
   needs to be read to analyze and extract useful information.

d. mmio of amd iommu
   A register region reported by AMD firmware; it is not RAM, and it is
   not encrypted in either the first kernel or the kdump kernel.

To achieve this, the solution is:
1. Add a new bool parameter "encrypted" to __ioremap_caller().
   It is a low-level function; it checks the newly added parameter and,
   if it is true, remaps the memory with the SME mask.

2. Add a new function ioremap_encrypted() that explicitly passes in a
   "true" value for "encrypted".
   For cases a, b and c above, we call ioremap_encrypted().

3. Adjust all existing ioremap wrapper functions to pass in "false" for
   "encrypted", so that they behave as before.

   ioremap_encrypted()\
   ioremap_cache() |
   ioremap_prot()  |
   ioremap_wt()|->__ioremap_caller()
   ioremap_wc()|
   ioremap_uc()    |
   ioremap_nocache()  /

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..f8795f9581c7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..e01e6c695add 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -131,7 +132,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +201,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
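
The hunk above is truncated before the new wrapper itself; following
the pattern of the other wrappers, it plausibly looks like this sketch
(a cached write-back mapping with the encrypted flag set):

void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
{
        return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
                                __builtin_return_address(0), true);
}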

[PATCH 0/2] x86/kexec_file: add reserved e820 ranges to the kdump kernel e820 table

2018-09-17 Thread Lianbo Jiang
E820 reserved ranges are useful in the kdump kernel; we have already added
this in the kexec-tools code.

One reason is that PCI mmconf (extended mode) requires a reserved region,
otherwise it falls back to legacy mode.

Furthermore, for AMD SME kdump support, the DMI table area needs to be
mapped unencrypted. For a normal boot these ranges sit in e820 reserved
ranges, thus the early ioremap code naturally maps them as unencrypted. So
if we have the same e820 reserved setup in the kdump kernel, it will just
work like a normal kernel.

Kdump uses walk_iomem_res_desc() to iterate over resources and then adds
each matching descriptor to the e820 table for the kdump kernel.

But the IORES_DESC_NONE resource type covers several different e820 types,
so we need to add the exact e820 type to the kdump kernel e820 table; this
requires an extra check in memmap_entry_callback() to match the e820 type
against the resource name.
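
Condensed, the extra check added in patch 1/2 below amounts to comparing
the canonical e820 type name with the walked resource's name before the
entry is added:

    name = e820_type_to_string(&ei);
    if (!strcmp(name, res->name))
        add_e820_entry(params, &ei);

Only a resource whose name matches the name of the e820 type being
collected makes it into the kdump kernel's e820 table.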

NOTE:
Before verifying these patches, you need to merge the following patch,
which fixes an upstream bug. For more information, refer to the link
below.
https://lore.kernel.org/patchwork/patch/986979/

Lianbo Jiang (2):
  x86/kexec_file: add e820 entry in case e820 type string matches to io
resource name
  x86/kexec_file: add reserved e820 ranges to 2nd kernel e820 table

 arch/x86/include/asm/e820/api.h |  2 ++
 arch/x86/kernel/crash.c | 12 +++-
 arch/x86/kernel/e820.c  |  2 +-
 kernel/resource.c   |  1 +
 4 files changed, 15 insertions(+), 2 deletions(-)

-- 
2.17.1



[PATCH 1/2] x86/kexec_file: add e820 entry in case e820 type string matches to io resource name

2018-09-17 Thread Lianbo Jiang
kdump uses walk_iomem_res_desc() to iterate over io resources and then
adds each matching descriptor to the e820 table for the 2nd kernel.

But the IORES_DESC_NONE resource type covers several different e820 types,
so we need to add the exact e820 type to the 2nd kernel e820 table; this
requires an extra check in memmap_entry_callback() to match the e820 type
against the resource name.

Signed-off-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/e820/api.h | 2 ++
 arch/x86/kernel/crash.c | 6 +-
 arch/x86/kernel/e820.c  | 2 +-
 kernel/resource.c   | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 62be73b23d5c..f5e84fb9fe58 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -42,6 +42,8 @@ extern void e820__register_nosave_regions(unsigned long 
limit_pfn);
 
 extern int  e820__get_entry_type(u64 start, u64 end);
 
+extern const char* e820_type_to_string(struct e820_entry *entry);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained 
inside
  * the ISA region.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..3c113e6545a3 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Used while preparing memory map entries for second kernel */
 struct crash_memmap_data {
@@ -314,11 +315,14 @@ static int memmap_entry_callback(struct resource *res, 
void *arg)
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
+   const char *name;
 
ei.addr = res->start;
ei.size = resource_size(res);
ei.type = cmd->type;
-   add_e820_entry(params, &ei);
+   name = e820_type_to_string(&ei);
+   if (!strcmp(name, res->name))
+   add_e820_entry(params, &ei);
 
return 0;
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c88c23c658c1..3e2fc4845fe7 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1012,7 +1012,7 @@ void __init e820__finish_early_params(void)
}
 }
 
-static const char *__init e820_type_to_string(struct e820_entry *entry)
+const char* e820_type_to_string(struct e820_entry *entry)
 {
switch (entry->type) {
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
diff --git a/kernel/resource.c b/kernel/resource.c
index f5d9fc70a04c..cc90633f35f9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -366,6 +366,7 @@ static int find_next_iomem_res(struct resource *res, 
unsigned long desc,
res->end = p->end;
res->flags = p->flags;
res->desc = p->desc;
+   res->name = p->name;
return 0;
 }
 
-- 
2.17.1



[PATCH 2/2] x86/kexec_file: add reserved e820 ranges to 2nd kernel e820 table

2018-09-17 Thread Lianbo Jiang
e820 reserved ranges are useful in the kdump kernel; we have already added
this in the kexec-tools code.

One reason is that PCI mmconf (extended mode) requires a reserved region,
otherwise it falls back to legacy mode.

For AMD SME kdump support, the DMI table area needs to be mapped
unencrypted. For a normal boot these ranges sit in e820 reserved ranges,
thus the early ioremap code naturally maps them as unencrypted. So if we
have the same e820 reserved setup in the kdump kernel, it will just work
like a normal kernel.

Signed-off-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 3c113e6545a3..db453e9c117b 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -384,6 +384,12 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
memmap_entry_callback);
 
+   /* Add all reserved ranges */
+   cmd.type = E820_TYPE_RESERVED;
+   flags = IORESOURCE_MEM;
+   walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, -1, &cmd,
+   memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1



[PATCH] resource: fix an error which walks through iomem resources

2018-09-17 Thread Lianbo Jiang
When we walk through iomem resources by calling walk_iomem_res_desc(), the
values of the function parameter may be modified in the while loop of
__walk_iomem_res_desc(), which can cause us to miss some of the desired
results.

At present, the loop restores the original value of res->end, but it does
not restore the original value of res->flags. Whenever find_next_iomem_res()
finds a resource and returns the result, the original values of this
resource are modified, which can lead to an error in the next iteration.
For example:

The original value of the resource flags is res->flags=0x8200 (initial value).

p->flags         0x81000200             0x8200
                 ____/\____             __/\__
   |............|____A_____|..........|__B___|............|
   0                                       (memory address ranges)

Note: if ((p->flags & res->flags) != res->flags) continue;

When resource A is found, res->flags is changed to 0x81000200, and the walk
continues to look for the next resource. When the loop reaches resource B,
it can no longer match it (see the for loop in find_next_iomem_res()): the
conditional expression above now evaluates to true and resource B is
skipped. In fact, we should get both resource A and resource B when we walk
through the whole tree, but only resource A is found; resource B is missed.
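
To make the failure mode concrete, here is a minimal standalone sketch of
the buggy walk (not kernel code: the resource tree is faked with a
two-entry array, and the flag values are illustrative):

    #include <stdio.h>

    struct res { unsigned long flags; const char *name; };

    int main(void)
    {
        struct res tree[] = {
            { 0x81000200UL, "A" },  /* carries an extra flag bit */
            { 0x80000200UL, "B" },
        };
        unsigned long want = 0x80000200UL;  /* caller's filter */
        int i;

        for (i = 0; i < 2; i++) {
            /* find_next_iomem_res() skips entries that lack any
               requested flag bit */
            if ((tree[i].flags & want) != want)
                continue;
            printf("found %s\n", tree[i].name);
            /* buggy walk: the found entry's flags leak back into the
               filter, so the next pass also demands A's extra bit;
               restoring orig_flags each iteration avoids this */
            want = tree[i].flags;
        }
        return 0;
    }

This prints only "found A"; with res->flags restored on each iteration,
resource B is found as well.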

Signed-off-by: Lianbo Jiang 
---
 kernel/resource.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..f5d9fc70a04c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -375,6 +375,7 @@ static int __walk_iomem_res_desc(struct resource *res, 
unsigned long desc,
 int (*func)(struct resource *, void *))
 {
u64 orig_end = res->end;
+   u64 orig_flags = res->flags;
int ret = -1;
 
while ((res->start < res->end) &&
@@ -385,6 +386,7 @@ static int __walk_iomem_res_desc(struct resource *res, 
unsigned long desc,
 
res->start = res->end + 1;
res->end = orig_end;
+   res->flags = orig_flags;
}
 
return ret;
-- 
2.17.1



[PATCH 1/4 v7] x86/ioremap: add a function ioremap_encrypted() to remap kdump old memory

2018-09-07 Thread Lianbo Jiang
When SME is enabled on an AMD machine, the memory is encrypted in the first
kernel. In this case, SME also needs to be enabled in the kdump kernel, and
we have to remap the old memory with the memory encryption mask.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..f8795f9581c7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..e01e6c695add 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -131,7 +132,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +201,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +383,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-- 
2.17.1
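
For reference, the rest of the series consumes the new helper when reading
old-kernel memory. A condensed sketch of that call pattern (the full
version is copy_oldmem_page_encrypted() in a later patch of this series):

    void *vaddr = ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);

    if (!vaddr)
        return -ENOMEM;
    memcpy(buf, vaddr + offset, csize);  /* hardware decrypts on read */
    iounmap(vaddr);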



[PATCH 0/2] support kdump for AMD secure memory encryption(sme)

2018-05-14 Thread Lianbo Jiang
It is convenient to remap the old encrypted memory in the second kernel by
calling ioremap_encrypted().

When SME is enabled on an AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we remap the old memory as
encrypted in the second (crash) kernel, where SME must also be enabled;
otherwise the encrypted old memory cannot be decrypted. Simply changing the
value of the C-bit on a page does not automatically encrypt the existing
contents of the page, and any data placed in the page prior to the C-bit
modification becomes unintelligible. A page of memory that is marked
encrypted is automatically decrypted when read from DRAM and automatically
encrypted when written to DRAM.

For kdump, it is necessary to distinguish whether the memory is encrypted,
and to know which parts of the memory are encrypted or decrypted, so that
we can remap the memory appropriately and tell the CPU how to deal with the
data (encrypted or decrypted). For example, with SME enabled, if the old
memory is encrypted, we remap it as encrypted, which automatically decrypts
the old memory when we read the data through the remapped address.

 _______________________________________________________________
| first kernel         | second kernel        | kdump support |
| (mem_encrypt=on|off) | (mem_encrypt=on|off) | (yes|no)      |
|----------------------+----------------------+---------------|
| on                   | on                   | yes           |
| off                  | off                  | yes           |
| on                   | off                  | no            |
| off                  | on                   | no            |
|______________________|______________________|_______________|
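
In practice the series keys both directions off sme_active() in the kdump
kernel; a condensed sketch (both helpers appear in the patches below):

    /* Old RAM was encrypted by the first kernel: map it with the
       C-bit set so the hardware decrypts it on read. */
    vaddr = ioremap_encrypted(paddr, PAGE_SIZE);

    /* Firmware tables (DMI/ACPI) were written unencrypted: map
       them decrypted in the kdump kernel. */
    if (sme_active() && is_kdump_kernel())
        p = early_memremap_decrypted(phys_addr, size);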

Test tools:
makedumpfile[v1.6.3]: https://github.com/LianboJ/makedumpfile
commit e1de103eca8f (A draft for kdump vmcore about AMD SME)
Author: Lianbo Jiang <liji...@redhat.com>
Date:   Mon May 14 17:02:40 2018 +0800
Note: This patch can only dump the vmcore when SME is enabled.

crash-7.2.1: https://github.com/crash-utility/crash.git
commit 1e1bd9c4c1be (Fix for the "bpf" command display on Linux 4.17-rc1)
Author: Dave Anderson <ander...@redhat.com>
Date:   Fri May 11 15:54:32 2018 -0400

Test environment:
HP ProLiant DL385Gen10 AMD EPYC 7251
8-Core Processor
32768 MB memory
600 GB disk space

Linux 4.17-rc4:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
commit 75bc37fefc44 ("Linux 4.17-rc4")
Author: Linus Torvalds <torva...@linux-foundation.org>
Date:   Sun May 6 16:57:38 2018 -1000

Reference:
AMD64 Architecture Programmer's Manual
https://support.amd.com/TechDocs/24593.pdf

Lianbo Jiang (2):
  add a function(ioremap_encrypted) for kdump when AMD sme enabled.
  support kdump when AMD secure memory encryption is active

 arch/x86/include/asm/dmi.h  | 14 +-
 arch/x86/include/asm/io.h   |  2 ++
 arch/x86/kernel/acpi/boot.c |  8 
 arch/x86/kernel/crash_dump_64.c | 27 +++
 arch/x86/mm/ioremap.c   | 25 +
 drivers/acpi/tables.c   | 14 +-
 drivers/iommu/amd_iommu_init.c  |  9 -
 fs/proc/vmcore.c| 36 +++-
 include/linux/crash_dump.h  |  4 
 kernel/kexec_core.c | 12 
 10 files changed, 135 insertions(+), 16 deletions(-)

-- 
2.9.5



[PATCH 1/2] add a function(ioremap_encrypted) for kdump when AMD sme enabled.

2018-05-14 Thread Lianbo Jiang
It is convenient to remap the old encrypted memory in the second kernel
by calling ioremap_encrypted().

Signed-off-by: Lianbo Jiang <liji...@redhat.com>
---
 arch/x86/include/asm/io.h |  2 ++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6e5b93..06d2a9f 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,8 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned 
long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545..7a52d1e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,8 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) ||
+   (encrypted && sme_active()))
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +383,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-- 
2.9.5



[PATCH 2/2] support kdump when AMD secure memory encryption is active

2018-05-14 Thread Lianbo Jiang
When SME is enabled on an AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we remap the old memory as
encrypted in the second (crash) kernel, where SME must also be enabled;
otherwise the encrypted old memory cannot be decrypted. Simply changing the
value of the C-bit on a page does not automatically encrypt the existing
contents of the page, and any data placed in the page prior to the C-bit
modification becomes unintelligible. A page of memory that is marked
encrypted is automatically decrypted when read from DRAM and automatically
encrypted when written to DRAM.

For kdump, it is necessary to distinguish whether the memory is encrypted,
and to know which parts of the memory are encrypted or decrypted, so that
we can remap the memory appropriately and tell the CPU how to deal with the
data (encrypted or unencrypted). For example, with SME enabled, if the old
memory is encrypted, we remap it as encrypted, which automatically decrypts
the old memory when we read the data through the remapped address.

 _______________________________________________________________
| first kernel         | second kernel        | kdump support |
| (mem_encrypt=on|off) | (mem_encrypt=on|off) | (yes|no)      |
|----------------------+----------------------+---------------|
| on                   | on                   | yes           |
| off                  | off                  | yes           |
| on                   | off                  | no            |
| off                  | on                   | no            |
|______________________|______________________|_______________|

Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/dmi.h  | 14 +-
 arch/x86/kernel/acpi/boot.c |  8 
 arch/x86/kernel/crash_dump_64.c | 27 +++
 drivers/acpi/tables.c   | 14 +-
 drivers/iommu/amd_iommu_init.c  |  9 -
 fs/proc/vmcore.c| 36 +++-
 include/linux/crash_dump.h  |  4 
 kernel/kexec_core.c | 12 
 8 files changed, 116 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 0ab2ab2..a5663b4 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -7,6 +7,10 @@
 
 #include 
 #include 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#include 
+#include 
+#endif
 
 static __always_inline __init void *dmi_alloc(unsigned len)
 {
@@ -14,7 +18,15 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
-#define dmi_early_remap        early_memremap
+static __always_inline __init void *dmi_early_remap(resource_size_t
+   phys_addr, unsigned long size)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+   if (sme_active() && is_kdump_kernel())
+   return early_memremap_decrypted(phys_addr, size);
+#endif
+   return early_memremap(phys_addr, size);
+}
 #define dmi_early_unmap        early_memunmap
 #define dmi_remap(_x, _l)  memremap(_x, _l, MEMREMAP_WB)
 #define dmi_unmap(_x)  memunmap(_x)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3b20607..354ad66 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -48,6 +48,10 @@
 #include 
 #include 
 #include 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#include 
+#include 
+#endif
 
 #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
 static int __initdata acpi_force = 0;
@@ -124,6 +128,10 @@ void __init __iomem *__acpi_map_table(unsigned long phys, 
unsigned long size)
if (!phys || !size)
return NULL;
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+   if (sme_active() && is_kdump_kernel())
+   return early_memremap_decrypted(phys, size);
+#endif
return early_memremap(phys, size);
 }
 
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e077..2ef67fc 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -48,3 +48,30 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
iounmap(vaddr);
return csize;
 }
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user(buf, vaddr + offset, csize)) {
+   iounmap(vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   set_iounmap_nonlazy();
+   iounmap(vaddr);
+   return csize;
+}
+#endif
diff --git a/driv