[PATCH 2/2] support kdump when AMD secure memory encryption is active

2018-05-14 Thread Lianbo Jiang
When SME is enabled on an AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we will remap the old memory
as encrypted in the second kernel (crash kernel), where SME is also enabled;
otherwise the encrypted old memory cannot be decrypted. This is because
simply changing the value of the C-bit on a page will not
automatically encrypt the existing contents of the page, and any data in the
page prior to the C-bit modification will become unintelligible. A page of
memory that is marked encrypted will be automatically decrypted when read
from DRAM and will be automatically encrypted when written to DRAM.

For kdump, it is necessary to distinguish whether the memory is
encrypted. Furthermore, we should also know which parts of the memory are
encrypted or decrypted. We will remap the memory appropriately, according
to the specific situation, in order to tell the CPU how to handle the data
(encrypted or unencrypted). For example, when SME is enabled, if the old memory
is encrypted, we will remap the old memory in an encrypted way, which will
automatically decrypt the encrypted old memory when we read that data from
the remapped address.

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

Signed-off-by: Lianbo Jiang <liji...@redhat.com>
---
 arch/x86/include/asm/dmi.h  | 14 +-
 arch/x86/kernel/acpi/boot.c |  8 
 arch/x86/kernel/crash_dump_64.c | 27 +++
 drivers/acpi/tables.c   | 14 +-
 drivers/iommu/amd_iommu_init.c  |  9 -
 fs/proc/vmcore.c| 36 +++-
 include/linux/crash_dump.h  |  4 
 kernel/kexec_core.c | 12 
 8 files changed, 116 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 0ab2ab2..a5663b4 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -7,6 +7,10 @@
 
 #include 
 #include 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#include 
+#include 
+#endif
 
 static __always_inline __init void *dmi_alloc(unsigned len)
 {
@@ -14,7 +18,15 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
-#define dmi_early_remapearly_memremap
+static __always_inline __init void *dmi_early_remap(resource_size_t
+   phys_addr, unsigned long size)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+   if (sme_active() && is_kdump_kernel())
+   return early_memremap_decrypted(phys_addr, size);
+#endif
+   return early_memremap(phys_addr, size);
+}
 #define dmi_early_unmapearly_memunmap
 #define dmi_remap(_x, _l)  memremap(_x, _l, MEMREMAP_WB)
 #define dmi_unmap(_x)  memunmap(_x)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3b20607..354ad66 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -48,6 +48,10 @@
 #include 
 #include 
 #include 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#include 
+#include 
+#endif
 
 #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
 static int __initdata acpi_force = 0;
@@ -124,6 +128,10 @@ void __init __iomem *__acpi_map_table(unsigned long phys, 
unsigned long size)
if (!phys || !size)
return NULL;
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+   if (sme_active() && is_kdump_kernel())
+   return early_memremap_decrypted(phys, size);
+#endif
return early_memremap(phys, size);
 }
 
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e077..2ef67fc 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -48,3 +48,30 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
iounmap(vaddr);
return csize;
 }
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user(buf, vaddr + offset, csize)) {
+   iounmap(vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   set_iounmap_nonlazy();
+   iounmap(vaddr);
+   return csize

[PATCH 1/2] add a function(ioremap_encrypted) for kdump when AMD sme enabled.

2018-05-14 Thread Lianbo Jiang
It is convenient to remap the old memory as encrypted in the second kernel
by calling ioremap_encrypted().

Signed-off-by: Lianbo Jiang <liji...@redhat.com>
---
 arch/x86/include/asm/io.h |  2 ++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6e5b93..06d2a9f 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,8 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned 
long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545..7a52d1e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,8 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) ||
+   (encrypted && sme_active()))
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +383,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2] support kdump for AMD secure memory encryption(sme)

2018-05-14 Thread Lianbo Jiang
It is convenient to remap the old memory encrypted to the second kernel by
calling ioremap_encrypted().

When sme enabled on AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we will remap the old memory
encrypted to the second kernel(crash kernel), and sme is also enabled in
the second kernel, otherwise the old memory encrypted can not be decrypted.
Because simply changing the value of a C-bit on a page will not
automatically encrypt the existing contents of a page, and any data in the
page prior to the C-bit modification will become unintelligible. A page of
memory that is marked encrypted will be automatically decrypted when read
from DRAM and will be automatically encrypted when written to DRAM.

For the kdump, it is necessary to distinguish whether the memory is
encrypted. Furthermore, we should also know which part of the memory is
encrypted or decrypted. We will appropriately remap the memory according
to the specific situation in order to tell cpu how to deal with the
data(encrypted or decrypted). For example, when sme enabled, if the old
memory is encrypted, we will remap the old memory in encrypted way, which
will automatically decrypt the old memory encrypted when we read those data
from the remapping address.

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)| 
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

Test tools:
makedumpfile[v1.6.3]: https://github.com/LianboJ/makedumpfile
commit e1de103eca8f (A draft for kdump vmcore about AMD SME)
Author: Lianbo Jiang <liji...@redhat.com>
Date:   Mon May 14 17:02:40 2018 +0800
Note: This patch can only dump vmcore in the case of SME enabled.

crash-7.2.1: https://github.com/crash-utility/crash.git
commit 1e1bd9c4c1be (Fix for the "bpf" command display on Linux 4.17-rc1)
Author: Dave Anderson <ander...@redhat.com>
Date:   Fri May 11 15:54:32 2018 -0400

Test environment:
HP ProLiant DL385Gen10 AMD EPYC 7251
8-Core Processor
32768 MB memory
600 GB disk space

Linux 4.17-rc4:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
commit 75bc37fefc44 ("Linux 4.17-rc4")
Author: Linus Torvalds <torva...@linux-foundation.org>
Date:   Sun May 6 16:57:38 2018 -1000

Reference:
AMD64 Architecture Programmer's Manual
https://support.amd.com/TechDocs/24593.pdf

Lianbo Jiang (2):
  add a function(ioremap_encrypted) for kdump when AMD sme enabled.
  support kdump when AMD secure memory encryption is active

 arch/x86/include/asm/dmi.h  | 14 +-
 arch/x86/include/asm/io.h   |  2 ++
 arch/x86/kernel/acpi/boot.c |  8 
 arch/x86/kernel/crash_dump_64.c | 27 +++
 arch/x86/mm/ioremap.c   | 25 +
 drivers/acpi/tables.c   | 14 +-
 drivers/iommu/amd_iommu_init.c  |  9 -
 fs/proc/vmcore.c| 36 +++-
 include/linux/crash_dump.h  |  4 
 kernel/kexec_core.c | 12 
 10 files changed, 135 insertions(+), 16 deletions(-)

-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/4 V3] Support kdump for AMD secure memory encryption(SME)

2018-06-16 Thread Lianbo Jiang
It is convenient to remap the old memory encrypted to the second kernel by
calling ioremap_encrypted().

When sme enabled on AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we will remap the old memory
encrypted to the second kernel(crash kernel), and sme is also enabled in
the second kernel, otherwise the old memory encrypted can not be decrypted.
Because simply changing the value of a C-bit on a page will not
automatically encrypt the existing contents of a page, and any data in the
page prior to the C-bit modification will become unintelligible. A page of
memory that is marked encrypted will be automatically decrypted when read
from DRAM and will be automatically encrypted when written to DRAM.

For the kdump, it is necessary to distinguish whether the memory is
encrypted. Furthermore, we should also know which part of the memory is
encrypted or decrypted. We will appropriately remap the memory according
to the specific situation in order to tell cpu how to deal with the
data(encrypted or decrypted). For example, when sme enabled, if the old
memory is encrypted, we will remap the old memory in encrypted way, which
will automatically decrypt the old memory encrypted when we read those data
from the remapping address.

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

This patch is only for SME kdump, it is not support SEV kdump.

Test tools:
makedumpfile[v1.6.3]: https://github.com/LianboJ/makedumpfile
commit e1de103eca8f (A draft for kdump vmcore about AMD SME)
Author: Lianbo Jiang 
Date:   Mon May 14 17:02:40 2018 +0800
Note: This patch can only dump vmcore in the case of SME enabled.

crash-7.2.1: https://github.com/crash-utility/crash.git
commit 1e1bd9c4c1be (Fix for the "bpf" command display on Linux 4.17-rc1)
Author: Dave Anderson 
Date:   Fri May 11 15:54:32 2018 -0400

Test environment:
HP ProLiant DL385Gen10 AMD EPYC 7251
8-Core Processor
32768 MB memory
600 GB disk space

Linux 4.17-rc7:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
commit b04e217704b7 ("Linux 4.17-rc7")
Author: Linus Torvalds 
Date:   Sun May 27 13:01:47 2018 -0700

Reference:
AMD64 Architecture Programmer's Manual
https://support.amd.com/TechDocs/24593.pdf

Some changes:
1. remove the sme_active() check in __ioremap_caller().
2. remove the '#ifdef' stuff throughout this patch.
3. put some logic into the early_memremap_pgprot_adjust() and clean the
previous unnecessary changes, for example: arch/x86/include/asm/dmi.h,
arch/x86/kernel/acpi/boot.c, drivers/acpi/tables.c.
4. add a new file and modify Makefile.
5. clean compile warning in copy_device_table() and some compile error.
6. split the original patch into four patches, it will be better for
review.

Some known issues:
1. about SME
Upstream kernel doesn't work when we use kexec in the follow command. The
system will hang.
(This issue doesn't matter with the kdump patch.)

Reproduce steps:
 # kexec -l /boot/vmlinuz-4.17.0-rc7+ --initrd=/boot/initramfs-4.17.0-rc7+.img 
--command-line="root=/dev/mapper/rhel_hp--dl385g10--03-root ro mem_encrypt=on 
rd.lvm.lv=rhel_hp-dl385g10-03/root rd.lvm.lv=rhel_hp-dl385g10-03/swap 
console=ttyS0,115200n81 LANG=en_US.UTF-8 earlyprintk=serial debug nokaslr"
 # kexec -e (or reboot)

The system will hang:
[ 1248.932239] kexec_core: Starting new kernel
early console in extract_kernel
input_data: 0x00087e91c3b4
input_len: 0x0067fcbd
output: 0x00087d40
output_len: 0x01b6fa90
kernel_total_size: 0x01a9d000
trampoline_32bit: 0x00099000

Decompressing Linux...
Parsing ELF...[-here the system will hang]

2. about SEV
Upstream kernel(Host OS) doesn't work in host side, some drivers about
SEV always go wrong in host side. We can't boot SEV Guest OS to test
kdump patch. Maybe it is more reasonable to improve SEV in another
patch. When some drivers can work in host side and it can also boot
Virtual Machine(SEV Guest OS), it will be suitable to fix SEV for kdump.

[  369.426131] INFO: task systemd-udevd:865 blocked for more than 120 seconds.
[  369.433177]   Not tainted 4.17.0-rc5+ #60
[  369.437585] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
message.
[  369.445783] systemd-udevd   D0   865813 0x8004
[  369.451323] Call Trace:
[  369.453815]  ? __schedule+0x290/0x870
[  369.457523]  schedule+0x32/0x80
[  369.460714]  __sev_do_cmd_locked+0x1f6/0x2a0 [ccp]
[  369.465556]  ? cleanup_uevent_env+0x10/0x10
[  369.470084]  ? remove_wait_queue+0x60/0x60
[  369.

[PATCH 2/4 V3] Allocate pages for kdump without encryption when SME is enabled

2018-06-16 Thread Lianbo Jiang
When SME is enabled in the first kernel, we allocate unencrypted pages
for kdump so that the second kernel can be booted in the same manner
as with kexec, which also helps to keep the code style consistent
between the two paths.

Signed-off-by: Lianbo Jiang 
---
 kernel/kexec_core.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 20fef1a..3c22a9b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,16 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   if (pages) {
+   unsigned int count, i;
+
+   pages->mapping = NULL;
+   set_page_private(pages, order);
+   count = 1 << order;
+   for (i = 0; i < count; i++)
+   SetPageReserved(pages + i);
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+   }
return pages;
 }
 
@@ -865,6 +875,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -882,6 +893,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 3/4 V3] Remap the device table of IOMMU in encrypted manner for kdump

2018-06-16 Thread Lianbo Jiang
In kdump mode, the kernel copies the IOMMU device table from the old
device table, which is encrypted when SME is enabled in the first
kernel. So we must remap it in an encrypted manner so that it is
automatically decrypted when read.

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. add some comments
2. clean compile warning.

 drivers/iommu/amd_iommu_init.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 904c575..a20af4c 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -889,11 +889,24 @@ static bool copy_device_table(void)
}
 
old_devtb_phys = entry & PAGE_MASK;
+
+   /*
+*  When sme enable in the first kernel, old_devtb_phys includes the
+*  memory encryption mask(sme_me_mask), we must remove the memory
+*  encryption mask to obtain the true physical address in kdump mode.
+*/
+   if (mem_encrypt_active() && is_kdump_kernel())
+   old_devtb_phys = __sme_clr(old_devtb_phys);
+
if (old_devtb_phys >= 0x1ULL) {
pr_err("The address of old device table is above 4G, not 
trustworthy!\n");
return false;
}
-   old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+   old_devtb = (mem_encrypt_active() && is_kdump_kernel())
+   ? (__force void *)ioremap_encrypted(old_devtb_phys,
+   dev_table_size)
+   : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
if (!old_devtb)
return false;
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/4 V3] Add a function(ioremap_encrypted) for kdump when AMD sme enabled

2018-06-16 Thread Lianbo Jiang
It is convenient to remap the old memory encrypted to the second
kernel by calling ioremap_encrypted().

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. remove the sme_active() check in __ioremap_caller().
2. put some logic into the early_memremap_pgprot_adjust() for
early memremap.

 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 28 
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6e5b93..989d60b 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545..e365fc4 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -131,7 +132,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +201,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +383,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
@@ -688,6 +697,9 @@ pgprot_t __init 
early_memremap_pgprot_adjust(resource_size_t phys_addr,
if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
encrypted_prot = false;
 
+   if (sme_active() && is_kdump_kernel())
+   encrypted_prot = false;
+
ret

[PATCH 4/4 V3] Help to dump the old memory encrypted into vmcore file

2018-06-16 Thread Lianbo Jiang
In kdump mode, we need to dump the old memory into vmcore file,
if SME is enabled in the first kernel, we must remap the old
memory in encrypted manner, which will be automatically decrypted
when we read from DRAM. It helps to parse the vmcore for some tools.

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. add a new file and modify Makefile.
2. remove some code in sev_active().

 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 fs/proc/vmcore.c | 20 ++
 include/linux/crash_dump.h   | 11 
 4 files changed, 79 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 02d6f5c..afb5bad 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_KEXEC_CORE)  += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)   += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)   += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump_$(BITS).o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += crash_dump_encrypt.o
 obj-y  += kprobes/
 obj-$(CONFIG_MODULES)  += module.o
 obj-$(CONFIG_DOUBLEFAULT)  += doublefault.o
diff --git a/arch/x86/kernel/crash_dump_encrypt.c 
b/arch/x86/kernel/crash_dump_encrypt.c
new file mode 100644
index 000..e44ef33
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_encrypt.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Lianbo Jiang (liji...@redhat.com)
+ * Copyright (C) RedHat Corporation, 2018. All rights reserved
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * copy_oldmem_page_encrypted - copy one page from "oldmem encrypted"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem encrypted". For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to
+ * kmap_atomic.
+ */
+
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT,
+ PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   set_iounmap_nonlazy();
+   iounmap((void __iomem *)vaddr);
+   return csize;
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a45f0af..5200266 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,6 +25,8 @@
 #include 
 #include 
 #include "internal.h"
+#include 
+#include 
 
 /* List representing chunks of contiguous memory areas and their offsets in
  * vmcore file.
@@ -86,7 +88,8 @@ static int pfn_is_ram(unsigned long pfn)
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
-   u64 *ppos, int userbuf)
+   u64 *ppos, int userbuf,
+   bool encrypted)
 {
unsigned long pfn, offset;
size_t nr_bytes;
@@ -108,8 +111,11 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
if (pfn_is_ram(pfn) == 0)
memset(buf, 0, nr_bytes);
else {
-   tmp = copy_oldmem_page(pfn, buf, nr_bytes,
-   offset, userbuf);
+   tmp = encrypted ? copy_oldmem_page_encrypted(pfn,
+   buf, nr_bytes, offset, userbuf)
+   : copy_oldmem_page(pfn, buf, nr_bytes,
+  offset, userbuf);
+
if (tmp < 0)
return tmp;
}
@@ -143,7 +149,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
  */
 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-   return read_from_oldmem(buf, count, ppos, 0);
+   return read_from_oldmem(buf, count, ppos, 0, false);
 }
 

[PATCH 1/2 V2] Add a function(ioremap_encrypted) for kdump when AMD sme enabled.

2018-06-14 Thread Lianbo Jiang
It is convenient to remap the old memory encrypted to the second kernel
by calling ioremap_encrypted().

Signed-off-by: Lianbo Jiang 
---
Some changes based on V1:
1. remove the sme_active() check in __ioremap_caller().

 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 24 
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6e5b93..989d60b 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545..24e0920 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 V2] Support kdump for AMD secure memory encryption(sme)

2018-06-14 Thread Lianbo Jiang
It is convenient to remap the old memory encrypted to the second kernel by
calling ioremap_encrypted().

When sme enabled on AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we will remap the old memory
encrypted to the second kernel(crash kernel), and sme is also enabled in
the second kernel, otherwise the old memory encrypted can not be decrypted.
Because simply changing the value of a C-bit on a page will not
automatically encrypt the existing contents of a page, and any data in the
page prior to the C-bit modification will become unintelligible. A page of
memory that is marked encrypted will be automatically decrypted when read
from DRAM and will be automatically encrypted when written to DRAM.

For the kdump, it is necessary to distinguish whether the memory is
encrypted. Furthermore, we should also know which part of the memory is
encrypted or decrypted. We will appropriately remap the memory according
to the specific situation in order to tell cpu how to deal with the
data(encrypted or decrypted). For example, when sme enabled, if the old
memory is encrypted, we will remap the old memory in encrypted way, which
will automatically decrypt the old memory encrypted when we read those data
from the remapping address.

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)| 
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

Test tools:
makedumpfile[v1.6.3]: https://github.com/LianboJ/makedumpfile
commit e1de103eca8f (A draft for kdump vmcore about AMD SME)
Author: Lianbo Jiang 
Date:   Mon May 14 17:02:40 2018 +0800
Note: This patch can only dump vmcore in the case of SME enabled.

crash-7.2.1: https://github.com/crash-utility/crash.git
commit 1e1bd9c4c1be (Fix for the "bpf" command display on Linux 4.17-rc1)
Author: Dave Anderson 
Date:   Fri May 11 15:54:32 2018 -0400

Test environment:
HP ProLiant DL385Gen10 AMD EPYC 7251
8-Core Processor
32768 MB memory
600 GB disk space

Linux 4.17-rc7:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
commit b04e217704b7 ("Linux 4.17-rc7")
Author: Linus Torvalds 
Date:   Sun May 27 13:01:47 2018 -0700

Reference:
AMD64 Architecture Programmer's Manual
https://support.amd.com/TechDocs/24593.pdf

Some changes based on V1:
1. remove the sme_active() check in __ioremap_caller().
2. remove the '#ifdef' stuff throughout this patch.
3. put some logic into the early_memremap_pgprot_adjust() and clean the
previous unnecessary changes, for example: arch/x86/include/asm/dmi.h,
arch/x86/kernel/acpi/boot.c, drivers/acpi/tables.c.
4. rewrite two functions, copy_oldmem_page() and
copy_oldmem_page_encrypted().
5. distinguish sme_active() and sev_active(); when a distinction isn't
needed, mem_encrypt_active() will be used.
6. clean compile warning in copy_device_table().

Some known issues:
1. about SME
Upstream kernel doesn't work when we use kexec with the following command. The
system will hang.
(This issue doesn't matter with the kdump patch.)

Reproduce steps:
 # kexec -l /boot/vmlinuz-4.17.0-rc7+ --initrd=/boot/initramfs-4.17.0-rc7+.img 
--command-line="root=/dev/mapper/rhel_hp--dl385g10--03-root ro mem_encrypt=on 
rd.lvm.lv=rhel_hp-dl385g10-03/root rd.lvm.lv=rhel_hp-dl385g10-03/swap 
console=ttyS0,115200n81 LANG=en_US.UTF-8 earlyprintk=serial debug nokaslr"
 # kexec -e (or reboot)

The system will hang:
[ 1248.932239] kexec_core: Starting new kernel
early console in extract_kernel
input_data: 0x00087e91c3b4
input_len: 0x0067fcbd
output: 0x00087d40
output_len: 0x01b6fa90
kernel_total_size: 0x01a9d000
trampoline_32bit: 0x00099000

Decompressing Linux... 
Parsing ELF...[-here the system will hang]

2. about SEV
Upstream kernel doesn't work about SEV on our machine, some drivers always
go wrong. We don't have the suitable machine to test SEV for kdump patch.
Maybe it is reasonable to improve SEV in another patch. When SEV works
fine, we will test the kdump patch for SEV.

[  369.426131] INFO: task systemd-udevd:865 blocked for more than 120 seconds.
[  369.433177]   Not tainted 4.17.0-rc5+ #60
[  369.437585] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
message.
[  369.445783] systemd-udevd   D0   865813 0x8004
[  369.451323] Call Trace:
[  369.453815]  ? __schedule+0x290/0x870
[  369.457523]  schedule+0x32/0x80
[  369.460714]  __sev_do_cmd_locked+0x1f6/0x2a0 [ccp]
[  369.465556]  ? cleanup_uevent_env+0x10/0x10
[  369.470084]  ? remove_wait_queue+0x60/0x60
[  369.474219]  ? 0xc0247000
[  369.477572]  __sev_platform_init_locked+0x2b/0x70 [ccp]
[  36

[PATCH 2/2 V2] Support kdump when AMD secure memory encryption is active

2018-06-14 Thread Lianbo Jiang
When sme enabled on AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we will remap the old memory
encrypted to the second kernel(crash kernel), and sme is also enabled in
the second kernel, otherwise the old memory encrypted can not be decrypted.
Because simply changing the value of a C-bit on a page will not
automatically encrypt the existing contents of a page, and any data in the
page prior to the C-bit modification will become unintelligible. A page of
memory that is marked encrypted will be automatically decrypted when read
from DRAM and will be automatically encrypted when written to DRAM.

For the kdump, it is necessary to distinguish whether the memory is
encrypted. Furthermore, we should also know which part of the memory is
encrypted or decrypted. We will appropriately remap the memory according
to the specific situation in order to tell cpu how to deal with the data(
encrypted or unencrypted). For example, when sme enabled, if the old memory
is encrypted, we will remap the old memory in encrypted way, which will
automatically decrypt the old memory encrypted when we read those data from
the remapping address.

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

Signed-off-by: Lianbo Jiang 
---
Some changes based on V1:
1. remove the '#ifdef' stuff throughout this patch.
2. put some logic into the early_memremap_pgprot_adjust() and clean the
previous unnecessary changes, for example: arch/x86/include/asm/dmi.h,
arch/x86/kernel/acpi/boot.c, drivers/acpi/tables.c.
3. rewrite two functions, copy_oldmem_page() and
copy_oldmem_page_encrypted().
4. distinguish sme_active() and sev_active(); when a distinction isn't
needed, mem_encrypt_active() will be used.
5. clean compile warning in copy_device_table().

 arch/x86/kernel/crash_dump_64.c | 42 +++--
 arch/x86/mm/ioremap.c   |  4 
 drivers/iommu/amd_iommu_init.c  | 14 +-
 fs/proc/vmcore.c| 20 +++-
 include/linux/crash_dump.h  |  5 +
 kernel/kexec_core.c | 12 
 6 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e077..a2c7b13 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -11,6 +11,23 @@
 #include 
 #include 
 
+/*
+ * copy_to - copy bytes out of a temporarily mapped old-memory page
+ * @to: destination buffer; in user space iff @userbuf is set
+ * @vaddr: kernel virtual address returned by ioremap*() for the old page
+ * @offset: offset in bytes into the mapped page to start copying from
+ * @size: number of bytes to copy
+ * @userbuf: if set, copy with copy_to_user(), otherwise with memcpy()
+ *
+ * Unmaps @vaddr on all paths.  Returns @size on success.  A failing
+ * copy_to_user() means the user buffer is invalid, so return -EFAULT
+ * (not -ENOMEM), matching the original copy_oldmem_page() behaviour.
+ */
+static ssize_t copy_to(void *to, void *vaddr, unsigned long offset,
+  size_t size, int userbuf)
+{
+   if (userbuf) {
+   if (copy_to_user(to, vaddr + offset, size)) {
+   iounmap(vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(to, vaddr + offset, size);
+
+   set_iounmap_nonlazy();
+   iounmap(vaddr);
+
+   return size;
+}
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
@@ -36,15 +53,20 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!vaddr)
return -ENOMEM;
 
-   if (userbuf) {
-   if (copy_to_user(buf, vaddr + offset, csize)) {
-   iounmap(vaddr);
-   return -EFAULT;
-   }
-   } else
-   memcpy(buf, vaddr + offset, csize);
+   return copy_to(buf, vaddr, offset, csize, userbuf);
+}
 
-   set_iounmap_nonlazy();
-   iounmap(vaddr);
-   return csize;
+/*
+ * copy_oldmem_page_encrypted - copy one page from encrypted old memory
+ *
+ * Same contract as copy_oldmem_page(), but the old page is remapped via
+ * ioremap_encrypted() (i.e. with the SME C-bit set), so memory that the
+ * first kernel wrote encrypted is transparently decrypted when read.
+ */
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   /* Nothing to copy. */
+   if (!csize)
+   return 0;
+
+   /* Map the old page in encrypted manner so reads are decrypted. */
+   vaddr = ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   /* copy_to() unmaps vaddr on every path before returning. */
+   return copy_to(buf, vaddr, offset, csize, userbuf);
 }
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 24e0920..e365fc4 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -696,6 +697,9 @@ pgprot_t __init 
early_memremap_pgprot_adjust(resource_size_t phys_addr,
if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
encrypted_prot = false;
 
+   if (sme_active() && is_kdump_kernel())
+   encrypted_prot = false;
+
return encrypted_prot ? pgprot_encrypted(prot)
  : pgprot_decrypted(prot);
 }
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iom

[PATCH 2/5 V5] Allocate pages for kdump without encryption when SME is enabled

2018-07-02 Thread Lianbo Jiang
When SME is enabled in the first kernel, we will allocate pages
for kdump without encryption in order to be able to boot the
second kernel in the same manner as kexec, which helps to keep
the same code style.

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. remove some redundant codes for crash control pages.
2. add some comments.

 kernel/kexec_core.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4..e7efcd1 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,16 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   if (pages) {
+   /*
+* For kdump, we need to ensure that these pages are
+* unencrypted pages if SME is enabled.
+* By the way, it is unnecessary to call the arch_
+* kexec_pre_free_pages(), which will make the code
+* become more simple.
+*/
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+   }
return pages;
 }
 
@@ -867,6 +877,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -884,6 +895,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/5 V5] Support kdump for AMD secure memory encryption(SME)

2018-07-02 Thread Lianbo Jiang
When sme enabled on AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we will remap the old memory
encrypted to the second kernel(crash kernel), and sme is also enabled in
the second kernel, otherwise the old memory encrypted can not be decrypted.
Because simply changing the value of a C-bit on a page will not
automatically encrypt the existing contents of a page, and any data in the
page prior to the C-bit modification will become unintelligible. A page of
memory that is marked encrypted will be automatically decrypted when read
from DRAM and will be automatically encrypted when written to DRAM.

For the kdump, it is necessary to distinguish whether the memory is
encrypted. Furthermore, we should also know which part of the memory is
encrypted or decrypted. We will appropriately remap the memory according
to the specific situation in order to tell cpu how to deal with the
data(encrypted or decrypted). For example, when sme enabled, if the old
memory is encrypted, we will remap the old memory in encrypted way, which
will automatically decrypt the old memory encrypted when we read those data
from the remapping address.

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

This patch series only supports SME kdump; it does not support SEV kdump.

For kdump(SME), there are two cases that doesn't support:
1. SME is enabled in the first kernel, but SME is disabled in the
second kernel
Because the old memory is encrypted, we can't decrypt the old memory
if SME is off in the second kernel.

2. SME is disabled in the first kernel, but SME is enabled in the
second kernel
Maybe it is unnecessary to support this case, because the old memory
is unencrypted, the old memory can be dumped as usual, we don't need
to enable sme in the second kernel, furthermore the requirement is
rare in actual deployment. Moreover, if we must support this scenario,
it will increase the complexity of the code, we will have to consider
how to transfer the sme flag from the first kernel to the second kernel,
in order to let the second kernel know that whether the old memory is
encrypted.
There are two manners to transfer the SME flag to the second kernel, the
first way is to modify the assembly code, which includes some common
code and the path is too long. The second way is to use kexec tool,
which could require the sme flag to be exported in the first kernel
by "proc" or "sysfs", kexec will read the sme flag from "proc" or
"sysfs" when we use kexec tool to load image, subsequently the sme flag
will be saved in boot_params, we can properly remap the old memory
according to the previously saved sme flag. Although we can fix this
issue, maybe it is too expensive to do this. By the way, we won't fix
the problem unless someone thinks it is necessary to do it.

Test tools:
makedumpfile[v1.6.3]: https://github.com/LianboJ/makedumpfile
commit e1de103eca8f (A draft for kdump vmcore about AMD SME)
Author: Lianbo Jiang 
Date:   Mon May 14 17:02:40 2018 +0800
Note: This patch can only dump vmcore in the case of SME enabled.

crash-7.2.1: https://github.com/crash-utility/crash.git
commit 1e1bd9c4c1be (Fix for the "bpf" command display on Linux 4.17-rc1)
Author: Dave Anderson 
Date:   Fri May 11 15:54:32 2018 -0400

Test environment:
HP ProLiant DL385Gen10 AMD EPYC 7251
8-Core Processor
32768 MB memory
600 GB disk space

Linux 4.18-rc3:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
commit 021c91791a5e7e85c567452f1be3e4c2c6cb6063
Author: Linus Torvalds 
Date:   Sun Jul 1 16:04:53 2018 -0700

Reference:
AMD64 Architecture Programmer's Manual
https://support.amd.com/TechDocs/24593.pdf

Some changes:
1. remove the sme_active() check in __ioremap_caller().
2. remove the '#ifdef' stuff throughout this patch.
3. put some logic into the early_memremap_pgprot_adjust() and clean the
previous unnecessary changes, for example: arch/x86/include/asm/dmi.h,
arch/x86/kernel/acpi/boot.c, drivers/acpi/tables.c.
4. add a new file and modify Makefile.
5. clean compile warning in copy_device_table() and some compile error.
6. split the original patch into five patches, it will be better for
review.
7. add some comments.

Some known issues:
1. about SME
Upstream kernel doesn't work when we use kexec with the following command. The
system will hang on 'HP ProLiant DL385Gen10 AMD EPYC 7251', but the issue
can't be reproduced on Speedway.
(This issue doesn't matter with the kdump patch.)
(This issue doesn't matter with the kdump patch.)

Reproduce steps:
 # kexec -l /boot/vmlinuz-4.18.0-rc3+ --initrd=/boot/initramfs-4.18.0-rc3+.img 
--command-line="roo

[PATCH 1/5 V5] Add a function(ioremap_encrypted) for kdump when AMD sme enabled

2018-07-02 Thread Lianbo Jiang
It is convenient to remap the old memory encrypted to the second
kernel by calling ioremap_encrypted().

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. remove the sme_active() check in __ioremap_caller().
2. revert some logic in the early_memremap_pgprot_adjust() for
early memremap and make it separate a new patch.

 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de6484..f8795f9 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545..e01e6c6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -131,7 +132,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +201,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +383,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 4/5 V5] Adjust some permanent mappings in unencrypted ways for kdump when SME is enabled.

2018-07-02 Thread Lianbo Jiang
For kdump, the acpi table and dmi table will need to be remapped in
unencrypted ways during early init, they have just a simple wrapper
around early_memremap(), but the early_memremap() remaps the memory
in encrypted ways by default when SME is enabled, so we put some logic
into the early_memremap_pgprot_adjust(), which will have an opportunity
to adjust it.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/mm/ioremap.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index e01e6c6..3c1c8c4 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -689,8 +689,17 @@ pgprot_t __init 
early_memremap_pgprot_adjust(resource_size_t phys_addr,
encrypted_prot = true;
 
if (sme_active()) {
+   /*
+* In kdump mode, the acpi table and dmi table will need to
+* be remapped in unencrypted ways during early init when
+* SME is enabled. They have just a simple wrapper around
+* early_memremap(), but the early_memremap() remaps the
+* memory in encrypted ways by default when SME is enabled,
+* so we must adjust it.
+*/
if (early_memremap_is_setup_data(phys_addr, size) ||
-   memremap_is_efi_data(phys_addr, size))
+   memremap_is_efi_data(phys_addr, size) ||
+   is_kdump_kernel())
encrypted_prot = false;
}
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 5/5 V5] Help to dump the old memory encrypted into vmcore file

2018-07-02 Thread Lianbo Jiang
In kdump mode, we need to dump the old memory into vmcore file,
if SME is enabled in the first kernel, we must remap the old
memory in encrypted manner, which will be automatically decrypted
when we read from DRAM. It helps to parse the vmcore for some tools.

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. add a new file and modify Makefile.
2. revert some code about previously using sev_active().

 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 fs/proc/vmcore.c | 21 ++
 include/linux/crash_dump.h   | 12 
 4 files changed, 81 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 02d6f5c..afb5bad 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_KEXEC_CORE)  += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)   += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)   += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump_$(BITS).o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += crash_dump_encrypt.o
 obj-y  += kprobes/
 obj-$(CONFIG_MODULES)  += module.o
 obj-$(CONFIG_DOUBLEFAULT)  += doublefault.o
diff --git a/arch/x86/kernel/crash_dump_encrypt.c 
b/arch/x86/kernel/crash_dump_encrypt.c
new file mode 100644
index 000..e1b1a57
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_encrypt.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Lianbo Jiang (liji...@redhat.com)
+ * Copyright (C) RedHat Corporation, 2018. All rights reserved
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * copy_oldmem_page_encrypted - copy one page from "oldmem encrypted"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem encrypted". For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to
+ * kmap_atomic.
+ */
+
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT,
+ PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   set_iounmap_nonlazy();
+   iounmap((void __iomem *)vaddr);
+   return csize;
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cfb6674..07c1934 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,6 +25,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include "internal.h"
 
 /* List representing chunks of contiguous memory areas and their offsets in
@@ -98,7 +101,8 @@ static int pfn_is_ram(unsigned long pfn)
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
-   u64 *ppos, int userbuf)
+   u64 *ppos, int userbuf,
+   bool encrypted)
 {
unsigned long pfn, offset;
size_t nr_bytes;
@@ -120,8 +124,11 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
if (pfn_is_ram(pfn) == 0)
memset(buf, 0, nr_bytes);
else {
-   tmp = copy_oldmem_page(pfn, buf, nr_bytes,
-   offset, userbuf);
+   tmp = encrypted ? copy_oldmem_page_encrypted(pfn,
+   buf, nr_bytes, offset, userbuf)
+   : copy_oldmem_page(pfn, buf, nr_bytes,
+  offset, userbuf);
+
if (tmp < 0)
return tmp;
}
@@ -155,7 +162,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
  */
 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-   return read_from_oldmem(buf, count, ppos, 0);
+   return read_from_oldmem(buf, count, ppo

[PATCH 3/5 V5] Remap the device table of IOMMU in encrypted manner for kdump

2018-07-02 Thread Lianbo Jiang
In kdump mode, it will copy the device table of IOMMU from the old
device table, which is encrypted when SME is enabled in the first
kernel. So we must remap it in encrypted manner in order to be
automatically decrypted when we read.

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. add some comments
2. clean compile warning.
3. remove unnecessary code when we clear sme mask bit.

 drivers/iommu/amd_iommu_init.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 904c575..4cebb00 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -888,12 +888,22 @@ static bool copy_device_table(void)
}
}
 
-   old_devtb_phys = entry & PAGE_MASK;
+   /*
+* When SME is enabled in the first kernel, the entry includes the
+* memory encryption mask(sme_me_mask), we must remove the memory
+* encryption mask to obtain the true physical address in kdump mode.
+*/
+   old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
if (old_devtb_phys >= 0x1ULL) {
pr_err("The address of old device table is above 4G, not 
trustworthy!\n");
return false;
}
-   old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+   old_devtb = (sme_active() && is_kdump_kernel())
+   ? (__force void *)ioremap_encrypted(old_devtb_phys,
+   dev_table_size)
+   : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
if (!old_devtb)
return false;
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] kdump: fix an error that can not parse the e820 reserved region

2018-09-05 Thread Lianbo Jiang
When kexec-tools load the kernel and initramfs for kdump, kexec-tools will
read /proc/iomem and recreate the e820 ranges for kdump kernel. But it fails
to parse the e820 reserved region, because the memcmp() is case sensitive
when comparing the string. In fact, it may be "Reserved" or "reserved" in
the /proc/iomem, so we use the strncasecmp() instead of the memcmp() to fix
it.

Signed-off-by: Lianbo Jiang 
---
 kexec/arch/i386/crashdump-x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
index 437e8a8..6669c1a 100644
--- a/kexec/arch/i386/crashdump-x86.c
+++ b/kexec/arch/i386/crashdump-x86.c
@@ -287,7 +287,7 @@ static int get_crash_memory_ranges(struct memory_range 
**range, int *ranges,
type = RANGE_PRAM;
} else if(memcmp(str,"Persistent Memory\n",18) == 0 ) {
type = RANGE_PMEM;
-   } else if(memcmp(str,"reserved\n",9) == 0 ) {
+   } else if(strncasecmp(str,"reserved\n",9) == 0 ) {
type = RANGE_RESERVED;
} else if (memcmp(str, "GART\n", 5) == 0) {
gart_start = start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2] kdump: fix an error that can not parse the e820 reserved region

2018-09-05 Thread Lianbo Jiang
When kexec-tools load the kernel and initramfs for kdump, kexec-tools will
read /proc/iomem and recreate the e820 ranges for kdump kernel. But it fails
to parse the e820 reserved region, because the memcmp() is case sensitive
when comparing the string. In fact, it may be "Reserved" or "reserved" in
the /proc/iomem, so we have to fix these cases.

Signed-off-by: Lianbo Jiang 
---
 kexec/arch/i386/crashdump-x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
index 437e8a8..140f45b 100644
--- a/kexec/arch/i386/crashdump-x86.c
+++ b/kexec/arch/i386/crashdump-x86.c
@@ -289,6 +289,8 @@ static int get_crash_memory_ranges(struct memory_range 
**range, int *ranges,
type = RANGE_PMEM;
} else if(memcmp(str,"reserved\n",9) == 0 ) {
type = RANGE_RESERVED;
+   } else if (memcmp(str, "Reserved\n", 9) == 0) {
+   type = RANGE_RESERVED;
} else if (memcmp(str, "GART\n", 5) == 0) {
gart_start = start;
gart_end = end;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/4 v7] Support kdump for AMD secure memory encryption(SME)

2018-09-07 Thread Lianbo Jiang
es.

The kernel log:
[ 1248.932239] kexec_core: Starting new kernel
early console in extract_kernel
input_data: 0x00087e91c3b4
input_len: 0x0067fcbd
output: 0x00087d40
output_len: 0x01b6fa90
kernel_total_size: 0x01a9d000
trampoline_32bit: 0x00099000

Decompressing Linux...
Parsing ELF...[---Here the system will hang]


Lianbo Jiang (4):
  x86/ioremap: add a function ioremap_encrypted() to remap kdump old
memory
  kexec: allocate unencrypted control pages for kdump in case SME is
enabled
  amd_iommu: remap the device table of IOMMU with the memory encryption
mask for kdump
  kdump/vmcore: support encrypted old memory with SME enabled

 arch/x86/include/asm/io.h|  3 ++
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 arch/x86/mm/ioremap.c| 25 -
 drivers/iommu/amd_iommu_init.c   | 14 ++--
 fs/proc/vmcore.c | 21 +++
 include/linux/crash_dump.h   | 12 +++
 kernel/kexec_core.c  | 12 +++
 8 files changed, 125 insertions(+), 16 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 3/4 v7] amd_iommu: remap the device table of IOMMU with the memory encryption mask for kdump

2018-09-07 Thread Lianbo Jiang
In kdump kernel, it will copy the device table of IOMMU from the old device
table, which is encrypted when SME is enabled in the first kernel. So we
have to remap the old device table with the memory encryption mask.

Signed-off-by: Lianbo Jiang 
---
 drivers/iommu/amd_iommu_init.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 84b3e4445d46..3931c7de7c69 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -902,12 +902,22 @@ static bool copy_device_table(void)
}
}
 
-   old_devtb_phys = entry & PAGE_MASK;
+   /*
+* When SME is enabled in the first kernel, the entry includes the
+* memory encryption mask(sme_me_mask), we must remove the memory
+* encryption mask to obtain the true physical address in kdump kernel.
+*/
+   old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
if (old_devtb_phys >= 0x1ULL) {
pr_err("The address of old device table is above 4G, not 
trustworthy!\n");
return false;
}
-   old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+   old_devtb = (sme_active() && is_kdump_kernel())
+   ? (__force void *)ioremap_encrypted(old_devtb_phys,
+   dev_table_size)
+   : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
if (!old_devtb)
return false;
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/4 v7] kexec: allocate unencrypted control pages for kdump in case SME is enabled

2018-09-07 Thread Lianbo Jiang
When SME is enabled in the first kernel, we will allocate unencrypted pages
for kdump in order to be able to boot the kdump kernel like kexec.

Signed-off-by: Lianbo Jiang 
---
 kernel/kexec_core.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4da38a..e7efcd1a977b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,16 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   if (pages) {
+   /*
+* For kdump, we need to ensure that these pages are
+* unencrypted pages if SME is enabled.
+* By the way, it is unnecessary to call the arch_
+* kexec_pre_free_pages(), which will make the code
+* become more simple.
+*/
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+   }
return pages;
 }
 
@@ -867,6 +877,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -884,6 +895,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 4/4 v7] kdump/vmcore: support encrypted old memory with SME enabled

2018-09-07 Thread Lianbo Jiang
In the kdump kernel, we need to dump the old memory into the vmcore file.
If SME was enabled in the first kernel, we have to remap the old memory
with the memory encryption mask, so that it is automatically decrypted
when we read it from DRAM.

For SME kdump, there are two cases that are not supported:

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

1. SME is enabled in the first kernel, but SME is disabled in kdump kernel
In this case, because the old memory is encrypted, we can't decrypt the
old memory.

2. SME is disabled in the first kernel, but SME is enabled in kdump kernel
On the one hand, the old memory is unencrypted, the old memory can be dumped
as usual, we don't need to enable SME in kdump kernel; On the other hand, it
will increase the complexity of the code, we will have to consider how to
pass the SME flag from the first kernel to the kdump kernel, it is really
too expensive to do this.

These patches are only for SME kdump; they do not support SEV kdump.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 fs/proc/vmcore.c | 21 +++
 include/linux/crash_dump.h   | 12 +++
 4 files changed, 81 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8824d01c0c35..dfbeae0e35ce 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -97,6 +97,7 @@ obj-$(CONFIG_KEXEC_CORE)  += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)   += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)   += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump_$(BITS).o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += crash_dump_encrypt.o
 obj-y  += kprobes/
 obj-$(CONFIG_MODULES)  += module.o
 obj-$(CONFIG_DOUBLEFAULT)  += doublefault.o
diff --git a/arch/x86/kernel/crash_dump_encrypt.c 
b/arch/x86/kernel/crash_dump_encrypt.c
new file mode 100644
index ..e1b1a577f197
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_encrypt.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Lianbo Jiang (liji...@redhat.com)
+ * Copyright (C) RedHat Corporation, 2018. All rights reserved
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * copy_oldmem_page_encrypted - copy one page from "oldmem encrypted"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem encrypted". For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to
+ * kmap_atomic.
+ */
+
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT,
+ PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   set_iounmap_nonlazy();
+   iounmap((void __iomem *)vaddr);
+   return csize;
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cbde728f8ac6..3065c8bada6a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,6 +25,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include "internal.h"
 
 /* List representing chunks of contiguous memory areas and their offsets in
@@ -98,7 +101,8 @@ static int pfn_is_ram(unsigned long pfn)
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
-   u64 *ppos, int userbuf)
+   u64 *ppos, int userbuf,
+   bool encrypted)
 {
unsigned long 

[PATCH 1/4 v7] x86/ioremap: add a function ioremap_encrypted() to remap kdump old memory

2018-09-07 Thread Lianbo Jiang
When SME is enabled on AMD machine, the memory is encrypted in the first
kernel. In this case, SME also needs to be enabled in kdump kernel, and
we have to remap the old memory with the memory encryption mask.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..f8795f9581c7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..e01e6c695add 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -131,7 +132,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +201,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +383,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/5 V6] Support kdump for AMD secure memory encryption(SME)

2018-08-31 Thread Lianbo Jiang
rc5+ #60
[  369.437585] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
message.
[  369.445783] systemd-udevd   D0   865813 0x8004
[  369.451323] Call Trace:
[  369.453815]  ? __schedule+0x290/0x870
[  369.457523]  schedule+0x32/0x80
[  369.460714]  __sev_do_cmd_locked+0x1f6/0x2a0 [ccp]
[  369.465556]  ? cleanup_uevent_env+0x10/0x10
[  369.470084]  ? remove_wait_queue+0x60/0x60
[  369.474219]  ? 0xc0247000
[  369.477572]  __sev_platform_init_locked+0x2b/0x70 [ccp]
[  369.482843]  sev_platform_init+0x1d/0x30 [ccp]
[  369.487333]  psp_pci_init+0x40/0xe0 [ccp]
[  369.491380]  ? 0xc0247000
[  369.494936]  sp_mod_init+0x18/0x1000 [ccp]
[  369.499071]  do_one_initcall+0x4e/0x1d4
[  369.502944]  ? _cond_resched+0x15/0x30
[  369.506728]  ? kmem_cache_alloc_trace+0xae/0x1d0
[  369.511386]  ? do_init_module+0x22/0x220
[  369.515345]  do_init_module+0x5a/0x220
[  369.519444]  load_module+0x21cb/0x2a50
[  369.523227]  ? m_show+0x1c0/0x1c0
[  369.526571]  ? security_capable+0x3f/0x60
[  369.530611]  __do_sys_finit_module+0x94/0xe0
[  369.534915]  do_syscall_64+0x5b/0x180
[  369.538607]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  369.543698] RIP: 0033:0x7f708e6311b9
[  369.547536] RSP: 002b:79d32aa8 EFLAGS: 0246 ORIG_RAX: 
0139
[  369.555162] RAX: ffda RBX: 55602a04c2d0 RCX: 7f708e6311b9
[  369.562346] RDX:  RSI: 7f708ef52039 RDI: 0008
[  369.569801] RBP: 7f708ef52039 R08:  R09: 55602a048b20
[  369.576988] R10: 0008 R11: 0246 R12: 
[  369.584177] R13: 55602a075260 R14: 0002 R15: 

Lianbo Jiang (5):
  x86/ioremap: add a function ioremap_encrypted() to remap kdump old
memory
  x86/ioremap: strengthen the logic in early_memremap_pgprot_adjust() to
adjust encryption mask
  kexec: allocate unencrypted control pages for kdump in case SME is
enabled
  iommu/amd_iommu: remap the device table of IOMMU with the memory
encryption mask for kdump
  kdump/vmcore: support encrypted old memory with SME enabled

 arch/x86/include/asm/io.h|  3 ++
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 arch/x86/mm/ioremap.c| 34 +-
 drivers/iommu/amd_iommu_init.c   | 14 ++--
 fs/proc/vmcore.c | 21 +++
 include/linux/crash_dump.h   | 12 +++
 kernel/kexec_core.c  | 12 +++
 8 files changed, 133 insertions(+), 17 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 3/5 V6] kexec: allocate unencrypted control pages for kdump in case SME is enabled

2018-08-31 Thread Lianbo Jiang
When SME is enabled in the first kernel, we will allocate unencrypted pages
for kdump in order to be able to boot the kdump kernel like kexec.

Signed-off-by: Lianbo Jiang 
---
 kernel/kexec_core.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4da38a..e7efcd1a977b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,16 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   if (pages) {
+   /*
+* For kdump, we need to ensure that these pages are
+* unencrypted pages if SME is enabled.
+* By the way, it is unnecessary to call the arch_
+* kexec_pre_free_pages(), which will make the code
+* become more simple.
+*/
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+   }
return pages;
 }
 
@@ -867,6 +877,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -884,6 +895,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 4/5 V6] iommu/amd_iommu: remap the device table of IOMMU with the memory encryption mask for kdump

2018-08-31 Thread Lianbo Jiang
The kdump kernel copies the IOMMU device table from the old device
table, which is encrypted when SME is enabled in the first kernel. So we
have to remap the old device table with the memory encryption mask.

Signed-off-by: Lianbo Jiang 
---
 drivers/iommu/amd_iommu_init.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 84b3e4445d46..3931c7de7c69 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -902,12 +902,22 @@ static bool copy_device_table(void)
}
}
 
-   old_devtb_phys = entry & PAGE_MASK;
+   /*
+* When SME is enabled in the first kernel, the entry includes the
+* memory encryption mask(sme_me_mask), we must remove the memory
+* encryption mask to obtain the true physical address in kdump kernel.
+*/
+   old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
if (old_devtb_phys >= 0x1ULL) {
pr_err("The address of old device table is above 4G, not 
trustworthy!\n");
return false;
}
-   old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+   old_devtb = (sme_active() && is_kdump_kernel())
+   ? (__force void *)ioremap_encrypted(old_devtb_phys,
+   dev_table_size)
+   : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
if (!old_devtb)
return false;
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 5/5 V6] kdump/vmcore: support encrypted old memory with SME enabled

2018-08-31 Thread Lianbo Jiang
In the kdump kernel, we need to dump the old memory into the vmcore file.
If SME was enabled in the first kernel, we have to remap the old memory
with the memory encryption mask, so that it is automatically decrypted
when we read it from DRAM.

For SME kdump, there are two cases that are not supported:

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

1. SME is enabled in the first kernel, but SME is disabled in kdump kernel
In this case, because the old memory is encrypted, we can't decrypt the
old memory.

2. SME is disabled in the first kernel, but SME is enabled in kdump kernel
On the one hand, the old memory is unencrypted, the old memory can be dumped
as usual, we don't need to enable SME in kdump kernel; On the other hand, it
will increase the complexity of the code, we will have to consider how to
pass the SME flag from the first kernel to the kdump kernel, it is really
too expensive to do this.

These patches are only for SME kdump; they do not support SEV kdump.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 fs/proc/vmcore.c | 21 +++
 include/linux/crash_dump.h   | 12 +++
 4 files changed, 81 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8824d01c0c35..dfbeae0e35ce 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -97,6 +97,7 @@ obj-$(CONFIG_KEXEC_CORE)  += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)   += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)   += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump_$(BITS).o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += crash_dump_encrypt.o
 obj-y  += kprobes/
 obj-$(CONFIG_MODULES)  += module.o
 obj-$(CONFIG_DOUBLEFAULT)  += doublefault.o
diff --git a/arch/x86/kernel/crash_dump_encrypt.c 
b/arch/x86/kernel/crash_dump_encrypt.c
new file mode 100644
index ..e1b1a577f197
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_encrypt.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Lianbo Jiang (liji...@redhat.com)
+ * Copyright (C) RedHat Corporation, 2018. All rights reserved
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * copy_oldmem_page_encrypted - copy one page from "oldmem encrypted"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem encrypted". For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to
+ * kmap_atomic.
+ */
+
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT,
+ PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   set_iounmap_nonlazy();
+   iounmap((void __iomem *)vaddr);
+   return csize;
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cbde728f8ac6..3065c8bada6a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,6 +25,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include "internal.h"
 
 /* List representing chunks of contiguous memory areas and their offsets in
@@ -98,7 +101,8 @@ static int pfn_is_ram(unsigned long pfn)
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
-   u64 *ppos, int userbuf)
+   u64 *ppos, int userbuf,
+   bool encrypted)
 {
unsigned long 

[PATCH 1/5 V6] x86/ioremap: add a function ioremap_encrypted() to remap kdump old memory

2018-08-31 Thread Lianbo Jiang
When SME is enabled on AMD machine, the memory is encrypted in the first
kernel. In this case, SME also needs to be enabled in kdump kernel, and
we have to remap the old memory with the memory encryption mask.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..f8795f9581c7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..e01e6c695add 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -131,7 +132,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +201,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +383,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/5 V6] x86/ioremap: strengthen the logic in early_memremap_pgprot_adjust() to adjust encryption mask

2018-08-31 Thread Lianbo Jiang
For kdump kernel, when SME is enabled, the acpi table and dmi table will need
to be remapped without the memory encryption mask. So we have to strengthen
the logic in early_memremap_pgprot_adjust(), which makes us have an opportunity
to adjust the memory encryption mask.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/mm/ioremap.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index e01e6c695add..f9d9a39955f3 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -689,8 +689,15 @@ pgprot_t __init 
early_memremap_pgprot_adjust(resource_size_t phys_addr,
encrypted_prot = true;
 
if (sme_active()) {
+/*
+ * In kdump kernel, the acpi table and dmi table will need
+ * to be remapped without the memory encryption mask. Here
+ * we have to strengthen the logic to adjust the memory
+ * encryption mask.
+ */
if (early_memremap_is_setup_data(phys_addr, size) ||
-   memremap_is_efi_data(phys_addr, size))
+   memremap_is_efi_data(phys_addr, size) ||
+   is_kdump_kernel())
encrypted_prot = false;
}
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] Fix array index out of bound exception

2018-03-06 Thread Lianbo Jiang
An integer overflow may wrap a large positive value around into a large
negative number. In that case the formatted string's length can exceed
the array size (for example, "eta: -2147483648s"), while the array is
defined as only 16 characters. So it is necessary to handle these
overflow cases.

Signed-off-by: Lianbo Jiang <liji...@redhat.com>
---
 print_info.c | 21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/print_info.c b/print_info.c
index e0e6a27..09e215a 100644
--- a/print_info.c
+++ b/print_info.c
@@ -16,6 +16,8 @@
 #include "print_info.h"
 #include 
 #include 
+#include 
+#include 
 
 #define PROGRESS_MAXLEN"50"
 
@@ -352,18 +354,21 @@ static void calc_delta(struct timeval *tv_start, struct 
timeval *delta)
 }
 
 /* produce less than 12 bytes on msg */
-static int eta_to_human_short (int secs, char* msg)
+static int eta_to_human_short (int64_t secs, char* msg, int maxsize)
 {
strcpy(msg, "eta: ");
msg += strlen("eta: ");
if (secs < 100)
-   sprintf(msg, "%ds", secs);
+   snprintf(msg, maxsize, "%"PRId64"s", secs);
else if (secs < 100 * 60)
-   sprintf(msg, "%dm%ds", secs / 60, secs % 60);
+   snprintf(msg, maxsize, "%"PRId64"m""%"PRId64"s",
+   secs / 60, secs % 60);
else if (secs < 48 * 3600)
-   sprintf(msg, "%dh%dm", secs / 3600, (secs / 60) % 60);
+   snprintf(msg, maxsize, "%"PRId64"h""%"PRId64"m",
+   secs / 3600, (secs / 60) % 60);
else if (secs < 100 * 86400)
-   sprintf(msg, "%dd%dh", secs / 86400, (secs / 3600) % 24);
+   snprintf(msg, maxsize, "%"PRId64"d""%"PRId64"h",
+   secs / 86400, (secs / 3600) % 24);
else
sprintf(msg, ">2day");
return 0;
@@ -379,8 +384,8 @@ print_progress(const char *msg, unsigned long current, 
unsigned long end, struct
static unsigned int lapse = 0;
static const char *spinner = "/|\\-";
struct timeval delta;
-   double eta;
-   char eta_msg[16] = " ";
+   int64_t eta;
+   char eta_msg[32] = " ";
 
if (current < end) {
tm = time(NULL);
@@ -395,7 +400,7 @@ print_progress(const char *msg, unsigned long current, 
unsigned long end, struct
calc_delta(start, );
eta = delta.tv_sec + delta.tv_usec / 1e6;
eta = (100 - progress) * eta / progress;
-   eta_to_human_short(eta, eta_msg);
+   eta_to_human_short(eta, eta_msg, sizeof(eta_msg));
}
if (flag_ignore_r_char) {
PROGRESS_MSG("%-" PROGRESS_MAXLEN "s: [%5.1f %%] %c  %16s\n",
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/5 V4] Add a function(ioremap_encrypted) for kdump when AMD sme enabled

2018-06-28 Thread Lianbo Jiang
It is convenient to remap the old memory encrypted to the second
kernel by calling ioremap_encrypted().

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. remove the sme_active() check in __ioremap_caller().
2. revert some logic in the early_memremap_pgprot_adjust() for
early memremap and make it separate a new patch.

 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de6484..f8795f9 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545..e01e6c6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -131,7 +132,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +201,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+   return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+   __builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +383,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, 
unsigned long size,
 {
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/5 V4] Support kdump for AMD secure memory encryption(SME)

2018-06-28 Thread Lianbo Jiang
When sme enabled on AMD server, we also need to support kdump. Because
the memory is encrypted in the first kernel, we will remap the old memory
encrypted to the second kernel(crash kernel), and sme is also enabled in
the second kernel, otherwise the old memory encrypted can not be decrypted.
Because simply changing the value of a C-bit on a page will not
automatically encrypt the existing contents of a page, and any data in the
page prior to the C-bit modification will become unintelligible. A page of
memory that is marked encrypted will be automatically decrypted when read
from DRAM and will be automatically encrypted when written to DRAM.

For the kdump, it is necessary to distinguish whether the memory is
encrypted. Furthermore, we should also know which part of the memory is
encrypted or decrypted. We will appropriately remap the memory according
to the specific situation in order to tell cpu how to deal with the
data(encrypted or decrypted). For example, when sme enabled, if the old
memory is encrypted, we will remap the old memory in encrypted way, which
will automatically decrypt the old memory encrypted when we read those data
from the remapping address.

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

This patch is only for SME kdump; it does not support SEV kdump.

For kdump (SME), there are two cases that are not supported:
1. SME is enabled in the first kernel, but SME is disabled in the
second kernel
Because the old memory is encrypted, we can't decrypt the old memory
if SME is off in the second kernel.

2. SME is disabled in the first kernel, but SME is enabled in the
second kernel
Maybe it is unnecessary to support this case, because the old memory
is unencrypted, the old memory can be dumped as usual, we don't need
to enable sme in the second kernel, furthermore the requirement is
rare in actual deployment. Additionally, if we must support this scenario,
it will increase the complexity of the code, we will have to consider
how to transfer the sme flag from the first kernel to the second kernel,
in order to let the second kernel know that whether the old memory is
encrypted.
There are two manners to transfer the SME flag to the second kernel, the
first way is to modify the assembly code, which includes some common
code and the path is too long. The second way is to use kexec tool,
which could require the sme flag to be exported in the first kernel
by "proc" or "sysfs", kexec will read the sme flag from "proc" or
"sysfs" when we use kexec tool to load image, subsequently the sme flag
will be saved in boot_params, we can properly remap the old memory
according to the previously saved sme flag. Although we can fix this
issue, maybe it is too expensive to do this. By the way, we won't fix
the problem unless someone thinks it is necessary to do it.

Test tools:
makedumpfile[v1.6.3]: https://github.com/LianboJ/makedumpfile
commit e1de103eca8f (A draft for kdump vmcore about AMD SME)
Author: Lianbo Jiang 
Date:   Mon May 14 17:02:40 2018 +0800
Note: This patch can only dump vmcore in the case of SME enabled.

crash-7.2.1: https://github.com/crash-utility/crash.git
commit 1e1bd9c4c1be (Fix for the "bpf" command display on Linux 4.17-rc1)
Author: Dave Anderson 
Date:   Fri May 11 15:54:32 2018 -0400

Test environment:
HP ProLiant DL385Gen10 AMD EPYC 7251
8-Core Processor
32768 MB memory
600 GB disk space

Linux 4.18-rc2:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
commit 7daf201d7fe8 ("Linux 4.18-rc2")
Author: Linus Torvalds 
Date:   Sun Jun 24 20:54:29 2018 +0800

Reference:
AMD64 Architecture Programmer's Manual
https://support.amd.com/TechDocs/24593.pdf

Some changes:
1. remove the sme_active() check in __ioremap_caller().
2. remove the '#ifdef' stuff throughout this patch.
3. put some logic into the early_memremap_pgprot_adjust() and clean the
previous unnecessary changes, for example: arch/x86/include/asm/dmi.h,
arch/x86/kernel/acpi/boot.c, drivers/acpi/tables.c.
4. add a new file and modify Makefile.
5. clean compile warning in copy_device_table() and some compile error.
6. split the original patch into five patches, it will be better for
review.
7. modify elfcorehdr_read().
8. add some comments.

Some known issues:
1. about SME
Upstream kernel doesn't work when we use kexec in the follow command. The
system will hang on 'HP ProLiant DL385Gen10 AMD EPYC 7251'. But it can't
be reproduced on speedway.
(This issue doesn't matter with the kdump patch.)

Reproduce steps:
 # kexec -l /boot/vmlinuz-4.18.0-rc2+ --initrd=/boot/initramfs-4.18.0-rc2+

[PATCH 3/5 V4] Remap the device table of IOMMU in encrypted manner for kdump

2018-06-28 Thread Lianbo Jiang
In kdump mode, it will copy the device table of IOMMU from the old
device table, which is encrypted when SME is enabled in the first
kernel. So we must remap it in encrypted manner in order to be
automatically decrypted when we read.

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. add some comments
2. clean compile warning.
3. remove unnecessary code when we clear sme mask bit.

 drivers/iommu/amd_iommu_init.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 904c575..4cebb00 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -888,12 +888,22 @@ static bool copy_device_table(void)
}
}
 
-   old_devtb_phys = entry & PAGE_MASK;
+   /*
+* When SME is enabled in the first kernel, the entry includes the
+* memory encryption mask(sme_me_mask), we must remove the memory
+* encryption mask to obtain the true physical address in kdump mode.
+*/
+   old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
if (old_devtb_phys >= 0x100000000ULL) {
pr_err("The address of old device table is above 4G, not trustworthy!\n");
return false;
}
-   old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+   old_devtb = (sme_active() && is_kdump_kernel())
+   ? (__force void *)ioremap_encrypted(old_devtb_phys,
+   dev_table_size)
+   : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
if (!old_devtb)
return false;
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 4/5 V4] Adjust some permanent mappings in unencrypted ways for kdump when SME is enabled.

2018-06-28 Thread Lianbo Jiang
For kdump, the acpi table and dmi table will need to be remapped in
unencrypted ways during early init, they have just a simple wrapper
around early_memremap(), but the early_memremap() remaps the memory
in encrypted ways by default when SME is enabled, so we put some logic
into the early_memremap_pgprot_adjust(), which will have an opportunity
to adjust it.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/mm/ioremap.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index e01e6c6..3c1c8c4 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -689,8 +689,17 @@ pgprot_t __init 
early_memremap_pgprot_adjust(resource_size_t phys_addr,
encrypted_prot = true;
 
if (sme_active()) {
+   /*
+* In kdump mode, the acpi table and dmi table will need to
+* be remapped in unencrypted ways during early init when
+* SME is enabled. They have just a simple wrapper around
+* early_memremap(), but the early_memremap() remaps the
+* memory in encrypted ways by default when SME is enabled,
+* so we must adjust it.
+*/
if (early_memremap_is_setup_data(phys_addr, size) ||
-   memremap_is_efi_data(phys_addr, size))
+   memremap_is_efi_data(phys_addr, size) ||
+   is_kdump_kernel())
encrypted_prot = false;
}
 
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 5/5 V4] Help to dump the old memory encrypted into vmcore file

2018-06-28 Thread Lianbo Jiang
In kdump mode, we need to dump the old memory into vmcore file,
if SME is enabled in the first kernel, we must remap the old
memory in encrypted manner, which will be automatically decrypted
when we read from DRAM. It helps to parse the vmcore for some tools.

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. add a new file and modify Makefile.
2. revert some code about previously using sev_active().
3. modify elfcorehdr_read().

 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 fs/proc/vmcore.c | 45 +-
 include/linux/crash_dump.h   | 12 
 4 files changed, 104 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 02d6f5c..afb5bad 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_KEXEC_CORE)  += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)   += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)   += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump_$(BITS).o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += crash_dump_encrypt.o
 obj-y  += kprobes/
 obj-$(CONFIG_MODULES)  += module.o
 obj-$(CONFIG_DOUBLEFAULT)  += doublefault.o
diff --git a/arch/x86/kernel/crash_dump_encrypt.c 
b/arch/x86/kernel/crash_dump_encrypt.c
new file mode 100644
index 000..e1b1a57
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_encrypt.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Lianbo Jiang (liji...@redhat.com)
+ * Copyright (C) RedHat Corporation, 2018. All rights reserved
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * copy_oldmem_page_encrypted - copy one page from "oldmem encrypted"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem encrypted". For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to
+ * kmap_atomic.
+ */
+
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT,
+ PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   set_iounmap_nonlazy();
+   iounmap((void __iomem *)vaddr);
+   return csize;
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cfb6674..5fef489 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -26,6 +26,8 @@
 #include 
 #include 
 #include "internal.h"
+#include 
+#include 
 
 /* List representing chunks of contiguous memory areas and their offsets in
  * vmcore file.
@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
-   u64 *ppos, int userbuf)
+   u64 *ppos, int userbuf,
+   bool encrypted)
 {
unsigned long pfn, offset;
size_t nr_bytes;
@@ -120,8 +123,11 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
if (pfn_is_ram(pfn) == 0)
memset(buf, 0, nr_bytes);
else {
-   tmp = copy_oldmem_page(pfn, buf, nr_bytes,
-   offset, userbuf);
+   tmp = encrypted ? copy_oldmem_page_encrypted(pfn,
+   buf, nr_bytes, offset, userbuf)
+   : copy_oldmem_page(pfn, buf, nr_bytes,
+  offset, userbuf);
+
if (tmp < 0)
return tmp;
}
@@ -151,11 +157,34 @@ void __weak elfcorehdr_free(unsigned long long addr)
 {}
 
 /*
- * Architectures may override this function to read from ELF header
+ * Architectures may override this function to re

[PATCH 2/5 V4] Allocate pages for kdump without encryption when SME is enabled

2018-06-28 Thread Lianbo Jiang
When SME is enabled in the first kernel, we will allocate pages
for kdump without encryption in order to be able to boot the
second kernel in the same manner as kexec, which helps to keep
the same code style.

Signed-off-by: Lianbo Jiang 
---
Some changes:
1. remove some redundant codes for crash control pages.
2. add some comments.

 kernel/kexec_core.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4..e7efcd1 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,16 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   if (pages) {
+   /*
+* For kdump, we need to ensure that these pages are
+* unencrypted pages if SME is enabled.
+* By the way, it is unnecessary to call the arch_
+* kexec_pre_free_pages(), which will make the code
+* become more simple.
+*/
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+   }
return pages;
 }
 
@@ -867,6 +877,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -884,6 +895,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
-- 
2.9.5


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 v4] add reserved e820 ranges to the kdump kernel e820 table

2018-10-22 Thread Lianbo Jiang
E820 reserved ranges is useful in kdump kernel, it has been added in
kexec-tools code.

One reason is PCI mmconf (extended mode) requires reserved region otherwise
it falls back to legacy mode.

Furthermore, for AMD SME kdump support, the dmi table area needs to be mapped
as decrypted. For normal boot, these ranges sit in e820 reserved ranges, thus
the early ioremap code naturally map them as decrypted. If it also has same
e820 reserve setup in kdump kernel then it will just work like normal
kernel.

Kdump uses walk_iomem_res_desc to iterate resources, then adds matched desc
to e820 table for the kdump kernel.

But IORES_DESC_NONE resource type includes several different e820 types, we
need add exact e820 type to the kdump kernel e820 table, thus it also needs
an extra checking in memmap_entry_callback() to match the e820 type and
resource name.

Changes since v1:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Modified the invalid SOB chain issue.

Changes since v3:
1. Dropped [PATCH 1/3 v3] resource: fix an error which walks through iomem
   resources. Please refer to this commit <010a93bf97c7> "resource: Fix
   find_next_iomem_res() iteration issue"

Note:
1. The patches are made based on this branch:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git

2. And you need to apply the follow patch before test kdump file_load,
   otherwise these patches won't work.
   commit <010a93bf97c7> "resource: Fix find_next_iomem_res() iteration
   issue"

Lianbo Jiang (2):
  x86/kexec_file: add e820 entry in case e820 type string matches to io
resource name
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/x86/include/asm/e820/api.h |  2 ++
 arch/x86/kernel/crash.c | 10 +-
 arch/x86/kernel/e820.c  |  2 +-
 kernel/resource.c   |  1 +
 4 files changed, 13 insertions(+), 2 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v4] x86/kexec_file: add e820 entry in case e820 type string matches to io resource name

2018-10-22 Thread Lianbo Jiang
kdump uses walk_iomem_res_desc() to iterate io resources, then adds matched
desc to e820 table for kdump kernel.

But IORES_DESC_NONE resource type includes several different e820 types,
we need add exact e820 type to kdump kernel e820 table, thus it also needs
an extra checking in memmap_entry_callback() to match the e820 type and
resource name.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/e820/api.h | 2 ++
 arch/x86/kernel/crash.c | 6 +-
 arch/x86/kernel/e820.c  | 2 +-
 kernel/resource.c   | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 62be73b23d5c..6d5451b36e80 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -42,6 +42,8 @@ extern void e820__register_nosave_regions(unsigned long 
limit_pfn);
 
 extern int  e820__get_entry_type(u64 start, u64 end);
 
+extern const char *e820_type_to_string(struct e820_entry *entry);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained 
inside
  * the ISA region.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..ae724a6e0a5f 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Used while preparing memory map entries for second kernel */
 struct crash_memmap_data {
@@ -314,11 +315,14 @@ static int memmap_entry_callback(struct resource *res, 
void *arg)
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
+   const char *name;
 
ei.addr = res->start;
ei.size = resource_size(res);
ei.type = cmd->type;
-   add_e820_entry(params, &ei);
+   name = e820_type_to_string(&ei);
+   if (res->name && !strcmp(name, res->name))
+   add_e820_entry(params, &ei);
 
return 0;
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c88c23c658c1..f9761b2f7abb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1012,7 +1012,7 @@ void __init e820__finish_early_params(void)
}
 }
 
-static const char *__init e820_type_to_string(struct e820_entry *entry)
+const char *e820_type_to_string(struct e820_entry *entry)
 {
switch (entry->type) {
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
diff --git a/kernel/resource.c b/kernel/resource.c
index b3a3a1fc499e..6285a6b4de6c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -366,6 +366,7 @@ static int find_next_iomem_res(resource_size_t start, 
resource_size_t end,
res->end = min(end, p->end);
res->flags = p->flags;
res->desc = p->desc;
+   res->name = p->name;
return 0;
 }
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2 v4] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2018-10-22 Thread Lianbo Jiang
E820 reserved ranges is useful in kdump kernel, it has been added in
kexec-tools code.

One reason is PCI mmconf (extended mode) requires reserved region otherwise
it falls back to legacy mode.

When AMD SME kdump support, it needs to map dmi table area as decrypted.
For normal boot, these ranges sit in e820 reserved ranges, thus the early
ioremap code naturally map them as decrypted. If it also has same e820
reserve setup in kdump kernel then it will just work like normal kernel.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ae724a6e0a5f..d3167125800e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -384,6 +384,10 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
memmap_entry_callback);
 
+   cmd.type = E820_TYPE_RESERVED;
+   walk_iomem_res_desc(IORES_DESC_NONE, 0, 0, -1, &cmd,
+  memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] kdump, vmcoreinfo: Export sme_me_mask value to vmcoreinfo

2018-10-26 Thread Lianbo Jiang
For AMD machine with SME feature, makedumpfile tools need to know
whether the crash kernel was encrypted or not. So it is necessary
to write the sme_me_mask to vmcoreinfo.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..dcfdb64d1097 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -357,6 +357,8 @@ void arch_crash_save_vmcoreinfo(void)
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
 
+   VMCOREINFO_NUMBER(sme_me_mask);
+
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v5] x86/kexec_file: add e820 entry in case e820 type string matches to io resource name

2018-11-06 Thread Lianbo Jiang
kdump uses walk_iomem_res_desc() to iterate io resources, then adds matched
desc to e820 table for kdump kernel.

But IORES_DESC_NONE resource type includes several different e820 types,
we need add exact e820 type to kdump kernel e820 table, thus it also needs
an extra checking in memmap_entry_callback() to match the e820 type and
resource name.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/e820/api.h | 2 ++
 arch/x86/kernel/crash.c | 6 +-
 arch/x86/kernel/e820.c  | 2 +-
 kernel/resource.c   | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 62be73b23d5c..6d5451b36e80 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -42,6 +42,8 @@ extern void e820__register_nosave_regions(unsigned long 
limit_pfn);
 
 extern int  e820__get_entry_type(u64 start, u64 end);
 
+extern const char *e820_type_to_string(struct e820_entry *entry);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained 
inside
  * the ISA region.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..ae724a6e0a5f 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Used while preparing memory map entries for second kernel */
 struct crash_memmap_data {
@@ -314,11 +315,14 @@ static int memmap_entry_callback(struct resource *res, 
void *arg)
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
+   const char *name;
 
ei.addr = res->start;
ei.size = resource_size(res);
ei.type = cmd->type;
-   add_e820_entry(params, &ei);
+   name = e820_type_to_string(&ei);
+   if (res->name && !strcmp(name, res->name))
+   add_e820_entry(params, &ei);
 
return 0;
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 50895c2f937d..4c1fe4f8db1e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1011,7 +1011,7 @@ void __init e820__finish_early_params(void)
}
 }
 
-static const char *__init e820_type_to_string(struct e820_entry *entry)
+const char *e820_type_to_string(struct e820_entry *entry)
 {
switch (entry->type) {
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
diff --git a/kernel/resource.c b/kernel/resource.c
index b3a3a1fc499e..6285a6b4de6c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -366,6 +366,7 @@ static int find_next_iomem_res(resource_size_t start, 
resource_size_t end,
res->end = min(end, p->end);
res->flags = p->flags;
res->desc = p->desc;
+   res->name = p->name;
return 0;
 }
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2 v5] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2018-11-06 Thread Lianbo Jiang
E820 reserved ranges is useful in kdump kernel, it has been added in
kexec-tools code.

One reason is PCI mmconf (extended mode) requires reserved region otherwise
it falls back to legacy mode, and also outputs the following kernel log.

Example:
..
[   19.798354] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[   19.800653] [Firmware Info]: PCI: MMCONFIG at [mem 0x8000-0x8fff] 
not reserved in ACPI motherboard resources
[   19.800995] PCI: not using MMCONFIG
..

The correct kernel log is like this:
..
[0.082649] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[0.083610] PCI: MMCONFIG at [mem 0x8000-0x8fff] reserved in E820
..

Furthermore, when AMD SME kdump support, it needs to map dmi table area
as decrypted. For normal boot, these ranges sit in e820 reserved ranges,
thus the early ioremap code naturally map them as decrypted. If it also
has same e820 reserve setup in kdump kernel then it will just work like
normal kernel.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ae724a6e0a5f..d3167125800e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -384,6 +384,10 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
memmap_entry_callback);
 
+   cmd.type = E820_TYPE_RESERVED;
+   walk_iomem_res_desc(IORES_DESC_NONE, 0, 0, -1, &cmd,
+  memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 v5] add reserved e820 ranges to the kdump kernel e820 table

2018-11-06 Thread Lianbo Jiang
E820 reserved ranges is useful in kdump kernel, it has been added in
kexec-tools code.

One reason is PCI mmconf (extended mode) requires reserved region otherwise
it falls back to legacy mode, and also outputs the following kernel log.

Example:
..
[   19.798354] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[   19.800653] [Firmware Info]: PCI: MMCONFIG at [mem 0x8000-0x8fff] 
not reserved in ACPI motherboard resources
[   19.800995] PCI: not using MMCONFIG
..

The correct kernel log is like this:
..
[0.082649] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[0.083610] PCI: MMCONFIG at [mem 0x8000-0x8fff] reserved in E820
..

Furthermore, when AMD SME kdump support, it needs to map dmi table area
as decrypted. For normal boot, these ranges sit in e820 reserved ranges,
thus the early ioremap code naturally map them as decrypted. If it also
has same e820 reserve setup in kdump kernel then it will just work like
normal kernel.

Kdump uses walk_iomem_res_desc to iterate resources, then adds matched
desc to e820 table for the kdump kernel.

But IORES_DESC_NONE resource type includes several different e820 types,
we need add exact e820 type to the kdump kernel e820 table, thus it also
needs an extra checking in memmap_entry_callback() to match the e820 type
and resource name.

Changes since v1:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Modified the invalid SOB chain issue.

Changes since v3:
1. Dropped [PATCH 1/3 v3] resource: fix an error which walks through iomem
   resources. Please refer to this commit <010a93bf97c7> "resource: Fix
   find_next_iomem_res() iteration issue"

Changes since v4:
1. Improve the patch log, and add kernel log.

Lianbo Jiang (2):
  x86/kexec_file: add e820 entry in case e820 type string matches to io
resource name
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/x86/include/asm/e820/api.h |  2 ++
 arch/x86/kernel/crash.c | 10 +-
 arch/x86/kernel/e820.c  |  2 +-
 kernel/resource.c   |  1 +
 4 files changed, 13 insertions(+), 2 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 v6] add reserved e820 ranges to the kdump kernel e820 table

2018-11-13 Thread Lianbo Jiang
At present, when use the kexec_file_load syscall to load the kernel image
and initramfs(for example: kexec -s -p xxx), the upstream kernel does not
pass the e820 reserved ranges to the second kernel, which might produce
two problems:

The first one is the MMCONFIG issue, although which does not make the
system crash or hang, this issue is still a potential risk, and also
might lead to the hot-plug device could not be recognized in kdump kernel.
Because the PCI MMCONFIG(extended mode) requires the reserved region
otherwise it falls back to legacy mode. For example, the kdump kernel
outputs the following log.

Example:
..
[   19.798354] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[   19.800653] [Firmware Info]: PCI: MMCONFIG at [mem 0x8000-0x8fff] 
not reserved in ACPI motherboard resources
[   19.800995] PCI: not using MMCONFIG
..

The correct kernel log is like this:
..
[0.082649] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[0.083610] PCI: MMCONFIG at [mem 0x8000-0x8fff] reserved in E820
..

The second issue is that the e820 reserved ranges do not setup in kdump
kernel, which will cause some functions that related to the e820 reserved
ranges to become invalid. For example:

early_memremap()->
early_memremap_pgprot_adjust()->
memremap_should_map_decrypted()->
e820__get_entry_type()

Please focus on these functions, early_memremap_pgprot_adjust() and
memremap_should_map_decrypted().

In the first kernel, these ranges sit in e820 reserved ranges, so the
memremap_should_map_decrypted() will return true, that is to say, the
reserved memory is decrypted, then the early_memremap_pgprot_adjust()
will call the pgprot_decrypted() to clear the memory encryption mask.

In the second kernel, because the e820 reserved ranges are not passed
to the second kernel, these ranges don't sit in the e820 reserved ranges,
so the memremap_should_map_decrypted() will return false, that is to say,
the reserved memory is encrypted, and then the early_memremap_pgprot_
adjust() will also call the pgprot_encrypted() to set the memory encryption
mask.

In fact, in the second kernel, the e820 reserved memory is still decrypted.
Obviously, it has gone wrong. So, this issue must be fixed, otherwise kdump
won't work in this case.

The e820 reserved range is useful in kdump kernel, so it is necessary to
pass the e820 reserved ranges to kdump kernel.

Changes since v1:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Modified the invalid SOB chain issue.

Changes since v3:
1. Dropped [PATCH 1/3 v3] resource: fix an error which walks through iomem
   resources. Please refer to this commit <010a93bf97c7> "resource: Fix
   find_next_iomem_res() iteration issue"

Changes since v4:
1. Improve the patch log, and add kernel log.

Changes since v5:
1. Rewrite these patches log.

Lianbo Jiang (2):
  x86/kexec_file: add e820 entry in case e820 type string matches to io
resource name
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/x86/include/asm/e820/api.h |  2 ++
 arch/x86/kernel/crash.c | 10 +-
 arch/x86/kernel/e820.c  |  2 +-
 kernel/resource.c   |  1 +
 4 files changed, 13 insertions(+), 2 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2 v6] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2018-11-13 Thread Lianbo Jiang
At present, when use the kexec_file_load syscall to load the kernel image
and initramfs(for example: kexec -s -p xxx), the upstream kernel does not
pass the e820 reserved ranges to the second kernel, which might produce
two problems:

The first one is the MMCONFIG issue, although which does not make the
system crash or hang, this issue is still a potential risk, and also
might lead to the hot-plug device could not be recognized in kdump kernel.
Because the PCI MMCONFIG(extended mode) requires the reserved region
otherwise it falls back to legacy mode. For example, the kdump kernel
outputs the following log.

Example:
..
[   19.798354] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[   19.800653] [Firmware Info]: PCI: MMCONFIG at [mem 0x8000-0x8fff] 
not reserved in ACPI motherboard resources
[   19.800995] PCI: not using MMCONFIG
..

The correct kernel log is like this:
..
[0.082649] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[0.083610] PCI: MMCONFIG at [mem 0x8000-0x8fff] reserved in E820
..

The second issue is that the e820 reserved ranges do not setup in kdump
kernel, which will cause some functions that related to the e820 reserved
ranges to become invalid. For example:

early_memremap()->
early_memremap_pgprot_adjust()->
memremap_should_map_decrypted()->
e820__get_entry_type()

Please focus on these functions, early_memremap_pgprot_adjust() and
memremap_should_map_decrypted().

In the first kernel, these ranges sit in e820 reserved ranges, so the
memremap_should_map_decrypted() will return true, that is to say, the
reserved memory is decrypted, then the early_memremap_pgprot_adjust()
will call the pgprot_decrypted() to clear the memory encryption mask.

In the second kernel, because the e820 reserved ranges are not passed
to the second kernel, these ranges don't sit in the e820 reserved ranges,
so the memremap_should_map_decrypted() will return false, that is to say,
the reserved memory is encrypted, and then the early_memremap_pgprot_
adjust() will also call the pgprot_encrypted() to set the memory encryption
mask.

In fact, in the second kernel, the e820 reserved memory is still decrypted.
Obviously, it has gone wrong. So, this issue must be fixed, otherwise kdump
won't work in this case.

The e820 reserved range is useful in kdump kernel, so it is necessary to
pass the e820 reserved ranges to kdump kernel.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
Changes since v5:
1. Improve the patch log

 arch/x86/kernel/crash.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ae724a6e0a5f..d3167125800e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -384,6 +384,10 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, ,
memmap_entry_callback);
 
+   cmd.type = E820_TYPE_RESERVED;
+   walk_iomem_res_desc(IORES_DESC_NONE, 0, 0, -1, ,
+  memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v6] x86/kexec_file: add e820 entry in case e820 type string matches to io resource name

2018-11-13 Thread Lianbo Jiang
When loading the kernel image and initramfs by the kexec_file_load syscall, it
can not add the exact e820 reserved type to the kdump kernel e820 table.

Kdump uses walk_iomem_res_desc() to iterate io resources, then adds the matched
desc to the e820 table for the kdump kernel. But, when converting the e820 type
into the iores descriptors, several e820 types are converted to 'IORES_DESC_NONE'
in the function e820_type_to_iores_desc(). So the walk_iomem_res_desc() will
get these unnecessary types (E820_TYPE_RAM/E820_TYPE_UNUSABLE/E820_TYPE_KERN)
when iterating io resources by the 'IORES_DESC_NONE' descriptor.

It needs to filter out these redundant types (such as E820_TYPE_RAM/E820_TYPE_
UNUSABLE/E820_TYPE_KERN) in order to add the exact e820 reserved type to the
kdump kernel e820 table. Thus it also needs an extra check in memmap_entry_
callback() to match the e820 type and resource name.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
Changes since v5:
1. Improve the patch log

 arch/x86/include/asm/e820/api.h | 2 ++
 arch/x86/kernel/crash.c | 6 +-
 arch/x86/kernel/e820.c  | 2 +-
 kernel/resource.c   | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 62be73b23d5c..6d5451b36e80 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -42,6 +42,8 @@ extern void e820__register_nosave_regions(unsigned long 
limit_pfn);
 
 extern int  e820__get_entry_type(u64 start, u64 end);
 
+extern const char *e820_type_to_string(struct e820_entry *entry);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained 
inside
  * the ISA region.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..ae724a6e0a5f 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Used while preparing memory map entries for second kernel */
 struct crash_memmap_data {
@@ -314,11 +315,14 @@ static int memmap_entry_callback(struct resource *res, 
void *arg)
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
+   const char *name;
 
ei.addr = res->start;
ei.size = resource_size(res);
ei.type = cmd->type;
-   add_e820_entry(params, );
+   name = e820_type_to_string();
+   if (res->name && !strcmp(name, res->name))
+   add_e820_entry(params, );
 
return 0;
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 50895c2f937d..4c1fe4f8db1e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1011,7 +1011,7 @@ void __init e820__finish_early_params(void)
}
 }
 
-static const char *__init e820_type_to_string(struct e820_entry *entry)
+const char *e820_type_to_string(struct e820_entry *entry)
 {
switch (entry->type) {
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
diff --git a/kernel/resource.c b/kernel/resource.c
index b0fbf685c77a..4ac07717e2b1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -373,6 +373,7 @@ static int find_next_iomem_res(resource_size_t start, 
resource_size_t end,
res->end = min(end, p->end);
res->flags = p->flags;
res->desc = p->desc;
+   res->name = p->name;
return 0;
 }
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v7] resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'

2018-11-15 Thread Lianbo Jiang
The upstream kernel can not accurately add the e820 reserved type to the
kdump kernel e820 table.

Kdump uses walk_iomem_res_desc() to iterate io resources, then adds
the matched resource ranges to the e820 table for the kdump kernel. But,
when converting the e820 type into the iores descriptor, several e820
types are converted to 'IORES_DESC_NONE' in the function e820_type
_to_iores_desc(). So the walk_iomem_res_desc() will get unnecessary
types (such as E820_TYPE_RAM/E820_TYPE_UNUSABLE/E820_TYPE_KERN) when
walking through io resources by the descriptor 'IORES_DESC_NONE'.

This patch adds the new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces. It is helpful to exactly
match the reserved resource ranges when walking through iomem resources.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
Changes since v5:
1. Improve the patch log

Changes since v6:
1. Modify this patch, and add the new I/O resource descriptor
   'IORES_DESC_RESERVED' for the iomem resources search interfaces.
2. Improve patch log.

 arch/x86/kernel/e820.c | 2 +-
 include/linux/ioport.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 50895c2f937d..57fafdafb860 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1048,10 +1048,10 @@ static unsigned long __init 
e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
case E820_TYPE_PMEM:return IORES_DESC_PERSISTENT_MEMORY;
case E820_TYPE_PRAM:return 
IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+   case E820_TYPE_RESERVED:return IORES_DESC_RESERVED;
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE:/* Fall-through: */
-   case E820_TYPE_RESERVED:/* Fall-through: */
default:return IORES_DESC_NONE;
}
 }
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..6ed59de48bd5 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -133,6 +133,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
+   IORES_DESC_RESERVED = 8,
 };
 
 /* helpers to define resources */
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 v7] add reserved e820 ranges to the kdump kernel e820 table

2018-11-15 Thread Lianbo Jiang
These patches add the new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces and also pass the e820 reserved
ranges to kdump kernel.

At present, when use the kexec_file_load syscall to load the kernel image
and initramfs(for example: kexec -s -p xxx), the upstream kernel does not
pass the e820 reserved ranges to the second kernel, which might cause two
problems:

The first one is the MMCONFIG issue, although which does not make the
system crash or hang, this issue is still a potential risk, and also
might lead to the hot-plug device could not be recognized in kdump kernel.
Because the PCI MMCONFIG(extended mode) requires the reserved region
otherwise it falls back to legacy mode. For example, the kdump kernel
outputs the following log.

Example:
..
[   19.798354] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[   19.800653] [Firmware Info]: PCI: MMCONFIG at [mem 0x8000-0x8fff] 
not reserved in ACPI motherboard resources
[   19.800995] PCI: not using MMCONFIG
..

The correct kernel log is like this:
..
[0.082649] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[0.083610] PCI: MMCONFIG at [mem 0x8000-0x8fff] reserved in E820
..

The second issue is that the e820 reserved ranges are not set up in the kdump
kernel, which will cause some functions that are related to the e820 reserved
ranges to become invalid. For example:

early_memremap()->
early_memremap_pgprot_adjust()->
memremap_should_map_decrypted()->
e820__get_entry_type()

Please focus on these functions, early_memremap_pgprot_adjust() and
memremap_should_map_decrypted().

In the first kernel, these ranges sit in e820 reserved ranges, so the
memremap_should_map_decrypted() will return true, that is to say, the
reserved memory is decrypted, then the early_memremap_pgprot_adjust()
will call the pgprot_decrypted() to clear the memory encryption mask.

In the second kernel, because the e820 reserved ranges are not passed
to the second kernel, these ranges don't sit in the e820 reserved ranges,
so the memremap_should_map_decrypted() will return false, that is to say,
the reserved memory is encrypted, and then the early_memremap_pgprot_
adjust() will also call the pgprot_encrypted() to set the memory encryption
mask.

In fact, in the second kernel, the e820 reserved memory is still decrypted.
Obviously, it has gone wrong. So, this issue must be fixed, otherwise kdump
won't work in this case.

The e820 reserved range is useful in kdump kernel, so it is necessary to
pass the e820 reserved ranges to kdump kernel.

Changes since v1:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Modified the invalid SOB chain issue.

Changes since v3:
1. Dropped [PATCH 1/3 v3] resource: fix an error which walks through iomem
   resources. Please refer to this commit <010a93bf97c7> "resource: Fix
   find_next_iomem_res() iteration issue"

Changes since v4:
1. Improve the patch log, and add kernel log.

Changes since v5:
1. Rewrite these patches log.

Changes since v6:
1. Modify the [PATCH 1/2], and add the new I/O resource descriptor
   'IORES_DESC_RESERVED' for the iomem resources search interfaces.
2. Modify the [PATCH 2/2], and walk through io resource based on the
   new descriptor 'IORES_DESC_RESERVED'.

Lianbo Jiang (2):
  resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/x86/kernel/crash.c | 6 ++
 arch/x86/kernel/e820.c  | 2 +-
 include/linux/ioport.h  | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2 v7] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2018-11-15 Thread Lianbo Jiang
At present, when use the kexec_file_load syscall to load the kernel image
and initramfs(for example: kexec -s -p xxx), the upstream kernel does not
pass the e820 reserved ranges to the second kernel, which might cause two
problems:

The first one is the MMCONFIG issue, although which does not make the
system crash or hang, this issue is still a potential risk, and also
might lead to the hot-plug device could not be recognized in kdump kernel.
Because the PCI MMCONFIG(extended mode) requires the reserved region
otherwise it falls back to legacy mode. For example, the kdump kernel
outputs the following log.

Example:
..
[   19.798354] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[   19.800653] [Firmware Info]: PCI: MMCONFIG at [mem 0x8000-0x8fff] 
not reserved in ACPI motherboard resources
[   19.800995] PCI: not using MMCONFIG
..

The correct kernel log is like this:
..
[0.082649] PCI: MMCONFIG for domain  [bus 00-ff] at [mem 
0x8000-0x8fff] (base 0x8000)
[0.083610] PCI: MMCONFIG at [mem 0x8000-0x8fff] reserved in E820
..

The second issue is that the e820 reserved ranges are not set up in the kdump
kernel, which will cause some functions that are related to the e820 reserved
ranges to become invalid. For example:

early_memremap()->
early_memremap_pgprot_adjust()->
memremap_should_map_decrypted()->
e820__get_entry_type()

Please focus on these functions, early_memremap_pgprot_adjust() and
memremap_should_map_decrypted().

In the first kernel, these ranges sit in e820 reserved ranges, so the
memremap_should_map_decrypted() will return true, that is to say, the
reserved memory is decrypted, then the early_memremap_pgprot_adjust()
will call the pgprot_decrypted() to clear the memory encryption mask.

In the second kernel, because the e820 reserved ranges are not passed
to the second kernel, these ranges don't sit in the e820 reserved ranges,
so the memremap_should_map_decrypted() will return false, that is to say,
the reserved memory is encrypted, and then the early_memremap_pgprot_
adjust() will also call the pgprot_encrypted() to set the memory encryption
mask.

In fact, in the second kernel, the e820 reserved memory is still decrypted.
Obviously, it has gone wrong. So, this issue must be fixed, otherwise kdump
won't work in this case.

The e820 reserved range is useful in kdump kernel, so it is necessary to
pass the e820 reserved ranges to kdump kernel.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
Changes since v5:
1. Improve the patch log

Changes since v6:
1. Modify this patch, and walk through io resource based on the
   new descriptor 'IORES_DESC_RESERVED'.
2. Add comment in the code.

 arch/x86/kernel/crash.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..5354a84f1684 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -380,6 +380,12 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, ,
memmap_entry_callback);
 
+   /* Add e820 reserved ranges */
+   cmd.type = E820_TYPE_RESERVED;
+   flags = IORESOURCE_MEM;
+   walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, ,
+  memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 3/4 v8] iommu/amd: Remap the device table of IOMMU with the memory encryption mask for kdump

2018-09-29 Thread Lianbo Jiang
In kdump kernel, it will copy the device table of IOMMU from the old device
table, which is encrypted when SME is enabled in the first kernel. So the
old device table has to be remapped with the memory encryption mask.

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
Acked-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu_init.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 84b3e4445d46..3931c7de7c69 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -902,12 +902,22 @@ static bool copy_device_table(void)
}
}
 
-   old_devtb_phys = entry & PAGE_MASK;
+   /*
+* When SME is enabled in the first kernel, the entry includes the
+* memory encryption mask(sme_me_mask), we must remove the memory
+* encryption mask to obtain the true physical address in kdump kernel.
+*/
+   old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
if (old_devtb_phys >= 0x1ULL) {
pr_err("The address of old device table is above 4G, not 
trustworthy!\n");
return false;
}
-   old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+   old_devtb = (sme_active() && is_kdump_kernel())
+   ? (__force void *)ioremap_encrypted(old_devtb_phys,
+   dev_table_size)
+   : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
if (!old_devtb)
return false;
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/4 v8] x86/ioremap: add a function ioremap_encrypted() to remap kdump old memory

2018-09-29 Thread Lianbo Jiang
When SME is enabled on AMD machine, the memory is encrypted in the first
kernel. In this case, SME also needs to be enabled in kdump kernel, and
the old memory has to be remapped with the memory encryption mask.

Here we only talk about the case that SME is active in the first kernel,
and only care that it's active too in the kdump kernel. There are four
cases that need to be considered.

a. dump vmcore
   It is encrypted in the first kernel, and needs be read out in kdump
   kernel.

b. crash notes
   When dumping vmcore, the people usually need to read the useful
   information from notes, and the notes is also encrypted.

c. iommu device table
   It is allocated by kernel, need fill its pointer into mmio of amd iommu.
   It's encrypted in the first kernel, need read the old content to analyze
   and get useful information.

d. mmio of amd iommu
   Register reported by amd firmware, it's not RAM, which won't be
   encrypted in both the first kernel and kdump kernel.

To achieve the goal, the solution is:
1. add a new bool parameter "encrypted" to __ioremap_caller()
   It is a low level function, and check the newly added parameter, if it's
   true and in kdump kernel, will remap the memory with sme mask.

2. add a new function ioremap_encrypted() to explicitly passed in a "true"
   value for "encrypted".
   For above a, b, c, kdump kernel will call ioremap_encrypted();

3. adjust all existed ioremap wrapper functions, passed in "false" for
   encrypted to make them as before.

   ioremap_encrypted()\
   ioremap_cache() |
   ioremap_prot()  |
   ioremap_wt()|->__ioremap_caller()
   ioremap_wc()|
   ioremap_uc()    |
   ioremap_nocache()  /

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
Changes since v7:
1. Remove a redundant header file "linux/crash_dump.h".(Suggested by
Borislav)
2. Modify code style issue.(Suggested by Borislav)
3. Improve patch log.(Suggested by Baoquan)

 arch/x86/include/asm/io.h |  2 ++
 arch/x86/mm/ioremap.c | 24 
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..b7b0bf36c400 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,8 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned 
long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..24e0920a9b25 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_siz

[PATCH 0/4 v8] Support kdump for AMD secure memory encryption(SME)

2018-09-29 Thread Lianbo Jiang
 2/4(Suggested by Borislav)
6. Delete a file arch/x86/kernel/crash_dump_encrypt.c and rewrite some
functions(Suggested by Borislav)
7. Modify all code style issue(Suggested by Borislav)

Some known issues:
1. about SME
Upstream kernel will hang on HP machine(DL385Gen10 AMD EPYC 7251) when
we execute the kexec command as follow:

# kexec -l /boot/vmlinuz-4.19.0-rc5+ --initrd=/boot/initramfs-4.19.0-rc5+.img 
--command-line="root=/dev/mapper/rhel_hp--dl385g10--03-root ro mem_encrypt=on 
rd.lvm.lv=rhel_hp-dl385g10-03/root rd.lvm.lv=rhel_hp-dl385g10-03/swap 
console=ttyS0,115200n81 LANG=en_US.UTF-8 earlyprintk=serial debug nokaslr"
# kexec -e (or reboot)

But this issue can not be reproduced on speedway machine, and this issue
is irrelevant to my posted patches.

The kernel log:
[ 1248.932239] kexec_core: Starting new kernel
early console in extract_kernel
input_data: 0x00087e91c3b4
input_len: 0x0067fcbd
output: 0x00087d40
output_len: 0x01b6fa90
kernel_total_size: 0x01a9d000
trampoline_32bit: 0x00099000

Decompressing Linux...
Parsing ELF...[---Here the system will hang]

Lianbo Jiang (4):
  x86/ioremap: add a function ioremap_encrypted() to remap kdump old
memory
  kexec: allocate decrypted control pages for kdump in case SME is
enabled
  iommu/amd: Remap the device table of IOMMU with the memory encryption
mask for kdump
  kdump/vmcore: support encrypted old memory with SME enabled

 arch/x86/include/asm/io.h   |  2 +
 arch/x86/kernel/crash_dump_64.c | 65 -
 arch/x86/mm/ioremap.c   | 24 
 drivers/iommu/amd_iommu_init.c  | 14 ++-
 fs/proc/vmcore.c| 24 +---
 include/linux/crash_dump.h  |  3 ++
 kernel/kexec_core.c | 14 +++
 7 files changed, 121 insertions(+), 25 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/4 v8] kexec: allocate decrypted control pages for kdump in case SME is enabled

2018-09-29 Thread Lianbo Jiang
When SME is enabled in the first kernel, it needs to allocate decrypted
pages for kdump, because when it boots to the kdump kernel, these pages
won't be accessed encrypted at the initial stage, in order to boot the
kdump kernel in the same manner as originally booted.

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
Changes since v7:
1. Modify comment in the code.(Suggested by Borislav)
2. Improve patch log.(Suggested by Borislav)

 kernel/kexec_core.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4da38a..6353daaee7f1 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,18 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   if (pages) {
+   /*
+* For kdump, it needs to ensure that these pages are
+* decrypted if SME is enabled.
+* By the way, it is unnecessary to call the arch_
+* kexec_pre_free_pages(), because these pages are
+* reserved memory and once the crash kernel is done,
+* it will always remain in these memory until reboot
+* or unloading.
+*/
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+   }
return pages;
 }
 
@@ -867,6 +879,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -884,6 +897,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 4/4 v8] kdump/vmcore: support encrypted old memory with SME enabled

2018-09-29 Thread Lianbo Jiang
In kdump kernel, the old memory needs to be dumped into vmcore file.
If SME is enabled in the first kernel, the old memory has to be
remapped with the memory encryption mask, which will be automatically
decrypted when read from DRAM.

For SME kdump, there are two cases that doesn't support:

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

1. SME is enabled in the first kernel, but SME is disabled in kdump kernel
In this case, because the old memory is encrypted, it can't be decrypted.
The root cause is that the encryption key is not visible to any software
running on the CPU cores (AMD CPU with SME), and is randomly generated on
each system reset. That is to say, kdump kernel won't have a chance to
get the encryption key. So the encrypted memory can not be decrypted
unless SME is active.

2. SME is disabled in the first kernel, but SME is enabled in kdump kernel
On the one hand, the old memory is decrypted, the old memory can be dumped
as usual, so SME doesn't need to be enabled in kdump kernel; On the other
hand, it will increase the complexity of the code, because that will have
to consider how to pass the SME flag from the first kernel to the kdump
kernel, it is really too expensive to do this.

These patches are only for SME kdump; the patches don't support SEV kdump.

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
Changes since v7:
1. Delete a file arch/x86/kernel/crash_dump_encrypt.c, and move the
copy_oldmem_page_encrypted() to arch/x86/kernel/crash_dump_64.c, also
rewrite some functions.(Suggested by Borislav)
2. Modify all code style issue.(Suggested by Borislav)
3. Remove a reduntant header file.(Suggested by Borislav)
4. Improve patch log.(Suggested by Borislav)

 arch/x86/kernel/crash_dump_64.c | 65 -
 fs/proc/vmcore.c| 24 +---
 include/linux/crash_dump.h  |  3 ++
 3 files changed, 77 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e0778feac..6adbde592c44 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -12,7 +12,7 @@
 #include 
 
 /**
- * copy_oldmem_page - copy one page from "oldmem"
+ * __copy_oldmem_page - copy one page from "old memory encrypted or decrypted"
  * @pfn: page frame number to be copied
  * @buf: target memory address for the copy; this can be in kernel address
  * space or user address space (see @userbuf)
@@ -20,31 +20,78 @@
  * @offset: offset in bytes into the page (based on pfn) to begin the copy
  * @userbuf: if set, @buf is in user address space, use copy_to_user(),
  * otherwise @buf is in kernel address space, use memcpy().
+ * @encrypted: if true, the old memory is encrypted.
+ * if false, the old memory is decrypted.
  *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ * Copy a page from "old memory encrypted or decrypted". For this page, there
+ * is no pte mapped in the current kernel. We stitch up a pte, similar to
+ * kmap_atomic.
  */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-   size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf,
+ bool encrypted)
 {
void  *vaddr;
 
if (!csize)
return 0;
 
-   vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+   if (encrypted)
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, 
PAGE_SIZE);
+   else
+   vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, 
PAGE_SIZE);
+
if (!vaddr)
return -ENOMEM;
 
if (userbuf) {
-   if (copy_to_user(buf, vaddr + offset, csize)) {
-   iounmap(vaddr);
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
 
set_iounmap_nonlazy();
-   iounmap(vaddr);
+   iounmap((void __iomem *)vaddr);
return csize;
 }
+
+/**
+ * copy_oldmem_page - copy one page from "old memory decrypted"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be

[PATCH v8 RESEND 1/4] x86/ioremap: add a function ioremap_encrypted() to remap kdump old memory

2018-09-29 Thread Lianbo Jiang
When SME is enabled on AMD machine, the memory is encrypted in the first
kernel. In this case, SME also needs to be enabled in kdump kernel, and
the old memory has to be remapped with the memory encryption mask.

Here we only talk about the case that SME is active in the first kernel,
and only care that it's active too in the kdump kernel. There are four
cases that need to be considered.

a. dump vmcore
   It is encrypted in the first kernel, and needs be read out in kdump
   kernel.

b. crash notes
   When dumping vmcore, the people usually need to read the useful
   information from notes, and the notes is also encrypted.

c. iommu device table
   It is allocated by kernel, need fill its pointer into mmio of amd iommu.
   It's encrypted in the first kernel, need read the old content to analyze
   and get useful information.

d. mmio of amd iommu
   Register reported by amd firmware, it's not RAM, which won't be
   encrypted in both the first kernel and kdump kernel.

To achieve the goal, the solution is:
1. add a new bool parameter "encrypted" to __ioremap_caller()
   It is a low level function, and check the newly added parameter, if it's
   true and in kdump kernel, will remap the memory with sme mask.

2. add a new function ioremap_encrypted() to explicitly passed in a "true"
   value for "encrypted".
   For above a, b, c, kdump kernel will call ioremap_encrypted();

3. adjust all existed ioremap wrapper functions, passed in "false" for
   encrypted to make them as before.

   ioremap_encrypted()\
   ioremap_cache() |
   ioremap_prot()  |
   ioremap_wt()|->__ioremap_caller()
   ioremap_wc()|
   ioremap_uc()    |
   ioremap_nocache()  /

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
Changes since v7:
1. Remove a redundant header file "linux/crash_dump.h".(Suggested by
Borislav)
2. Modify code style issue.(Suggested by Borislav)
3. Improve patch log.(Suggested by Baoquan)

 arch/x86/include/asm/io.h |  2 ++
 arch/x86/mm/ioremap.c | 24 
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..b7b0bf36c400 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,8 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned 
long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..24e0920a9b25 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_siz

[PATCH v8 RESEND 0/4] Support kdump for AMD secure memory encryption(SME)

2018-09-29 Thread Lianbo Jiang
 2/4(Suggested by Borislav)
6. Delete a file arch/x86/kernel/crash_dump_encrypt.c and rewrite some
functions(Suggested by Borislav)
7. Modify all code style issue(Suggested by Borislav)
8. Modify compile error "fs/proc/vmcore.c:115: undefined reference
   to `copy_oldmem_page_encrypted'"

Some known issues:
1. about SME
Upstream kernel will hang on HP machine(DL385Gen10 AMD EPYC 7251) when
we execute the kexec command as follow:

# kexec -l /boot/vmlinuz-4.19.0-rc5+ --initrd=/boot/initramfs-4.19.0-rc5+.img 
--command-line="root=/dev/mapper/rhel_hp--dl385g10--03-root ro mem_encrypt=on 
rd.lvm.lv=rhel_hp-dl385g10-03/root rd.lvm.lv=rhel_hp-dl385g10-03/swap 
console=ttyS0,115200n81 LANG=en_US.UTF-8 earlyprintk=serial debug nokaslr"
# kexec -e (or reboot)

But this issue can not be reproduced on speedway machine, and this issue
is irrelevant to my posted patches.

The kernel log:
[ 1248.932239] kexec_core: Starting new kernel
early console in extract_kernel
input_data: 0x00087e91c3b4
input_len: 0x0067fcbd
output: 0x00087d40
output_len: 0x01b6fa90
kernel_total_size: 0x01a9d000
trampoline_32bit: 0x00099000

Decompressing Linux...
Parsing ELF...[---Here the system will hang]

Lianbo Jiang (4):
  x86/ioremap: add a function ioremap_encrypted() to remap kdump old
memory
  kexec: allocate decrypted control pages for kdump in case SME is
enabled
  iommu/amd: Remap the device table of IOMMU with the memory encryption
mask for kdump
  kdump/vmcore: support encrypted old memory with SME enabled

 arch/x86/include/asm/io.h   |  2 +
 arch/x86/kernel/crash_dump_64.c | 65 -
 arch/x86/mm/ioremap.c   | 24 
 drivers/iommu/amd_iommu_init.c  | 14 ++-
 fs/proc/vmcore.c| 24 +---
 include/linux/crash_dump.h  | 13 +++
 kernel/kexec_core.c | 14 +++
 7 files changed, 131 insertions(+), 25 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v8 RESEND 4/4] kdump/vmcore: support encrypted old memory with SME enabled

2018-09-29 Thread Lianbo Jiang
In kdump kernel, the old memory needs to be dumped into vmcore file.
If SME is enabled in the first kernel, the old memory has to be
remapped with the memory encryption mask, which will be automatically
decrypted when read from DRAM.

For SME kdump, there are two cases that doesn't support:

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

1. SME is enabled in the first kernel, but SME is disabled in kdump kernel
In this case, because the old memory is encrypted, it can't be decrypted.
The root cause is that the encryption key is not visible to any software
running on the CPU cores (AMD CPU with SME), and is randomly generated on
each system reset. That is to say, kdump kernel won't have a chance to
get the encryption key. So the encrypted memory can not be decrypted
unless SME is active.

2. SME is disabled in the first kernel, but SME is enabled in kdump kernel
On the one hand, the old memory is decrypted, the old memory can be dumped
as usual, so SME doesn't need to be enabled in kdump kernel; On the other
hand, it will increase the complexity of the code, because that will have
to consider how to pass the SME flag from the first kernel to the kdump
kernel, it is really too expensive to do this.

These patches are only for SME kdump; they do not support SEV kdump.

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
Changes since v7:
1. Delete a file arch/x86/kernel/crash_dump_encrypt.c, and move the
copy_oldmem_page_encrypted() to arch/x86/kernel/crash_dump_64.c, also
rewrite some functions.(Suggested by Borislav)
2. Modify all code style issue.(Suggested by Borislav)
3. Remove a reduntant header file.(Suggested by Borislav)
4. Improve patch log.(Suggested by Borislav)
5. Modify compile error "fs/proc/vmcore.c:115: undefined reference
   to `copy_oldmem_page_encrypted'" 

 arch/x86/kernel/crash_dump_64.c | 65 -
 fs/proc/vmcore.c| 24 +---
 include/linux/crash_dump.h  | 13 +++
 3 files changed, 87 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e0778feac..6adbde592c44 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -12,7 +12,7 @@
 #include 
 
 /**
- * copy_oldmem_page - copy one page from "oldmem"
+ * __copy_oldmem_page - copy one page from "old memory encrypted or decrypted"
  * @pfn: page frame number to be copied
  * @buf: target memory address for the copy; this can be in kernel address
  * space or user address space (see @userbuf)
@@ -20,31 +20,78 @@
  * @offset: offset in bytes into the page (based on pfn) to begin the copy
  * @userbuf: if set, @buf is in user address space, use copy_to_user(),
  * otherwise @buf is in kernel address space, use memcpy().
+ * @encrypted: if true, the old memory is encrypted.
+ * if false, the old memory is decrypted.
  *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ * Copy a page from "old memory encrypted or decrypted". For this page, there
+ * is no pte mapped in the current kernel. We stitch up a pte, similar to
+ * kmap_atomic.
  */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-   size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf,
+ bool encrypted)
 {
void  *vaddr;
 
if (!csize)
return 0;
 
-   vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+   if (encrypted)
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, 
PAGE_SIZE);
+   else
+   vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, 
PAGE_SIZE);
+
if (!vaddr)
return -ENOMEM;
 
if (userbuf) {
-   if (copy_to_user(buf, vaddr + offset, csize)) {
-   iounmap(vaddr);
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
 
set_iounmap_nonlazy();
-   iounmap(vaddr);
+   iounmap((void __iomem *)vaddr);
return csize;
 }
+
+/**
+ * copy_oldmem_page - copy one page from "old memory de

[PATCH v8 RESEND 3/4] iommu/amd: Remap the device table of IOMMU with the memory encryption mask for kdump

2018-09-29 Thread Lianbo Jiang
In kdump kernel, it will copy the device table of IOMMU from the old device
table, which is encrypted when SME is enabled in the first kernel. So the
old device table has to be remapped with the memory encryption mask.

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
Acked-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu_init.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 84b3e4445d46..3931c7de7c69 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -902,12 +902,22 @@ static bool copy_device_table(void)
}
}
 
-   old_devtb_phys = entry & PAGE_MASK;
+   /*
+* When SME is enabled in the first kernel, the entry includes the
+* memory encryption mask(sme_me_mask), we must remove the memory
+* encryption mask to obtain the true physical address in kdump kernel.
+*/
+   old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
if (old_devtb_phys >= 0x1ULL) {
pr_err("The address of old device table is above 4G, not 
trustworthy!\n");
return false;
}
-   old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+   old_devtb = (sme_active() && is_kdump_kernel())
+   ? (__force void *)ioremap_encrypted(old_devtb_phys,
+   dev_table_size)
+   : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
if (!old_devtb)
return false;
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v8 RESEND 2/4] kexec: allocate decrypted control pages for kdump in case SME is enabled

2018-09-29 Thread Lianbo Jiang
When SME is enabled in the first kernel, it needs to allocate decrypted
pages for kdump, because when it boots to the kdump kernel, these pages
won't be accessed encrypted at the initial stage, in order to boot the
kdump kernel in the same manner as originally booted.

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
Changes since v7:
1. Modify comment in the code.(Suggested by Borislav)
2. Improve patch log.(Suggested by Borislav)

 kernel/kexec_core.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4da38a..6353daaee7f1 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,18 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   if (pages) {
+   /*
+* For kdump, it needs to ensure that these pages are
+* decrypted if SME is enabled.
+* By the way, it is unnecessary to call the arch_
+* kexec_pre_free_pages(), because these pages are
+* reserved memory and once the crash kernel is done,
+* it will always remain in these memory until reboot
+* or unloading.
+*/
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+   }
return pages;
 }
 
@@ -867,6 +879,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -884,6 +897,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/3 v3] x86/kexec_file: add e820 entry in case e820 type string matches to io resource name

2018-09-21 Thread Lianbo Jiang
kdump uses walk_iomem_res_desc() to iterate io resources, then adds matched
desc to e820 table for kdump kernel.

But IORES_DESC_NONE resource type includes several different e820 types,
we need add exact e820 type to kdump kernel e820 table, thus it also needs
an extra checking in memmap_entry_callback() to match the e820 type and
resource name.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/e820/api.h | 2 ++
 arch/x86/kernel/crash.c | 6 +-
 arch/x86/kernel/e820.c  | 2 +-
 kernel/resource.c   | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 62be73b23d5c..6d5451b36e80 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -42,6 +42,8 @@ extern void e820__register_nosave_regions(unsigned long 
limit_pfn);
 
 extern int  e820__get_entry_type(u64 start, u64 end);
 
+extern const char *e820_type_to_string(struct e820_entry *entry);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained 
inside
  * the ISA region.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..ae724a6e0a5f 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Used while preparing memory map entries for second kernel */
 struct crash_memmap_data {
@@ -314,11 +315,14 @@ static int memmap_entry_callback(struct resource *res, 
void *arg)
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
+   const char *name;
 
ei.addr = res->start;
ei.size = resource_size(res);
ei.type = cmd->type;
-   add_e820_entry(params, &ei);
+   name = e820_type_to_string(&ei);
+   if (res->name && !strcmp(name, res->name))
+   add_e820_entry(params, &ei);
 
return 0;
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c88c23c658c1..f9761b2f7abb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1012,7 +1012,7 @@ void __init e820__finish_early_params(void)
}
 }
 
-static const char *__init e820_type_to_string(struct e820_entry *entry)
+const char *e820_type_to_string(struct e820_entry *entry)
 {
switch (entry->type) {
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
diff --git a/kernel/resource.c b/kernel/resource.c
index f5d9fc70a04c..cc90633f35f9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -366,6 +366,7 @@ static int find_next_iomem_res(struct resource *res, 
unsigned long desc,
res->end = p->end;
res->flags = p->flags;
res->desc = p->desc;
+   res->name = p->name;
return 0;
 }
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 3/3 v3] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2018-09-21 Thread Lianbo Jiang
E820 reserved ranges is useful in kdump kernel, we have added this in
kexec-tools code.

One reason is PCI mmconf (extended mode) requires reserved region
otherwise it falls back to legacy mode.

When AMD SME kdump support, it needs to map dmi table area as unencrypted.
For normal boot, these ranges sit in e820 reserved ranges, thus the early
ioremap code naturally map them as unencrypted. If we also have same e820
reserve setup in kdump kernel then it will just work like normal kernel.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ae724a6e0a5f..3460be990e0c 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -384,6 +384,11 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
memmap_entry_callback);
 
+   /* Add all reserved ranges */
+   cmd.type = E820_TYPE_RESERVED;
+   walk_iomem_res_desc(IORES_DESC_NONE, 0, 0, -1, &cmd,
+   memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/3 v3] add reserved e820 ranges to the kdump kernel e820 table

2018-09-21 Thread Lianbo Jiang
E820 reserved ranges is useful in kdump kernel, we have added this in
kexec-tools code.

One reason is PCI mmconf (extended mode) requires reserved region otherwise
it falls back to legacy mode.

Furthermore, when AMD SME kdump support, it needs to map dmi table area as
unencrypted. For normal boot, these ranges sit in e820 reserved ranges,
thus the early ioremap code naturally map them as unencrypted. If we also
have same e820 reserve setup in kdump kernel then it will just work like
normal kernel.

Kdump uses walk_iomem_res_desc to iterate resources, then adds matched desc
to e820 table for the kdump kernel.

But IORES_DESC_NONE resource type includes several different e820 types, we
need add exact e820 type to the kdump kernel e820 table, thus it also needs
an extra checking in memmap_entry_callback() to match the e820 type and
resource name.

By the way, we also fix an error which walks through iomem resources, the
values of the function parameter may be modified in the while loop of
__walk_iomem_res_desc(), which will cause us to not get the desired result
in some cases.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Modified the invalid SOB chain issue.

Lianbo Jiang (3):
  resource: fix an error which walks through iomem resources
  x86/kexec_file: add e820 entry in case e820 type string matches to io
resource name
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/x86/include/asm/e820/api.h |  2 ++
 arch/x86/kernel/crash.c | 11 ++-
 arch/x86/kernel/e820.c  |  2 +-
 kernel/resource.c   |  3 +++
 4 files changed, 16 insertions(+), 2 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/3 v3] resource: fix an error which walks through iomem resources

2018-09-21 Thread Lianbo Jiang
When we walk through iomem resources by calling walk_iomem_res_desc(),
the values of the function parameter may be modified in the while loop
of __walk_iomem_res_desc(), which will cause us to not get the desired
result in some cases.

At present, it only restores the original value of res->end, but it
doesn't restore the original value of res->flags in the while loop of
__walk_iomem _res_desc(). Whenever the find_next_iomem_res() finds a
resource and returns the result, the original values of this resource
will be modified, which might lead to an error in the next loop. For
example:

The original value of resource flags is:
 res->flags=0x8200(initial value)

p->flags   _ 0x81000200 __ 0x8200 _
  /  \  /  \
||___A||_._|__B_|..___|
00x
(memory address ranges)

Note: if ((p->flags & res->flags) != res->flags) continue;

When the resource A is found, the original value of this resource flags
will be changed to 0x81000200(res->flags=0x81000200), and continue to
look for the next resource, when the loop reaches resource B, it can not
get the resource B any more(you can refer to the for loop of find_next
_iomem_res()), because the value of conditional expression will become
true and will also jump the resource B.

In fact, we should get the resource A and B when we walk through the
whole tree, but it only gets the resource A, the resource B is missed.

Signed-off-by: Lianbo Jiang 
---
 kernel/resource.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..f5d9fc70a04c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -375,6 +375,7 @@ static int __walk_iomem_res_desc(struct resource *res, 
unsigned long desc,
 int (*func)(struct resource *, void *))
 {
u64 orig_end = res->end;
+   u64 orig_flags = res->flags;
int ret = -1;
 
while ((res->start < res->end) &&
@@ -385,6 +386,7 @@ static int __walk_iomem_res_desc(struct resource *res, 
unsigned long desc,
 
res->start = res->end + 1;
res->end = orig_end;
+   res->flags = orig_flags;
}
 
return ret;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v7 RESEND 0/4] Support kdump for AMD secure memory encryption(SME)

2018-09-27 Thread Lianbo Jiang
arlyprintk=serial debug nokaslr"
# kexec -e (or reboot)

But this issue can not be reproduced on speedway machine, and this issue
is irrelevant to my posted patches.

The kernel log:
[ 1248.932239] kexec_core: Starting new kernel
early console in extract_kernel
input_data: 0x00087e91c3b4
input_len: 0x0067fcbd
output: 0x00087d40
output_len: 0x01b6fa90
kernel_total_size: 0x01a9d000
trampoline_32bit: 0x00099000

Decompressing Linux...
Parsing ELF...[---Here the system will hang]

Lianbo Jiang (4):
  x86/ioremap: add a function ioremap_encrypted() to remap kdump old
memory
  kexec: allocate unencrypted control pages for kdump in case SME is
enabled
  iommu/amd: Remap the device table of IOMMU with the memory encryption
mask for kdump
  kdump/vmcore: support encrypted old memory with SME enabled

 arch/x86/include/asm/io.h|  3 ++
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 arch/x86/mm/ioremap.c| 25 -
 drivers/iommu/amd_iommu_init.c   | 14 ++--
 fs/proc/vmcore.c | 21 +++
 include/linux/crash_dump.h   | 12 +++
 kernel/kexec_core.c  | 12 +++
 8 files changed, 125 insertions(+), 16 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v7 RESEND 1/4] x86/ioremap: add a function ioremap_encrypted() to remap kdump old memory

2018-09-27 Thread Lianbo Jiang
When SME is enabled on AMD machine, the memory is encrypted in the first
kernel. In this case, SME also needs to be enabled in kdump kernel, and
we have to remap the old memory with the memory encryption mask.

Here we only talk about the case that SME is active in the first kernel,
and only care it's active too in kdump kernel. there are four cases we
need considered.

a. dump vmcore
   It is encrypted in the first kernel, and needs be read out in kdump
   kernel.

b. crash notes
   When dumping vmcore, the people usually need to read the useful
   information from notes, and the notes is also encrypted.

c. iommu device table
   It is allocated by kernel, need fill its pointer into mmio of amd iommu.
   It's encrypted in the first kernel, need read the old content to analyze
   and get useful information.

d. mmio of amd iommu
   Register reported by amd firmware, it's not RAM, we don't encrypt in
   both the first kernel and kdump kernel.

To achieve the goal, the solution is:
1. add a new bool parameter "encrypted" to __ioremap_caller()
   It is a low level function, and check the newly added parameter, if it's
   true and in kdump kernel, will remap the memory with sme mask.

2. add a new function ioremap_encrypted() to explicitly passed in a "true"
   value for "encrypted".
   For above a, b, c, we will call ioremap_encrypted();

3. adjust all existed ioremap wrapper functions, passed in "false" for
   encrypted to make them as before.

   ioremap_encrypted()\
   ioremap_cache() |
   ioremap_prot()  |
   ioremap_wt()|->__ioremap_caller()
   ioremap_wc()|
   ioremap_uc()    |
   ioremap_nocache()  /

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c | 25 +
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..f8795f9581c7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -192,6 +192,9 @@ extern void __iomem *ioremap_cache(resource_size_t offset, 
unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, 
unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
+   unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap -   map bus memory into CPU space
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..e01e6c695add 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "physaddr.h"
 
@@ -131,7 +132,8 @@ static void __ioremap_check_mem(resource_size_t addr, 
unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-   unsigned long size, enum page_cache_mode pcm, void *caller)
+   unsigned long size, enum page_cache_mode pcm,
+   void *caller, bool encrypted)
 {
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +201,7 @@ static void __iomem *__ioremap_caller(resource_size_t 
phys_addr,
 * resulting mapping.
 */
prot = PAGE_KERNEL_IO;
-   if (sev_active() && mem_flags.desc_other)
+   if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
 
switch (pcm) {
@@ -291,7 +293,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +326,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, 
unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
return __ioremap_caller(phys_addr, size, pcm,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +343,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-   __builtin_return_address(0));
+   __builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +360,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {

[PATCH 0/3 v2] add reserved e820 ranges to the kdump kernel e820 table

2018-09-19 Thread Lianbo Jiang
E820 reserved ranges is useful in kdump kernel, we have added this in
kexec-tools code.

One reason is PCI mmconf (extended mode) requires reserved region otherwise
it falls back to legacy mode.

Furthermore, when AMD SME kdump support, it needs to map dmi table area as
unencrypted. For normal boot these ranges sit in e820 reserved ranges thus
the early ioremap code naturally map them as unencrypted. So if we have same
e820 reserve setup in kdump kernel then it will just work like normal kernel.

Kdump uses walk_iomem_res_desc to iterate resources, then adds matched desc to
e820 table for the kdump kernel.

But IORES_DESC_NONE resource type includes several different e820 types, we
need add exact e820 type to the kdump kernel e820 table thus need an extra
checking in memmap_entry_callback() to match the e820 type and resource name.

By the way, we also fix an error which walks through iomem resources, the
values of the function parameter may be modified in the while loop of
__walk_iomem_res_desc(), which will cause us to not get the desired result
in some cases.

Changes since v1:
1. We modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Lianbo Jiang (3):
  resource: fix an error which walks through iomem resources
  x86/kexec_file: add e820 entry in case e820 type string matches to io
resource name
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/x86/include/asm/e820/api.h |  2 ++
 arch/x86/kernel/crash.c | 11 ++-
 arch/x86/kernel/e820.c  |  2 +-
 kernel/resource.c   |  3 +++
 4 files changed, 16 insertions(+), 2 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 3/3 v2] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2018-09-19 Thread Lianbo Jiang
E820 reserved ranges is useful in kdump kernel, we have added this in
kexec-tools code.

One reason is PCI mmconf (extended mode) requires reserved region
otherwise it falls back to legacy mode.

When AMD SME kdump support, it needs to map dmi table area as unencrypted.
For normal boot these ranges sit in e820 reserved ranges thus the early
ioremap code naturally map them as unencrypted. So if we have same e820
reserve setup in kdump kernel then it will just work like normal kernel.

Signed-off-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ae724a6e0a5f..3460be990e0c 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -384,6 +384,11 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
memmap_entry_callback);
 
+   /* Add all reserved ranges */
+   cmd.type = E820_TYPE_RESERVED;
+   walk_iomem_res_desc(IORES_DESC_NONE, 0, 0, -1, &cmd,
+   memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/3 v2] resource: fix an error which walks through iomem resources

2018-09-19 Thread Lianbo Jiang
When we walk through iomem resources by calling walk_iomem_res_desc(),
the values of the function parameter may be modified in the while loop
of __walk_iomem_res_desc(), which will cause us to not get the desired
result in some cases.

At present, it only restores the original value of res->end, but it
doesn't restore the original value of res->flags in the while loop of
__walk_iomem _res_desc(). Whenever the find_next_iomem_res() finds a
resource and returns the result, the original values of this resource
will be modified, which might lead to an error in the next loop. For
example:

The original value of resource flags is:
 res->flags=0x8200(initial value)

p->flags   _ 0x81000200 __ 0x8200 _
  /  \  /  \
||___A||_._|__B_|..___|
00x
(memory address ranges)

Note: if ((p->flags & res->flags) != res->flags) continue;

When the resource A is found, the original value of this resource flags
will be changed to 0x81000200(res->flags=0x81000200), and continue to
look for the next resource, when the loop reaches resource B, it can not
get the resource B any more(you can refer to the for loop of find_next
_iomem_res()), because the value of conditional expression will become
true and will also jump the resource B.

In fact, we should get the resource A and B when we walk through the
whole tree, but it only gets the resource A, the resource B is missed.

Signed-off-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 kernel/resource.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..f5d9fc70a04c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -375,6 +375,7 @@ static int __walk_iomem_res_desc(struct resource *res, 
unsigned long desc,
 int (*func)(struct resource *, void *))
 {
u64 orig_end = res->end;
+   u64 orig_flags = res->flags;
int ret = -1;
 
while ((res->start < res->end) &&
@@ -385,6 +386,7 @@ static int __walk_iomem_res_desc(struct resource *res, 
unsigned long desc,
 
res->start = res->end + 1;
res->end = orig_end;
+   res->flags = orig_flags;
}
 
return ret;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/3 v2] x86/kexec_file: add e820 entry in case e820 type string matches to io resource name

2018-09-19 Thread Lianbo Jiang
kdump uses walk_iomem_res_desc to iterate io resources, then adds matched
desc to e820 table for kdump kernel.

But IORES_DESC_NONE resource type includes several different e820 types,
we need add exact e820 type to kdump kernel e820 table thus need an extra
checking in memmap_entry_callback() to match the e820 type and resource
name.

Signed-off-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/include/asm/e820/api.h | 2 ++
 arch/x86/kernel/crash.c | 6 +-
 arch/x86/kernel/e820.c  | 2 +-
 kernel/resource.c   | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 62be73b23d5c..6d5451b36e80 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -42,6 +42,8 @@ extern void e820__register_nosave_regions(unsigned long 
limit_pfn);
 
 extern int  e820__get_entry_type(u64 start, u64 end);
 
+extern const char *e820_type_to_string(struct e820_entry *entry);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained 
inside
  * the ISA region.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..ae724a6e0a5f 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Used while preparing memory map entries for second kernel */
 struct crash_memmap_data {
@@ -314,11 +315,14 @@ static int memmap_entry_callback(struct resource *res, 
void *arg)
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
+   const char *name;
 
ei.addr = res->start;
ei.size = resource_size(res);
ei.type = cmd->type;
-   add_e820_entry(params, &ei);
+   name = e820_type_to_string(&ei);
+   if (res->name && !strcmp(name, res->name))
+   add_e820_entry(params, &ei);
 
return 0;
 }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c88c23c658c1..f9761b2f7abb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1012,7 +1012,7 @@ void __init e820__finish_early_params(void)
}
 }
 
-static const char *__init e820_type_to_string(struct e820_entry *entry)
+const char *e820_type_to_string(struct e820_entry *entry)
 {
switch (entry->type) {
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
diff --git a/kernel/resource.c b/kernel/resource.c
index f5d9fc70a04c..cc90633f35f9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -366,6 +366,7 @@ static int find_next_iomem_res(struct resource *res, 
unsigned long desc,
res->end = p->end;
res->flags = p->flags;
res->desc = p->desc;
+   res->name = p->name;
return 0;
 }
 
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v7 RESEND 2/4] kexec: allocate unencrypted control pages for kdump in case SME is enabled

2018-09-27 Thread Lianbo Jiang
When SME is enabled in the first kernel, we will allocate unencrypted pages
for kdump in order to be able to boot the kdump kernel like kexec.

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
 kernel/kexec_core.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4da38a..e7efcd1a977b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,16 @@ static struct page 
*kimage_alloc_crash_control_pages(struct kimage *image,
}
}
 
+   if (pages) {
+   /*
+* For kdump, we need to ensure that these pages are
+* unencrypted pages if SME is enabled.
+* By the way, it is unnecessary to call the arch_
+* kexec_pre_free_pages(), which will make the code
+* become more simple.
+*/
+   arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+   }
return pages;
 }
 
@@ -867,6 +877,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result  = -ENOMEM;
goto out;
}
+   arch_kexec_post_alloc_pages(page_address(page), 1, 0);
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
@@ -884,6 +895,7 @@ static int kimage_load_crash_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
+   arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
goto out;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v7 RESEND 4/4] kdump/vmcore: support encrypted old memory with SME enabled

2018-09-27 Thread Lianbo Jiang
In kdump kernel, we need to dump the old memory into vmcore file,if SME
is enabled in the first kernel, we have to remap the old memory with the
memory encryption mask, which will be automatically decrypted when we
read from DRAM.

For SME kdump, there are two cases that doesn't support:

 --
| first-kernel | second-kernel | kdump support |
|  (mem_encrypt=on|off)|   (yes|no)|
|--+---+---|
| on   | on| yes   |
| off  | off   | yes   |
| on   | off   | no|
| off  | on| no|
|__|___|___|

1. SME is enabled in the first kernel, but SME is disabled in kdump kernel
In this case, because the old memory is encrypted, we can't decrypt the
old memory.

2. SME is disabled in the first kernel, but SME is enabled in kdump kernel
On the one hand, the old memory is unencrypted, the old memory can be dumped
as usual, we don't need to enable SME in kdump kernel; On the other hand, it
will increase the complexity of the code, we will have to consider how to
pass the SME flag from the first kernel to the kdump kernel, it is really
too expensive to do this.

This patches are only for SME kdump, the patches don't support SEV kdump.

Signed-off-by: Lianbo Jiang 
Reviewed-by: Tom Lendacky 
---
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/crash_dump_encrypt.c | 53 
 fs/proc/vmcore.c | 21 +++
 include/linux/crash_dump.h   | 12 +++
 4 files changed, 81 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/crash_dump_encrypt.c

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8824d01c0c35..dfbeae0e35ce 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -97,6 +97,7 @@ obj-$(CONFIG_KEXEC_CORE)  += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)   += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)   += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump_$(BITS).o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)  += crash_dump_encrypt.o
 obj-y  += kprobes/
 obj-$(CONFIG_MODULES)  += module.o
 obj-$(CONFIG_DOUBLEFAULT)  += doublefault.o
diff --git a/arch/x86/kernel/crash_dump_encrypt.c 
b/arch/x86/kernel/crash_dump_encrypt.c
new file mode 100644
index ..e1b1a577f197
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_encrypt.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Lianbo Jiang (liji...@redhat.com)
+ * Copyright (C) RedHat Corporation, 2018. All rights reserved
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * copy_oldmem_page_encrypted - copy one page from "oldmem encrypted"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem encrypted". For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to
+ * kmap_atomic.
+ */
+
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+   size_t csize, unsigned long offset, int userbuf)
+{
+   void  *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT,
+ PAGE_SIZE);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+   iounmap((void __iomem *)vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   set_iounmap_nonlazy();
+   iounmap((void __iomem *)vaddr);
+   return csize;
+}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cbde728f8ac6..3065c8bada6a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,6 +25,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include "internal.h"
 
 /* List representing chunks of contiguous memory areas and their offsets in
@@ -98,7 +101,8 @@ static int pfn_is_ram(unsigned long pfn)
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
-   u64 *ppos, int userbuf)
+   u64 *ppos, int userbuf,
+   bool encrypted)

[PATCH 2/2 v5] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2019-01-06 Thread Lianbo Jiang
For AMD machine with SME feature, makedumpfile tools need to know
whether the crash kernel was encrypted or not. If SME is enabled
in the first kernel, the crash kernel's page table(pgd/pud/pmd/pte)
contains the memory encryption mask, so need to remove the sme mask
to obtain the true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..bc4108096b18 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,13 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 v5] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2019-01-06 Thread Lianbo Jiang
This patchset did two things:
a. add a new document for vmcoreinfo

This document lists some variables that export to vmcoreinfo, and briefly
describles what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo, and it also normalizes the
exported variable as a convention between kernel and use-space.

b. export the value of sme mask to vmcoreinfo

For AMD machine with SME feature, makedumpfile tools need to know whether
the crash kernel was encrypted or not. If SME is enabled in the first
kernel, the crash kernel's page table(pgd/pud/pmd/pte) contains the
memory encryption mask, so need to remove the sme mask to obtain the true
physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Changes since v2:
1. Improve the vmcoreinfo document, add more descriptions for these
variables exported.
2. Fix spelling errors in the document.

Changes since v3:
1. Still improve the vmcoreinfo document, and make it become more
clear and easy to read.
2. Move sme_mask comments in the code to the vmcoreinfo document.
3. Improve patch log.

Changes since v4:
1. Remove a command that dumping the VMCOREINFO contents from this
   document.
2. Merge the 'PG_buddy' and 'PG_offline' into the PG_* flag in this
   document.
3. Correct some of the mistakes in this document.

*** BLURB HERE ***

Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 500 +
 arch/x86/kernel/machine_kexec_64.c |   3 +
 2 files changed, 503 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v5] kdump: add the vmcoreinfo documentation

2019-01-06 Thread Lianbo Jiang
This document lists some variables that export to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo, and it also normalizes the
exported variables as a convention between kernel and user-space.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 500 +
 1 file changed, 500 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt 
b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..8e444586b87b
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,500 @@
+
+   VMCOREINFO
+
+
+===
+What is the VMCOREINFO?
+===
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built.
+
+PAGE_SIZE
+-
+
+The size of a page. It is the smallest unit of data for memory
+management in kernel. It is usually 4096 bytes and a page is aligned
+on 4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+---
+
+This is the UTS namespace, which is used to isolate two specific
+elements of the system that relate to the uname(2) system call. The UTS
+namespace is named after the data structure used to store information
+returned by the uname(2) system call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---
+
+An array node_states[N_ONLINE] which represents the set of online node
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+-
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+--
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--
+
+Stores the virtual area list. makedumpfile can get the vmalloc start
+value from this variable. This value is necessary for vmalloc translation.
+
+mem_map
+---
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+
+
+Makedumpfile can get the pglist_data structure from this symbol, which
+is used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them are used to translate an
+address.
+
+page
+
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute the contiguous memory.
+
+pglist_data
+---
+
+The size of a pglist_data structure. This value will be used to check
+if the pglist_data structure is valid. It is also used for checking the
+memory type.
+
+zone
+
+
+The size of a zone structure. This value is often used to check if the
+zone structure has been found. It is also used for excluding free pages.
+
+free_area
+-
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful for excluding free pages.
+
+list_head
+-
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+--
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+   compound_order|compound_head)
+---
+
+User-space tools can compute their values based on the offset of these
+variables. The variables are helpful

[PATCH 0/2] makedumpfile needs to remove the memory encryption

2019-01-22 Thread Lianbo Jiang
The patchset did two things:
[1] add a new variable 'sme_mask' to number_table

The variable will be used to store the sme mask for crashed kernel,
the sme_mask denotes whether the old memory is encrypted or not.

[2] remove the memory encryption mask to obtain the true physical
address

For AMD machine with SME feature, if SME is enabled in the first
kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains
the memory encryption mask, so makedumpfile needs to remove the
memory encryption mask to obtain the true physical address.

References:

x86/kdump: Export the SME mask to vmcoreinfo
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=65f750e5457aef9a8085a99d613fea0430303e93
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=f263245a0ce2c4e23b89a58fa5f7dfc048e11929

Lianbo Jiang (2):
  Makedumpfile: add a new variable 'sme_mask' to number_table
  Remove the memory encryption mask to obtain the true physical address

 arch/x86_64.c  | 3 +++
 makedumpfile.c | 4 
 makedumpfile.h | 1 +
 3 files changed, 8 insertions(+)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2] Makedumpfile: add a new variable 'sme_mask' to number_table

2019-01-22 Thread Lianbo Jiang
It will be used to store the sme mask for crashed kernel, the
sme_mask denotes whether the old memory is encrypted or not.

Signed-off-by: Lianbo Jiang 
---
 makedumpfile.c | 3 +++
 makedumpfile.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/makedumpfile.c b/makedumpfile.c
index 8923538..a03aaa1 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -1743,6 +1743,7 @@ get_structure_info(void)
ENUM_NUMBER_INIT(NR_FREE_PAGES, "NR_FREE_PAGES");
ENUM_NUMBER_INIT(N_ONLINE, "N_ONLINE");
ENUM_NUMBER_INIT(pgtable_l5_enabled, "pgtable_l5_enabled");
+   ENUM_NUMBER_INIT(sme_mask, "sme_mask");
 
ENUM_NUMBER_INIT(PG_lru, "PG_lru");
ENUM_NUMBER_INIT(PG_private, "PG_private");
@@ -2276,6 +2277,7 @@ write_vmcoreinfo_data(void)
WRITE_NUMBER("NR_FREE_PAGES", NR_FREE_PAGES);
WRITE_NUMBER("N_ONLINE", N_ONLINE);
WRITE_NUMBER("pgtable_l5_enabled", pgtable_l5_enabled);
+   WRITE_NUMBER("sme_mask", sme_mask);
 
WRITE_NUMBER("PG_lru", PG_lru);
WRITE_NUMBER("PG_private", PG_private);
@@ -2672,6 +2674,7 @@ read_vmcoreinfo(void)
READ_NUMBER("NR_FREE_PAGES", NR_FREE_PAGES);
READ_NUMBER("N_ONLINE", N_ONLINE);
READ_NUMBER("pgtable_l5_enabled", pgtable_l5_enabled);
+   READ_NUMBER("sme_mask", sme_mask);
 
READ_NUMBER("PG_lru", PG_lru);
READ_NUMBER("PG_private", PG_private);
diff --git a/makedumpfile.h b/makedumpfile.h
index 73813ed..e97b2e7 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -1912,6 +1912,7 @@ struct number_table {
longNR_FREE_PAGES;
longN_ONLINE;
longpgtable_l5_enabled;
+   longsme_mask;
 
/*
* Page flags
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2] Remove the memory encryption mask to obtain the true physical address

2019-01-22 Thread Lianbo Jiang
For AMD machine with SME feature, if SME is enabled in the first
kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains
the memory encryption mask, so makedumpfile needs to remove the
memory encryption mask to obtain the true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86_64.c  | 3 +++
 makedumpfile.c | 1 +
 2 files changed, 4 insertions(+)

diff --git a/arch/x86_64.c b/arch/x86_64.c
index 537fb78..7651d36 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -346,6 +346,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long pagetable)
return NOT_PADDR;
}
pud_paddr  = pgd & ENTRY_MASK;
+   pud_paddr = pud_paddr & ~(NUMBER(sme_mask));
}
 
/*
@@ -371,6 +372,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long pagetable)
 * Get PMD.
 */
pmd_paddr  = pud_pte & ENTRY_MASK;
+   pmd_paddr = pmd_paddr & ~(NUMBER(sme_mask));
pmd_paddr += pmd_index(vaddr) * sizeof(unsigned long);
if (!readmem(PADDR, pmd_paddr, &pmd_pte, sizeof pmd_pte)) {
ERRMSG("Can't get pmd_pte (pmd_paddr:%lx).\n", pmd_paddr);
@@ -391,6 +393,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long pagetable)
 * Get PTE.
 */
pte_paddr  = pmd_pte & ENTRY_MASK;
+   pte_paddr = pte_paddr & ~(NUMBER(sme_mask));
pte_paddr += pte_index(vaddr) * sizeof(unsigned long);
if (!readmem(PADDR, pte_paddr, &pte, sizeof pte)) {
ERRMSG("Can't get pte (pte_paddr:%lx).\n", pte_paddr);
diff --git a/makedumpfile.c b/makedumpfile.c
index a03aaa1..81c7bb4 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -977,6 +977,7 @@ next_page:
read_size = MIN(info->page_size - PAGEOFFSET(paddr), size);
 
pgaddr = PAGEBASE(paddr);
+   pgaddr = pgaddr & ~(NUMBER(sme_mask));
pgbuf = cache_search(pgaddr, read_size);
if (!pgbuf) {
++cache_miss;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v3] kdump: add the vmcoreinfo documentation

2018-12-16 Thread Lianbo Jiang
This document lists some variables that export to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo, and it would normalize the
exported variable as a standard ABI between kernel and user-space.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 456 +
 1 file changed, 456 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt 
b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..d71260bf383a
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,456 @@
+
+   Documentation for VMCOREINFO
+
+
+===
+What is the VMCOREINFO?
+===
+It is a special ELF note section. The VMCOREINFO contains the first
+kernel's various information, for example, structure size, page size,
+symbol values and field offset, etc. These data are packed into an ELF
+note section, and these data will also help user-space tools(e.g. crash
+makedumpfile) analyze the first kernel's memory usage.
+
+In general, makedumpfile can dump the VMCOREINFO contents from vmlinux
+in the first kernel. For example:
+# makedumpfile -g VMCOREINFO -x vmlinux
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+The number of OS release. Based on this version number, people can find
+the source code for the corresponding version. When analyzing the vmcore,
+people must read the source code to find the reason why the kernel crashed.
+
+PAGE_SIZE
+=
+The size of a page. It is the smallest unit of data for memory management
+in kernel. It is usually 4k bytes and the page is aligned in 4k bytes,
+which is very important for computing address.
+
+init_uts_ns
+===
+This is the UTS namespace, which is used to isolate two specific elements
+of the system that relate to the uname system call. The UTS namespace is
+named after the data structure used to store information returned by the
+uname system call.
+
+User-space tools can get the kernel name, host name, kernel release number,
+kernel version, architecture name and OS type from the 'init_uts_ns'.
+
+node_online_map
+===
+It is a macro definition, actually it is an array node_states[N_ONLINE],
+and it represents the set of online node in a system, one bit position
+per node number.
+
+This is used to keep track of which nodes are in the system and online.
+
+swapper_pg_dir
+=
+It generally indicates the pgd for the kernel. When mmu is enabled in
+config file, the 'swapper_pg_dir' is valid.
+
+The 'swapper_pg_dir' helps to translate the virtual address to a physical
+address.
+
+_stext
+==
+It is an assemble symbol that defines the beginning of the text section.
+In general, the '_stext' indicates the kernel start address. This is used
+to convert a virtual address to a physical address when the virtual address
+does not belong to the 'vmalloc' address.
+
+vmap_area_list
+==
+It stores the virtual area list, makedumpfile can get the vmalloc start
+value from this variable. This value is necessary for vmalloc translation.
+
+mem_map
+===
+Physical addresses are translated to struct pages by treating them as an
+index into the mem_map array. Shifting a physical address PAGE_SHIFT bits
+to the right will treat it as a PFN from physical address 0, which is also
+an index within the mem_map array.
+
+In short, it can map the address to struct page.
+
+contig_page_data
+
+Makedumpfile can get the pglist_data structure from this symbol
+'contig_page_data'. The pglist_data structure is used to describe the
+memory layout.
+
+User-space tools can use this symbols for excluding free pages.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+==
+Export the address of 'mem_section' array, and it's length, structure size,
+and the 'section_mem_map' offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them will help to translate
+the address.
+
+page
+
+The size of a 'page' structure. In kernel, the page is an important data
+structure, it is widely used to compute the continuous memory.
+
+pglist_data
+===
+The size of a 'pglist_data' structure. This value will be used to check if
+the 'pglist_data' structure is valid. It is also one of the conditions for
+checking the memory type.
+
+zone
+
+The size of a 'zone' structure. This value is often used to check if the
+'zone' structure is found. It is necessary structures for excluding free
+pages.
+
+free_area

[PATCH 2/2 v3] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-16 Thread Lianbo Jiang
For AMD machine with SME feature, makedumpfile tools need to know
whether the crash kernel was encrypted or not. If SME is enabled
in the first kernel, the crash kernel's page table(pgd/pud/pmd/pte)
contains the memory encryption mask, so need to remove the sme mask
to obtain the true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..1860fe24117d 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,24 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   /*
+* Currently, the local variable 'sme_mask' stores the value of
+* sme_me_mask(bit 47), and also write the value of sme_mask to
+* the vmcoreinfo.
+* If need, the bit(sme_mask) might be redefined in the future,
+* but the 'bit63' will be reserved.
+* For example:
+* [ misc  ][ enc bit  ][ other misc SME info   ]
+* ____1000______..._
+* 63   59   55   51   47   43   39   35   31   27   ... 3
+*/
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 v3] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-16 Thread Lianbo Jiang
This patchset did two things:
a. add a new document for vmcoreinfo

This document lists some variables that export to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo, and it would normalize the
exported variable as a standard ABI between kernel and user-space.

b. export the value of sme mask to vmcoreinfo

For AMD machine with SME feature, makedumpfile tools need to know whether
the crash kernel was encrypted or not. If SME is enabled in the first
kernel, the crash kernel's page table(pgd/pud/pmd/pte) contains the
memory encryption mask, so need to remove the sme mask to obtain the true
physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Changes since v2:
1. Improve the vmcoreinfo document, add more descriptions for these
variables exported.
2. Fix spelling errors in the document.

Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 456 +
 arch/x86/kernel/machine_kexec_64.c |  14 +
 2 files changed, 470 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2 v8] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2018-11-29 Thread Lianbo Jiang
At present, when use the kexec_file_load syscall to load the kernel image
and initramfs(for example: kexec -s -p xxx), kernel does not pass the e820
reserved ranges to the second kernel, which might cause two problems:

The first one is the MMCONFIG issue. The basic problem is that this device
is in PCI segment 1 and the kernel PCI probing can not find it without all
the e820 I/O reservations being present in the e820 table. And the kdump
kernel does not have those reservations because the kexec command does not
pass the I/O reservation via the "memmap=xxx" command line option. (This
problem does not show up for other vendors, as SGI is apparently the
actually fails for everyone, but devices in segment 0 are then found by
some legacy lookup method.) The workaround for this is to pass the I/O
reserved regions to the kdump kernel.

MMCONFIG(aka ECAM) space is described in the ACPI MCFG table. If you don't
have ECAM: (a) PCI devices won't work at all on non-x86 systems that use
only ECAM for config access, (b) you won't be able to access devices on
non-0 segments, (c) you won't be able to access extended config space(
address 0x100-0x), which means none of the Extended Capabilities will
be available(AER, ACS, ATS, etc). [Bjorn's comment]

The second issue is that the SME kdump kernel doesn't work without the
e820 reserved ranges. When SME is active in kdump kernel, actually, those
reserved regions are still decrypted, but because those reserved ranges are
not present at all in kdump kernel e820 table, those reserved regions are
considered as encrypted, it goes wrong.

The e820 reserved range is useful in kdump kernel, so it is necessary to
pass the e820 reserved ranges to kdump kernel.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..5354a84f1684 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -380,6 +380,12 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, ,
memmap_entry_callback);
 
+   /* Add e820 reserved ranges */
+   cmd.type = E820_TYPE_RESERVED;
+   flags = IORESOURCE_MEM;
+   walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, ,
+  memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v8] resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'

2018-11-29 Thread Lianbo Jiang
When doing kexec_file_load, the first kernel needs to pass the e820
reserved ranges to the second kernel. But kernel can not exactly
match the e820 reserved ranges when walking through the iomem resources
with the descriptor 'IORES_DESC_NONE', because several e820 types(
e.g. E820_TYPE_RESERVED_KERN/E820_TYPE_RAM/E820_TYPE_UNUSABLE/E820
_TYPE_RESERVED) are converted to the descriptor 'IORES_DESC_NONE'. It
may pass these four types to the kdump kernel, that is not desired result.

So, this patch adds a new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces. It is helpful to exactly
match the reserved resource ranges when walking through iomem resources.

In addition, since the new descriptor 'IORES_DESC_RESERVED' is introduced,
these code originally related to the descriptor 'IORES_DESC_NONE' need to
be updated. Otherwise, it will be easily confused and also cause some
errors. Because the 'E820_TYPE_RESERVED' type is converted to the new
descriptor 'IORES_DESC_RESERVED' instead of 'IORES_DESC_NONE', it has been
changed.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/ia64/kernel/efi.c |  4 
 arch/x86/kernel/e820.c |  2 +-
 arch/x86/mm/ioremap.c  | 13 -
 include/linux/ioport.h |  1 +
 kernel/resource.c  |  6 +++---
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 8f106638913c..1841e9b4db30 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1231,6 +1231,10 @@ efi_initialize_iomem_resources(struct resource 
*code_resource,
break;
 
case EFI_RESERVED_TYPE:
+   name = "reserved";
+   desc = IORES_DESC_RESERVED;
+   break;
+
case EFI_RUNTIME_SERVICES_CODE:
case EFI_RUNTIME_SERVICES_DATA:
case EFI_ACPI_RECLAIM_MEMORY:
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 50895c2f937d..57fafdafb860 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1048,10 +1048,10 @@ static unsigned long __init 
e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
case E820_TYPE_PMEM:return IORES_DESC_PERSISTENT_MEMORY;
case E820_TYPE_PRAM:return 
IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+   case E820_TYPE_RESERVED:return IORES_DESC_RESERVED;
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE:/* Fall-through: */
-   case E820_TYPE_RESERVED:/* Fall-through: */
default:return IORES_DESC_NONE;
}
 }
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 5378d10f1d31..fea2ef99415d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -83,7 +83,18 @@ static bool __ioremap_check_ram(struct resource *res)
 
 static int __ioremap_check_desc_other(struct resource *res)
 {
-   return (res->desc != IORES_DESC_NONE);
+   /*
+* But now, the 'E820_TYPE_RESERVED' type is converted to the new
+* descriptor 'IORES_DESC_RESERVED' instead of 'IORES_DESC_NONE',
+* it has been changed. And the value of 'mem_flags.desc_other'
+* is equal to 'true' if we don't strengthen the condition in this
+* function, that is wrong. Because originally it is equal to
+* 'false' for the same reserved type.
+*
+* So, that would be nice to keep it the same as before.
+*/
+   return ((res->desc != IORES_DESC_NONE) &&
+   (res->desc != IORES_DESC_RESERVED));
 }
 
 static int __ioremap_res_check(struct resource *res, void *arg)
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..6ed59de48bd5 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -133,6 +133,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
+   IORES_DESC_RESERVED = 8,
 };
 
 /* helpers to define resources */
diff --git a/kernel/resource.c b/kernel/resource.c
index b0fbf685c77a..f34a632c4169 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -994,7 +994,7 @@ __reserve_region_with_split(struct resource *root, 
resource_size_t start,
res->start = start;
res->end = end;
res->flags = type | IORESOURCE_BUSY;
-   res->desc = IORES_DESC_NONE;
+   res->desc = IORES_DESC_RESERVED;
 
while (1) {
 
@@ -1029,7 +1029,7 @@ __reserve_region_with_split(struct resource *root, 
resource_size_t start,
 

[PATCH 0/2 v8] add reserved e820 ranges to the kdump kernel e820 table

2018-11-29 Thread Lianbo Jiang
This patchset did two things:
a). add a new I/O resource descriptor 'IORES_DESC_RESERVED'

When doing kexec_file_load, the first kernel needs to pass the e820
reserved ranges to the second kernel. But kernel can not exactly
match the e820 reserved ranges when walking through the iomem resources
with the descriptor 'IORES_DESC_NONE', because several e820 types(
e.g. E820_TYPE_RESERVED_KERN/E820_TYPE_RAM/E820_TYPE_UNUSABLE/E820
_TYPE_RESERVED) are converted to the descriptor 'IORES_DESC_NONE'. It
may pass these four types to the kdump kernel, which is not the desired result.

So, this patch adds a new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces. It is helpful to exactly
match the reserved resource ranges when walking through iomem resources.

In addition, since the new descriptor 'IORES_DESC_RESERVED' is introduced,
these code originally related to the descriptor 'IORES_DESC_NONE' need to
be updated. Otherwise, it will be easily confused and also cause some
errors. Because the 'E820_TYPE_RESERVED' type is converted to the new
descriptor 'IORES_DESC_RESERVED' instead of 'IORES_DESC_NONE', it has been
changed.

b). add the e820 reserved ranges to kdump kernel e820 table

At present, when use the kexec_file_load syscall to load the kernel image
and initramfs(for example: kexec -s -p xxx), kernel does not pass the e820
reserved ranges to the second kernel, which might cause two problems:

The first one is the MMCONFIG issue. The basic problem is that this device
is in PCI segment 1 and the kernel PCI probing can not find it without all
the e820 I/O reservations being present in the e820 table. And the kdump
kernel does not have those reservations because the kexec command does not
pass the I/O reservation via the "memmap=xxx" command line option. (This
problem does not show up for other vendors, as SGI is apparently the
only one with devices in PCI segment 1. The probe actually fails for
everyone, but devices in segment 0 are then found by
some legacy lookup method.)
reserved regions to the kdump kernel.

MMCONFIG(aka ECAM) space is described in the ACPI MCFG table. If you don't
have ECAM: (a) PCI devices won't work at all on non-x86 systems that use
only ECAM for config access, (b) you won't be albe to access devices on
non-0 segments, (c) you won't be able to access extended config space(
address 0x100-0x), which means none of the Extended Capabilities will
be available(AER, ACS, ATS, etc). [Bjorn's comment]

The second issue is that the SME kdump kernel doesn't work without the
e820 reserved ranges. When SME is active in kdump kernel, actually, those
reserved regions are still decrypted, but because those reserved ranges are
not present at all in kdump kernel e820 table, those reserved regions are
considered as encrypted, it goes wrong.

The e820 reserved range is useful in kdump kernel, so it is necessary to
pass the e820 reserved ranges to kdump kernel.

Changes since v1:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Modified the invalid SOB chain issue.

Changes since v3:
1. Dropped [PATCH 1/3 v3] resource: fix an error which walks through iomem
   resources. Please refer to this commit <010a93bf97c7> "resource: Fix
   find_next_iomem_res() iteration issue"

Changes since v4:
1. Improve the patch log, and add kernel log.

Changes since v5:
1. Rewrite these patches log.

Changes since v6:
1. Modify the [PATCH 1/2], and add the new I/O resource descriptor
   'IORES_DESC_RESERVED' for the iomem resources search interfaces,
   and also updates these codes relates to 'IORES_DESC_NONE'.
2. Modify the [PATCH 2/2], and walk through io resource based on the
   new descriptor 'IORES_DESC_RESERVED'.
3. Update patch log.

Changes since v7:
1. Improve patch log.
2. Improve this function __ioremap_check_desc_other().
3. Modify code comment in the __ioremap_check_desc_other()


Lianbo Jiang (2):
  resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/ia64/kernel/efi.c  |  4 
 arch/x86/kernel/crash.c |  6 ++
 arch/x86/kernel/e820.c  |  2 +-
 arch/x86/mm/ioremap.c   | 13 -
 include/linux/ioport.h  |  1 +
 kernel/resource.c   |  6 +++---
 6 files changed, 27 insertions(+), 5 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 v2] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-01 Thread Lianbo Jiang
This patchset did two things:
a. add a new document for vmcoreinfo

This document lists some variables that export to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo, and it also normalizes the
exported variables as a standard ABI between kernel and user-space.

b. export the value of sme mask to vmcoreinfo

For AMD machine with SME feature, makedumpfile tools need to know whether
the crash kernel was encrypted or not. If SME is enabled in the first
kernel, the crash kernel's page table(pgd/pud/pmd/pte) contains the
memory encryption mask, so need to remove the sme mask to obtain the true
physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 400 +
 arch/x86/kernel/machine_kexec_64.c |  14 +
 2 files changed, 414 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v2] kdump: add the vmcoreinfo documentation

2018-12-01 Thread Lianbo Jiang
This document lists some variables that export to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo, and it also normalizes the
exported variables as a standard ABI between kernel and user-space.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 400 +
 1 file changed, 400 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt 
b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..c6759be14af7
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,400 @@
+
+   Documentation for Vmcoreinfo
+
+
+===
+What is the vmcoreinfo?
+===
+The vmcoreinfo contains the first kernel's various information, for
+example, structure size, page size, symbol values and field offset,
+etc. These data are encapsulated into an elf format, and these data
+will also help user-space tools(e.g. makedumpfile, crash) analyze the
+first kernel's memory usage.
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
The OS release number.
+
+PAGE_SIZE
+=
+The size of a page. It is usually 4k bytes.
+
+init_uts_ns
+===
+This is the UTS namespace, which is used to isolate two specific elements
+of the system that relate to the uname system call. The UTS namespace is
+named after the data structure used to store information returned by the
+uname system call.
+
+node_online_map
+===
It is a macro definition; actually it is an array node_states[N_ONLINE],
and it represents the set of online nodes in a system, one bit position
per node number.
+
+swapper_pg_dir
+=
It is always an array; it generally stands for the PGD of the kernel.
When the MMU is enabled in the kernel config, 'swapper_pg_dir' is valid.
+
+_stext
+==
It is an assembler symbol that marks the beginning of the text section.
In general, '_stext' indicates the kernel start address.
+
+vmap_area_list
+==
+It stores the virtual area list, makedumpfile can get the vmalloc start
+value according to this variable.
+
+mem_map
+===
+Physical addresses are translated to struct pages by treating them as an
+index into the mem_map array. Shifting a physical address PAGE_SHIFT bits
+to the right will treat it as a PFN from physical address 0, which is also
+an index within the mem_map array.
+
+In a word, it can map the address to struct page.
+
+contig_page_data
+
+Makedumpfile can get the pglist_data structure according to this symbol
+'contig_page_data'. The pglist_data structure is used to describe the
+memory layout.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+==
+Export the address of 'mem_section' array, and it's length, structure size,
+and the 'section_mem_map' offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them will help to translate
+the address.
+
+page
+
+The size of a 'page' structure.
+
+pglist_data
+===
+The size of a 'pglist_data' structure.
+
+zone
+
+The size of a 'zone' structure.
+
+free_area
+=
+The size of a 'free_area' structure.
+
+list_head
+=
+The size of a 'list_head' structure.
+
+nodemask_t
+==
+The size of a 'nodemask_t' type.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+   compound_order|compound_head)
+===
The page structure is a familiar concept to most Linux developers; there
is no need to explain it in detail. To know more information, please refer to the
+definition of the page struct(include/linux/mm_types.h).
+
+(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_
+  spanned_pages|node_id)
+===
+On NUMA machines, each NUMA node would have a pg_data_t to describe
+it's memory layout. On UMA machines there is a single pglist_data which
+describes the whole memory.
+
The pglist_data structure contains these variables; here we export their
+offset in the pglist_data structure, which is defined in this file
+"include/linux/mmzone.h".
+
+(zone, free_area|vm_stat|spanned_pages)
+===
+The offset of these variables in the structure zone.
+
+Each node is divided up into a number of blocks called zones which
+represent ranges within memory. A zone is described by a structure zone.
+Each zone type is suitable for a different type of usage.
+
+

[PATCH 2/2 v2] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-01 Thread Lianbo Jiang
For AMD machine with SME feature, makedumpfile tools need to know
whether the crash kernel was encrypted or not. If SME is enabled
in the first kernel, the crash kernel's page table(pgd/pud/pmd/pte)
contains the memory encryption mask, so need to remove the sme mask
to obtain the true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..1860fe24117d 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,24 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   /*
+* Currently, the local variable 'sme_mask' stores the value of
+* sme_me_mask(bit 47), and also write the value of sme_mask to
+* the vmcoreinfo.
+* If need, the bit(sme_mask) might be redefined in the future,
+* but the 'bit63' will be reserved.
+* For example:
+* [ misc  ][ enc bit  ][ other misc SME info   ]
+* ____1000______..._
+* 63   59   55   51   47   43   39   35   31   27   ... 3
+*/
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 RESEND v7] add reserved e820 ranges to the kdump kernel e820 table

2018-11-23 Thread Lianbo Jiang
These patches add the new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces, and in order to make it still
work after the new descriptor is added, these codes originally related
to 'IORES_DESC_NONE' have been updated.

In addition, for the MMCONFIG issue and the SME kdump issue, it is
necessary to pass the e820 reserved ranges to kdump kernel.

Changes since v1:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Modified the invalid SOB chain issue.

Changes since v3:
1. Dropped [PATCH 1/3 v3] resource: fix an error which walks through iomem
   resources. Please refer to this commit <010a93bf97c7> "resource: Fix
   find_next_iomem_res() iteration issue"

Changes since v4:
1. Improve the patch log, and add kernel log.

Changes since v5:
1. Rewrite these patches log.

Changes since v6:
1. Modify the [PATCH 1/2], and add the new I/O resource descriptor
   'IORES_DESC_RESERVED' for the iomem resources search interfaces,
   and also updates these codes relates to 'IORES_DESC_NONE'.
2. Modify the [PATCH 2/2], and walk through io resource based on the
   new descriptor 'IORES_DESC_RESERVED'.
3. Update patch log.

Lianbo Jiang (2):
  resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/ia64/kernel/efi.c  | 4 
 arch/x86/kernel/crash.c | 6 ++
 arch/x86/kernel/e820.c  | 2 +-
 arch/x86/mm/ioremap.c   | 9 -
 include/linux/ioport.h  | 1 +
 kernel/resource.c   | 6 +++---
 6 files changed, 23 insertions(+), 5 deletions(-)

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 RESEND v7] resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'

2018-11-23 Thread Lianbo Jiang
The upstream kernel can not accurately add the e820 reserved type to
kdump kernel e820 table.

Kdump uses walk_iomem_res_desc() to iterate io resources, then adds
the matched resource ranges to the e820 table for kdump kernel. But,
when convert the e820 type to the iores descriptor, several e820
types are converted to 'IORES_DESC_NONE' in this function e820_type
_to_iores_desc(). So the walk_iomem_res_desc() will get the redundant
types(such as E820_TYPE_RAM/E820_TYPE_UNUSABLE/E820_TYPE_RESERVED_KERN) when
walk through io resources with the descriptor 'IORES_DESC_NONE'.

This patch adds the new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces. It is helpful to exactly
match the reserved resource ranges when walking through iomem resources.

Furthermore, in order to make it still work after the new descriptor
is added, these codes originally related to 'IORES_DESC_NONE' have
been updated.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/ia64/kernel/efi.c | 4 
 arch/x86/kernel/e820.c | 2 +-
 arch/x86/mm/ioremap.c  | 9 -
 include/linux/ioport.h | 1 +
 kernel/resource.c  | 6 +++---
 5 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 8f106638913c..1841e9b4db30 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1231,6 +1231,10 @@ efi_initialize_iomem_resources(struct resource 
*code_resource,
break;
 
case EFI_RESERVED_TYPE:
+   name = "reserved";
+   desc = IORES_DESC_RESERVED;
+   break;
+
case EFI_RUNTIME_SERVICES_CODE:
case EFI_RUNTIME_SERVICES_DATA:
case EFI_ACPI_RECLAIM_MEMORY:
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 50895c2f937d..57fafdafb860 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1048,10 +1048,10 @@ static unsigned long __init 
e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
case E820_TYPE_PMEM:return IORES_DESC_PERSISTENT_MEMORY;
case E820_TYPE_PRAM:return 
IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+   case E820_TYPE_RESERVED:return IORES_DESC_RESERVED;
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE:/* Fall-through: */
-   case E820_TYPE_RESERVED:/* Fall-through: */
default:return IORES_DESC_NONE;
}
 }
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 5378d10f1d31..91b6112e7489 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -83,7 +83,14 @@ static bool __ioremap_check_ram(struct resource *res)
 
 static int __ioremap_check_desc_other(struct resource *res)
 {
-   return (res->desc != IORES_DESC_NONE);
+   /*
+* The E820_TYPE_RESERVED was converted to the IORES_DESC_NONE
+* before the new IORES_DESC_RESERVED is added, so it contained
+* the e820 reserved type. In order to make it still work for
+* SEV, here keep it the same as before.
+*/
+   return ((res->desc != IORES_DESC_NONE) ||
+   (res->desc != IORES_DESC_RESERVED));
 }
 
 static int __ioremap_res_check(struct resource *res, void *arg)
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..6ed59de48bd5 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -133,6 +133,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
+   IORES_DESC_RESERVED = 8,
 };
 
 /* helpers to define resources */
diff --git a/kernel/resource.c b/kernel/resource.c
index b0fbf685c77a..f34a632c4169 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -994,7 +994,7 @@ __reserve_region_with_split(struct resource *root, 
resource_size_t start,
res->start = start;
res->end = end;
res->flags = type | IORESOURCE_BUSY;
-   res->desc = IORES_DESC_NONE;
+   res->desc = IORES_DESC_RESERVED;
 
while (1) {
 
@@ -1029,7 +1029,7 @@ __reserve_region_with_split(struct resource *root, 
resource_size_t start,
next_res->start = conflict->end + 1;
next_res->end = end;
next_res->flags = type | IORESOURCE_BUSY;
-   next_res->desc = IORES_DESC_NONE;
+   next_res->desc = IORES_DESC_RESERVED;
}
} else {
res->start = con

[PATCH 2/2 RESEND v7] x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

2018-11-23 Thread Lianbo Jiang
At present, when use the kexec_file_load syscall to load the kernel image
and initramfs(for example: kexec -s -p xxx), the upstream kernel does not
pass the e820 reserved ranges to the second kernel, which might cause two
problems:

The first one is the MMCONFIG issue. The basic problem is that this device
is in PCI segment 1 and the kernel PCI probing can not find it without all
the e820 I/O reservations being present in the e820 table. And the kdump
kernel does not have those reservations because the kexec command does not
pass the I/O reservation via the "memmap=xxx" command line option. (This
problem does not show up for other vendors, as SGI is apparently the
only one with devices in PCI segment 1. The probe actually fails for
everyone, but devices in segment 0 are then found by
some legacy lookup method.) The workaround for this is to pass the I/O
reserved regions to the kdump kernel.

MMCONFIG(aka ECAM) space is described in the ACPI MCFG table. If you don't
have ECAM: (a) PCI devices won't work at all on non-x86 systems that use
only ECAM for config access, (b) you won't be able to access devices on
non-0 segments, (c) you won't be able to access extended config space(
address 0x100-0xfff), which means none of the Extended Capabilities will
be available(AER, ACS, ATS, etc). [Bjorn's comment]

The second issue is that the SME kdump kernel doesn't work without the
e820 reserved ranges. When SME is active in kdump kernel, actually, those
reserved regions are still decrypted, but because those reserved ranges are
not present at all in kdump kernel e820 table, those reserved regions are
considered as encrypted, it goes wrong.

The e820 reserved range is useful in kdump kernel, so it is necessary to
pass the e820 reserved ranges to kdump kernel.

Suggested-by: Dave Young 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/crash.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f631a3f15587..5354a84f1684 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -380,6 +380,12 @@ int crash_setup_memmap_entries(struct kimage *image, 
struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, ,
memmap_entry_callback);
 
+   /* Add e820 reserved ranges */
+   cmd.type = E820_TYPE_RESERVED;
+   flags = IORESOURCE_MEM;
+   walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, ,
+  memmap_entry_callback);
+
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v6] kdump: add the vmcoreinfo documentation

2019-01-10 Thread Lianbo Jiang
This document lists some variables that export to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 500 +
 1 file changed, 500 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt 
b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..8e444586b87b
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,500 @@
+
+   VMCOREINFO
+
+
+===
+What is the VMCOREINFO?
+===
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built.
+
+PAGE_SIZE
+-
+
+The size of a page. It is the smallest unit of data for memory
+management in kernel. It is usually 4096 bytes and a page is aligned
+on 4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+---
+
+This is the UTS namespace, which is used to isolate two specific
+elements of the system that relate to the uname(2) system call. The UTS
+namespace is named after the data structure used to store information
+returned by the uname(2) system call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---
+
+An array node_states[N_ONLINE] which represents the set of online node
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+-
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+--
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--
+
+Stores the virtual area list. makedumpfile can get the vmalloc start
+value from this variable. This value is necessary for vmalloc translation.
+
+mem_map
+---
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+
+
+Makedumpfile can get the pglist_data structure from this symbol, which
+is used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them are used to translate an
+address.
+
+page
+
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute the contiguous memory.
+
+pglist_data
+---
+
+The size of a pglist_data structure. This value will be used to check
+if the pglist_data structure is valid. It is also used for checking the
+memory type.
+
+zone
+
+
+The size of a zone structure. This value is often used to check if the
+zone structure has been found. It is also used for excluding free pages.
+
+free_area
+-
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful for excluding free pages.
+
+list_head
+-
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+--
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+   compound_order|compound_head)
+---
+
+User-space tools can compute their values based on the offset of these
+variables. The variables are helpful to exclude unnecessary pages.
+
+(pglist_data, node_zones|nr_zones|node_mem_map

[PATCH 0/2 v6] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2019-01-10 Thread Lianbo Jiang
This patchset did two things:
a. add a new document for vmcoreinfo

This document lists some variables that export to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo.

b. export the value of sme mask to vmcoreinfo

For AMD machine with SME feature, makedumpfile tools need to know whether
the crashed kernel was encrypted or not. If SME is enabled in the first
kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains the
memory encryption mask, so makedumpfile needs to remove the sme mask to
obtain the true physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Changes since v2:
1. Improve the vmcoreinfo document, add more descriptions for these
variables exported.
2. Fix spelling errors in the document.

Changes since v3:
1. Still improve the vmcoreinfo document, and make it become more
clear and easy to read.
2. Move sme_mask comments in the code to the vmcoreinfo document.
3. Improve patch log.

Changes since v4:
1. Remove a command that dumping the VMCOREINFO contents from this
   document.
2. Merge the 'PG_buddy' and 'PG_offline' into the PG_* flag in this
   document.
3. Correct some of the mistakes in this document.

Changes since v5:
1. Improve patch log.

Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 500 +
 arch/x86/kernel/machine_kexec_64.c |   3 +
 2 files changed, 503 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2 v6] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2019-01-10 Thread Lianbo Jiang
For AMD machine with SME feature, makedumpfile tools need to know
whether the crashed kernel was encrypted or not. If SME is enabled
in the first kernel, the crashed kernel's page table(pgd/pud/pmd/pte)
contains the memory encryption mask, so makedumpfile needs to remove
the sme mask to obtain the true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..bc4108096b18 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,13 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2 v4] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-19 Thread Lianbo Jiang
This patchset did two things:
a. add a new document for vmcoreinfo

This document lists some variables that are exported to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo, and it also normalizes the
exported variable as a convention between kernel and user-space.

b. export the value of sme mask to vmcoreinfo

For AMD machine with SME feature, makedumpfile tools need to know whether
the crashed kernel was encrypted or not. If SME is enabled in the first
kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains the
memory encryption mask, so makedumpfile needs to remove the sme mask to
obtain the true physical address.

Changes since v1:
1. No need to export a kernel-internal mask to userspace, so copy the
value of sme_me_mask to a local variable 'sme_mask' and write the value
of sme_mask to vmcoreinfo.
2. Add comment for the code.
3. Improve the patch log.
4. Add the vmcoreinfo documentation.

Changes since v2:
1. Improve the vmcoreinfo document; add more descriptions for the
exported variables.
2. Fix spelling errors in the document.

Changes since v3:
1. Still improve the vmcoreinfo document, and make it become more
clear and easy to read.
2. Move sme_mask comments in the code to the vmcoreinfo document.
3. Improve patch log.

Lianbo Jiang (2):
  kdump: add the vmcoreinfo documentation
  kdump,vmcoreinfo: Export the value of sme mask to vmcoreinfo

 Documentation/kdump/vmcoreinfo.txt | 513 +
 arch/x86/kernel/machine_kexec_64.c |   3 +
 2 files changed, 516 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2 v4] kdump: add the vmcoreinfo documentation

2018-12-19 Thread Lianbo Jiang
This document lists some variables that are exported to vmcoreinfo, and briefly
describes what these variables indicate. It should be instructive for
many people who do not know the vmcoreinfo, and it also normalizes the
exported variables as a convention between kernel and user-space.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 Documentation/kdump/vmcoreinfo.txt | 513 +
 1 file changed, 513 insertions(+)
 create mode 100644 Documentation/kdump/vmcoreinfo.txt

diff --git a/Documentation/kdump/vmcoreinfo.txt 
b/Documentation/kdump/vmcoreinfo.txt
new file mode 100644
index ..1f1f69143600
--- /dev/null
+++ b/Documentation/kdump/vmcoreinfo.txt
@@ -0,0 +1,513 @@
+
+   VMCOREINFO
+
+
+===
+What is the VMCOREINFO?
+===
+
+VMCOREINFO is a special ELF note section. It contains various
+information from the kernel like structure size, page size, symbol
+values, field offsets, etc. These data are packed into an ELF note
+section and used by user-space tools like crash and makedumpfile to
+analyze a kernel's memory layout.
+
+To dump the VMCOREINFO contents, one can do:
+
+# makedumpfile -g VMCOREINFO -x vmlinux
+
+
+Common variables
+
+
+init_uts_ns.name.release
+
+
+The version of the Linux kernel. Used to find the corresponding source
+code from which the kernel has been built.
+
+PAGE_SIZE
+-
+
+The size of a page. It is the smallest unit of data for memory
+management in kernel. It is usually 4096 bytes and a page is aligned on
+4096 bytes. Used for computing page addresses.
+
+init_uts_ns
+---
+
+This is the UTS namespace, which is used to isolate two specific
+elements of the system that relate to the uname(2) system call. The UTS
+namespace is named after the data structure used to store information
+returned by the uname(2) system call.
+
+User-space tools can get the kernel name, host name, kernel release
+number, kernel version, architecture name and OS type from it.
+
+node_online_map
+---
+
+An array node_states[N_ONLINE] which represents the set of online nodes
+in a system, one bit position per node number. Used to keep track of
+which nodes are in the system and online.
+
+swapper_pg_dir
+-
+
+The global page directory pointer of the kernel. Used to translate
+virtual to physical addresses.
+
+_stext
+--
+
+Defines the beginning of the text section. In general, _stext indicates
+the kernel start address. Used to convert a virtual address from the
+direct kernel map to a physical address.
+
+vmap_area_list
+--
+
+Stores the virtual area list. makedumpfile can get the vmalloc start
+value from this variable. This value is necessary for vmalloc translation.
+
+mem_map
+---
+
+Physical addresses are translated to struct pages by treating them as
+an index into the mem_map array. Right-shifting a physical address
+PAGE_SHIFT bits converts it into a page frame number which is an index
+into that mem_map array.
+
+Used to map an address to the corresponding struct page.
+
+contig_page_data
+
+
+Makedumpfile can get the pglist_data structure from this symbol, which
+is used to describe the memory layout.
+
+User-space tools use this to exclude free pages when dumping memory.
+
+mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map)
+--
+
+The address of the mem_section array, its length, structure size, and
+the section_mem_map offset.
+
+It exists in the sparse memory mapping model, and it is also somewhat
+similar to the mem_map variable, both of them are used to translate an
+address.
+
+page
+
+
+The size of a page structure. struct page is an important data structure
+and it is widely used to compute the contiguous memory.
+
+pglist_data
+---
+
+The size of a pglist_data structure. This value will be used to check
+if the pglist_data structure is valid. It is also used for checking the
+memory type.
+
+zone
+
+
+The size of a zone structure. This value is often used to check if the
+zone structure has been found. It is also used for excluding free pages.
+
+free_area
+-
+
+The size of a free_area structure. It indicates whether the free_area
+structure is valid or not. Useful for excluding free pages.
+
+list_head
+-
+
+The size of a list_head structure. Used when iterating lists in a
+post-mortem analysis session.
+
+nodemask_t
+--
+
+The size of a nodemask_t type. Used to compute the number of online
+nodes.
+
+(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|
+   compound_order|compound_head)
+---
+
+User-space tools

[PATCH 2/2 v4] kdump, vmcoreinfo: Export the value of sme mask to vmcoreinfo

2018-12-19 Thread Lianbo Jiang
For AMD machine with SME feature, makedumpfile tools need to know
whether the crashed kernel was encrypted or not. If SME is enabled
in the first kernel, the crashed kernel's page table(pgd/pud/pmd/pte)
contains the memory encryption mask, so makedumpfile needs to remove
the sme mask to obtain the true physical address.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/machine_kexec_64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a7..bc4108096b18 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,10 +352,13 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+   u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
pgtable_l5_enabled());
+   VMCOREINFO_NUMBER(sme_mask);
 
 #ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL(node_data);
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2] Remove the memory encryption mask to obtain the true physical address

2019-01-27 Thread Lianbo Jiang
For AMD machine with SME feature, if SME is enabled in the first
kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains
the memory encryption mask, so makedumpfile needs to remove the
memory encryption mask to obtain the true physical address.

Signed-off-by: Lianbo Jiang 
---
Changes since v1:
1. Merge them into a patch.
2. The sme_mask is not an enum number, remove it.
3. Sanity check whether the sme_mask is in vmcoreinfo.
4. Deal with the huge pages case.
5. Cover the 5-level path.

 arch/x86_64.c  | 30 +-
 makedumpfile.c |  4 
 makedumpfile.h |  1 +
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/x86_64.c b/arch/x86_64.c
index 537fb78..7b3ed10 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -291,6 +291,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long pagetable)
unsigned long page_dir, pgd, pud_paddr, pud_pte, pmd_paddr, pmd_pte;
unsigned long pte_paddr, pte;
unsigned long p4d_paddr, p4d_pte;
+   unsigned long sme_me_mask = ~0UL;
 
/*
 * Get PGD.
@@ -302,6 +303,9 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long pagetable)
return NOT_PADDR;
}
 
+   if (NUMBER(sme_mask) != NOT_FOUND_NUMBER)
+   sme_me_mask = ~(NUMBER(sme_mask));
+
if (check_5level_paging()) {
page_dir += pgd5_index(vaddr) * sizeof(unsigned long);
if (!readmem(PADDR, page_dir, , sizeof pgd)) {
@@ -309,7 +313,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long pagetable)
return NOT_PADDR;
}
if (info->vaddr_for_vtop == vaddr)
-   MSG("  PGD : %16lx => %16lx\n", page_dir, pgd);
+   MSG("  PGD : %16lx => %16lx\n", page_dir, (pgd & 
sme_me_mask));
 
if (!(pgd & _PAGE_PRESENT)) {
ERRMSG("Can't get a valid pgd.\n");
@@ -318,20 +322,20 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
pagetable)
/*
 * Get P4D.
 */
-   p4d_paddr  = pgd & ENTRY_MASK;
+   p4d_paddr  = pgd & ENTRY_MASK & sme_me_mask;
p4d_paddr += p4d_index(vaddr) * sizeof(unsigned long);
if (!readmem(PADDR, p4d_paddr, _pte, sizeof p4d_pte)) {
ERRMSG("Can't get p4d_pte (p4d_paddr:%lx).\n", 
p4d_paddr);
return NOT_PADDR;
}
if (info->vaddr_for_vtop == vaddr)
-   MSG("  P4D : %16lx => %16lx\n", p4d_paddr, p4d_pte);
+   MSG("  P4D : %16lx => %16lx\n", p4d_paddr, (p4d_pte & 
sme_me_mask));
 
if (!(p4d_pte & _PAGE_PRESENT)) {
ERRMSG("Can't get a valid p4d_pte.\n");
return NOT_PADDR;
}
-   pud_paddr  = p4d_pte & ENTRY_MASK;
+   pud_paddr  = p4d_pte & ENTRY_MASK & sme_me_mask;
}else {
page_dir += pgd_index(vaddr) * sizeof(unsigned long);
if (!readmem(PADDR, page_dir, , sizeof pgd)) {
@@ -339,13 +343,13 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
pagetable)
return NOT_PADDR;
}
if (info->vaddr_for_vtop == vaddr)
-   MSG("  PGD : %16lx => %16lx\n", page_dir, pgd);
+   MSG("  PGD : %16lx => %16lx\n", page_dir, (pgd & 
sme_me_mask));
 
if (!(pgd & _PAGE_PRESENT)) {
ERRMSG("Can't get a valid pgd.\n");
return NOT_PADDR;
}
-   pud_paddr  = pgd & ENTRY_MASK;
+   pud_paddr  = pgd & ENTRY_MASK & sme_me_mask;
}
 
/*
@@ -357,47 +361,47 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
pagetable)
return NOT_PADDR;
}
if (info->vaddr_for_vtop == vaddr)
-   MSG("  PUD : %16lx => %16lx\n", pud_paddr, pud_pte);
+   MSG("  PUD : %16lx => %16lx\n", pud_paddr, (pud_pte & 
sme_me_mask));
 
if (!(pud_pte & _PAGE_PRESENT)) {
ERRMSG("Can't get a valid pud_pte.\n");
return NOT_PADDR;
}
if (pud_pte & _PAGE_PSE)/* 1GB pages */
-   return (pud_pte & ENTRY_MASK & PUD_MASK) +
+   return (pud_pte & ENTRY_MASK & PUD_MASK & sme_me_mask) +
(vaddr & ~PUD_MASK);
 
/*
 * Get PMD.
 */
-   pmd_paddr  = pud_pte & ENTRY_MASK;
+   pmd_paddr  = pud_pte & ENTRY_MASK & sme_me_mask;
pmd_paddr += pmd_index(vaddr) * sizeof(unsigned long

[PATCH 2/3 v9] resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'

2019-03-21 Thread Lianbo Jiang
When doing kexec_file_load, the first kernel needs to pass the e820
reserved ranges to the second kernel. But kernel can not exactly
match the e820 reserved ranges when walking through the iomem resources
with the descriptor 'IORES_DESC_NONE', because several e820 types(
e.g. E820_TYPE_RESERVED_KERN/E820_TYPE_RAM/E820_TYPE_UNUSABLE/E820
_TYPE_RESERVED) are converted to the descriptor 'IORES_DESC_NONE'. It
may pass these four types to the kdump kernel, which is not the desired result.

So, this patch adds a new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces. It is helpful to exactly
match the reserved resource ranges when walking through iomem resources.

In addition, since the new descriptor 'IORES_DESC_RESERVED' is introduced,
these code originally related to the descriptor 'IORES_DESC_NONE' need to
be updated. Otherwise, it will be easily confused and also cause some
errors. Because the 'E820_TYPE_RESERVED' type is converted to the new
descriptor 'IORES_DESC_RESERVED' instead of 'IORES_DESC_NONE', it has been
changed.

Suggested-by: Borislav Petkov 
Signed-off-by: Lianbo Jiang 
---
 arch/x86/kernel/e820.c | 2 +-
 include/linux/ioport.h | 1 +
 kernel/resource.c  | 6 +++---
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 2879e234e193..16fcde196243 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1050,10 +1050,10 @@ static unsigned long __init 
e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
case E820_TYPE_PMEM:return IORES_DESC_PERSISTENT_MEMORY;
case E820_TYPE_PRAM:return 
IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+   case E820_TYPE_RESERVED:return IORES_DESC_RESERVED;
case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE:/* Fall-through: */
-   case E820_TYPE_RESERVED:/* Fall-through: */
default:return IORES_DESC_NONE;
}
 }
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..6ed59de48bd5 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -133,6 +133,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
+   IORES_DESC_RESERVED = 8,
 };
 
 /* helpers to define resources */
diff --git a/kernel/resource.c b/kernel/resource.c
index e81b17b53fa5..ee7348761858 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -990,7 +990,7 @@ __reserve_region_with_split(struct resource *root, 
resource_size_t start,
res->start = start;
res->end = end;
res->flags = type | IORESOURCE_BUSY;
-   res->desc = IORES_DESC_NONE;
+   res->desc = IORES_DESC_RESERVED;
 
while (1) {
 
@@ -1025,7 +1025,7 @@ __reserve_region_with_split(struct resource *root, 
resource_size_t start,
next_res->start = conflict->end + 1;
next_res->end = end;
next_res->flags = type | IORESOURCE_BUSY;
-   next_res->desc = IORES_DESC_NONE;
+   next_res->desc = IORES_DESC_RESERVED;
}
} else {
res->start = conflict->end + 1;
@@ -1488,7 +1488,7 @@ static int __init reserve_setup(char *str)
res->start = io_start;
res->end = io_start + io_num - 1;
res->flags |= IORESOURCE_BUSY;
-   res->desc = IORES_DESC_NONE;
+   res->desc = IORES_DESC_RESERVED;
res->child = NULL;
if (request_resource(parent, res) == 0)
reserved = x+1;
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/3 v9] add reserved e820 ranges to the kdump kernel e820 table

2019-03-21 Thread Lianbo Jiang
This patchset did three things:
a). Change the examination condition to avoid confusion

Following the commit <0e4c12b45aa8> ("x86/mm, resource: Use
PAGE_KERNEL protection for ioremap of memory pages"), here
it is really checking for the 'IORES_DESC_ACPI_*' values.
Therefore, it is necessary to change the examination condition
to avoid confusion.

b). add a new I/O resource descriptor 'IORES_DESC_RESERVED'

When doing kexec_file_load, the first kernel needs to pass the e820
reserved ranges to the second kernel. But kernel can not exactly
match the e820 reserved ranges when walking through the iomem resources
with the descriptor 'IORES_DESC_NONE', because several e820 types(
e.g. E820_TYPE_RESERVED_KERN/E820_TYPE_RAM/E820_TYPE_UNUSABLE/E820
_TYPE_RESERVED) are converted to the descriptor 'IORES_DESC_NONE'. It
may pass these four types to the kdump kernel, which is not the desired result.

So, this patch adds a new I/O resource descriptor 'IORES_DESC_RESERVED'
for the iomem resources search interfaces. It is helpful to exactly
match the reserved resource ranges when walking through iomem resources.

In addition, since the new descriptor 'IORES_DESC_RESERVED' is introduced,
these code originally related to the descriptor 'IORES_DESC_NONE' need to
be updated. Otherwise, it will be easily confused and also cause some
errors. Because the 'E820_TYPE_RESERVED' type is converted to the new
descriptor 'IORES_DESC_RESERVED' instead of 'IORES_DESC_NONE', it has been
changed.

c). add the e820 reserved ranges to kdump kernel e820 table

At present, when use the kexec_file_load syscall to load the kernel image
and initramfs(for example: kexec -s -p xxx), kernel does not pass the e820
reserved ranges to the second kernel, which might cause two problems:

The first one is the MMCONFIG issue. The basic problem is that this device
is in PCI segment 1 and the kernel PCI probing can not find it without all
the e820 I/O reservations being present in the e820 table. And the kdump
kernel does not have those reservations because the kexec command does not
pass the I/O reservation via the "memmap=xxx" command line option. (This
problem does not show up for other vendors, as SGI is apparently the
actually fails for everyone, but devices in segment 0 are then found by
some legacy lookup method.) The workaround for this is to pass the I/O
reserved regions to the kdump kernel.

MMCONFIG(aka ECAM) space is described in the ACPI MCFG table. If you don't
have ECAM: (a) PCI devices won't work at all on non-x86 systems that use
only ECAM for config access, (b) you won't be able to access devices on
non-0 segments, (c) you won't be able to access extended config space(
address 0x100-0xfff), which means none of the Extended Capabilities will
be available(AER, ACS, ATS, etc). [Bjorn's comment]

The second issue is that the SME kdump kernel doesn't work without the
e820 reserved ranges. When SME is active in kdump kernel, actually, those
reserved regions are still decrypted, but because those reserved ranges are
not present at all in kdump kernel e820 table, those reserved regions are
considered as encrypted, it goes wrong.

The e820 reserved range is useful in kdump kernel, so it is necessary to
pass the e820 reserved ranges to kdump kernel.

Changes since v1:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.

Changes since v2:
1. Modified the value of flags to "0", when walking through the whole
tree for e820 reserved ranges.
2. Modified the invalid SOB chain issue.

Changes since v3:
1. Dropped [PATCH 1/3 v3] resource: fix an error which walks through iomem
   resources. Please refer to this commit <010a93bf97c7> "resource: Fix
   find_next_iomem_res() iteration issue"

Changes since v4:
1. Improve the patch log, and add kernel log.

Changes since v5:
1. Rewrite these patches log.

Changes since v6:
1. Modify the [PATCH 1/2], and add the new I/O resource descriptor
   'IORES_DESC_RESERVED' for the iomem resources search interfaces,
   and also updates these codes relates to 'IORES_DESC_NONE'.
2. Modify the [PATCH 2/2], and walk through io resource based on the
   new descriptor 'IORES_DESC_RESERVED'.
3. Update patch log.

Changes since v7:
1. Improve patch log.
2. Improve this function __ioremap_check_desc_other().
3. Modify code comment in the __ioremap_check_desc_other()

Changes since v8:
1. Get rid of all changes about ia64.(Borislav's suggestion)
2. Change the examination condition to the 'IORES_DESC_ACPI_*'.
3. Modify the signature. This patch(add the new I/O resource
   descriptor 'IORES_DESC_RESERVED') was suggested by Boris.

Lianbo Jiang (3):
  x86/mm: Change the examination condition to avoid confusion
  resource: add the new I/O resource descriptor 'IORES_DESC_RESERVED'
  x86/kexec_file: add reserved e820 ranges to kdump kernel e820 table

 arch/x86/kernel/crash.c | 6 ++
 arch/x86/kernel/e820.c  | 2 +-
 arch/x86

[PATCH 1/3 v9] x86/mm: Change the examination condition to avoid confusion

2019-03-21 Thread Lianbo Jiang
Following the commit <0e4c12b45aa8> ("x86/mm, resource: Use
PAGE_KERNEL protection for ioremap of memory pages"), here
it is really checking for the 'IORES_DESC_ACPI_*' values.
Therefore, it is necessary to change the examination condition
to avoid confusion.

Signed-off-by: Lianbo Jiang 
---
 arch/x86/mm/ioremap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0029604af8a4..0e3ba620612d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -83,7 +83,8 @@ static bool __ioremap_check_ram(struct resource *res)
 
 static int __ioremap_check_desc_other(struct resource *res)
 {
-   return (res->desc != IORES_DESC_NONE);
+   return ((res->desc == IORES_DESC_ACPI_TABLES) ||
+   (res->desc == IORES_DESC_ACPI_NV_STORAGE));
 }
 
 static int __ioremap_res_check(struct resource *res, void *arg)
-- 
2.17.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


  1   2   >