[tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2013-01-31 Thread tip-bot for Fenghua Yu
Commit-ID:  ec400ddeff200b068ddc6c70f7321f49ecf32ed5
Gitweb: http://git.kernel.org/tip/ec400ddeff200b068ddc6c70f7321f49ecf32ed5
Author: Fenghua Yu 
AuthorDate: Thu, 20 Dec 2012 23:44:28 -0800
Committer:  H. Peter Anvin 
CommitDate: Thu, 31 Jan 2013 13:19:18 -0800

x86/microcode_intel_early.c: Early update ucode on Intel's CPU

Implementation of early update ucode on Intel's CPU.

load_ucode_intel_bsp() scans ucode in initrd image file which is a cpio format
ucode followed by ordinary initrd image file. The binary ucode file is stored
in kernel/x86/microcode/GenuineIntel.bin in the cpio data. All ucode
patches with the same model as BSP are saved in memory. A matching ucode patch
is updated on BSP.

load_ucode_intel_ap() reads saved ucode patches and updates ucode on AP.

Signed-off-by: Fenghua Yu 
Link: 
http://lkml.kernel.org/r/1356075872-3054-9-git-send-email-fenghua...@intel.com
Signed-off-by: H. Peter Anvin 
---
 arch/x86/kernel/microcode_intel_early.c | 796 
 1 file changed, 796 insertions(+)

diff --git a/arch/x86/kernel/microcode_intel_early.c 
b/arch/x86/kernel/microcode_intel_early.c
new file mode 100644
index 000..7890bc8
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel_early.c
@@ -0,0 +1,796 @@
+/*
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu 
+ *H Peter Anvin" 
+ *
+ * This allows to early upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ * Software Developer's Manual.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
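+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+#include <linux/cpu.h>
+#include <asm/msr.h>
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>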
+
+unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+struct mc_saved_data {
+   unsigned int mc_saved_count;
+   struct microcode_intel **mc_saved;
+} mc_saved_data;
+
+static enum ucode_state __cpuinit
+generic_load_microcode_early(struct microcode_intel **mc_saved_p,
+unsigned int mc_saved_count,
+struct ucode_cpu_info *uci)
+{
+   struct microcode_intel *ucode_ptr, *new_mc = NULL;
+   int new_rev = uci->cpu_sig.rev;
+   enum ucode_state state = UCODE_OK;
+   unsigned int mc_size;
+   struct microcode_header_intel *mc_header;
+   unsigned int csig = uci->cpu_sig.sig;
+   unsigned int cpf = uci->cpu_sig.pf;
+   int i;
+
+   for (i = 0; i < mc_saved_count; i++) {
+   ucode_ptr = mc_saved_p[i];
+
+   mc_header = (struct microcode_header_intel *)ucode_ptr;
+   mc_size = get_totalsize(mc_header);
+   if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
+   new_rev = mc_header->rev;
+   new_mc  = ucode_ptr;
+   }
+   }
+
+   if (!new_mc) {
+   state = UCODE_NFOUND;
+   goto out;
+   }
+
+   uci->mc = (struct microcode_intel *)new_mc;
+out:
+   return state;
+}
+
+static void __cpuinit
+microcode_pointer(struct microcode_intel **mc_saved,
+ unsigned long *mc_saved_in_initrd,
+ unsigned long initrd_start, int mc_saved_count)
+{
+   int i;
+
+   for (i = 0; i < mc_saved_count; i++)
+   mc_saved[i] = (struct microcode_intel *)
+ (mc_saved_in_initrd[i] + initrd_start);
+}
+
+#ifdef CONFIG_X86_32
+static void __cpuinit
+microcode_phys(struct microcode_intel **mc_saved_tmp,
+  struct mc_saved_data *mc_saved_data)
+{
+   int i;
+   struct microcode_intel ***mc_saved;
+
+   mc_saved = (struct microcode_intel ***)
+  __pa_symbol(&mc_saved_data->mc_saved);
+   for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+   struct microcode_intel *p;
+
+   p = *(struct microcode_intel **)
+   __pa(mc_saved_data->mc_saved + i);
+   mc_saved_tmp[i] = (struct microcode_intel *)__pa(p);
+   }
+}
+#endif
+
+static enum ucode_state __cpuinit
+load_microcode(struct mc_saved_data *mc_saved_data,
+  unsigned long *mc_saved_in_initrd,
+  unsigned long initrd_start,
+  struct ucode_cpu_info *uci)
+{
+   struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+   unsigned int count = mc_saved_data->mc_saved_count;
+
+   if (!mc_saved_data->mc_saved) {
+   microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
+ initrd_start, count);
+
+   return 

[tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2013-01-31 Thread tip-bot for Fenghua Yu
Commit-ID:  ec400ddeff200b068ddc6c70f7321f49ecf32ed5
Gitweb: http://git.kernel.org/tip/ec400ddeff200b068ddc6c70f7321f49ecf32ed5
Author: Fenghua Yu fenghua...@intel.com
AuthorDate: Thu, 20 Dec 2012 23:44:28 -0800
Committer:  H. Peter Anvin h...@linux.intel.com
CommitDate: Thu, 31 Jan 2013 13:19:18 -0800

x86/microcode_intel_early.c: Early update ucode on Intel's CPU

Implementation of early update ucode on Intel's CPU.

load_ucode_intel_bsp() scans ucode in initrd image file which is a cpio format
ucode followed by ordinary initrd image file. The binary ucode file is stored
in kernel/x86/microcode/GenuineIntel.bin in the cpio data. All ucode
patches with the same model as BSP are saved in memory. A matching ucode patch
is updated on BSP.

load_ucode_intel_ap() reads saved ucode patches and updates ucode on AP.

Signed-off-by: Fenghua Yu fenghua...@intel.com
Link: 
http://lkml.kernel.org/r/1356075872-3054-9-git-send-email-fenghua...@intel.com
Signed-off-by: H. Peter Anvin h...@linux.intel.com
---
 arch/x86/kernel/microcode_intel_early.c | 796 
 1 file changed, 796 insertions(+)

diff --git a/arch/x86/kernel/microcode_intel_early.c 
b/arch/x86/kernel/microcode_intel_early.c
new file mode 100644
index 000..7890bc8
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel_early.c
@@ -0,0 +1,796 @@
+/*
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu fenghua...@intel.com
+ *H Peter Anvin h...@zytor.com
+ *
+ * This allows to early upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ * Software Developer's Manual.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+#include <linux/cpu.h>
+#include <asm/msr.h>
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+struct mc_saved_data {
+   unsigned int mc_saved_count;
+   struct microcode_intel **mc_saved;
+} mc_saved_data;
+
+static enum ucode_state __cpuinit
+generic_load_microcode_early(struct microcode_intel **mc_saved_p,
+unsigned int mc_saved_count,
+struct ucode_cpu_info *uci)
+{
+   struct microcode_intel *ucode_ptr, *new_mc = NULL;
+   int new_rev = uci->cpu_sig.rev;
+   enum ucode_state state = UCODE_OK;
+   unsigned int mc_size;
+   struct microcode_header_intel *mc_header;
+   unsigned int csig = uci->cpu_sig.sig;
+   unsigned int cpf = uci->cpu_sig.pf;
+   int i;
+
+   for (i = 0; i < mc_saved_count; i++) {
+   ucode_ptr = mc_saved_p[i];
+
+   mc_header = (struct microcode_header_intel *)ucode_ptr;
+   mc_size = get_totalsize(mc_header);
+   if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
+   new_rev = mc_header->rev;
+   new_mc  = ucode_ptr;
+   }
+   }
+
+   if (!new_mc) {
+   state = UCODE_NFOUND;
+   goto out;
+   }
+
+   uci->mc = (struct microcode_intel *)new_mc;
+out:
+   return state;
+}
+
+static void __cpuinit
+microcode_pointer(struct microcode_intel **mc_saved,
+ unsigned long *mc_saved_in_initrd,
+ unsigned long initrd_start, int mc_saved_count)
+{
+   int i;
+
+   for (i = 0; i < mc_saved_count; i++)
+   mc_saved[i] = (struct microcode_intel *)
+ (mc_saved_in_initrd[i] + initrd_start);
+}
+
+#ifdef CONFIG_X86_32
+static void __cpuinit
+microcode_phys(struct microcode_intel **mc_saved_tmp,
+  struct mc_saved_data *mc_saved_data)
+{
+   int i;
+   struct microcode_intel ***mc_saved;
+
+   mc_saved = (struct microcode_intel ***)
+  __pa_symbol(&mc_saved_data->mc_saved);
+   for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+   struct microcode_intel *p;
+
+   p = *(struct microcode_intel **)
+   __pa(mc_saved_data->mc_saved + i);
+   mc_saved_tmp[i] = (struct microcode_intel *)__pa(p);
+   }
+}
+#endif
+
+static enum ucode_state __cpuinit
+load_microcode(struct mc_saved_data *mc_saved_data,
+  unsigned long *mc_saved_in_initrd,
+  unsigned long initrd_start,
+  struct ucode_cpu_info *uci)
+{
+   struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+   

Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 08:16 PM, Jacob Shin wrote:
> 
> Not exactly sure why the weird boundaries, I'll have to ask the BIOS
> side folks to be sure. But if I were to guess ..
> 
> Here is the NUMA spew out, physically there is 128 GB connected to
> each memory controller node. The PCI MMIO region starts at 0xc8000000.
> 4 GB - 0xc8000000 = 0x38000000 (896 MB). So we lose 896 MB due to PCI
> MMIO hole, so the first node ends at 128 GB + 896 MB to talk to all of
> 128 GB off of the first memory controller, and hence the weird 896 MB
> offset.
> 

It would obviously be better if the slack were at the end of the total
memory, instead of end of the < 1T range.  If the PCI MMIO hole were a
power of 2 (e.g. 1G) that would also reduce the likelihood of problems
and reduce MTRR pressure.

-hpa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 06:37:45PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 04:29 PM, Jacob Shin wrote:
> > On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
> >> On 12/19/2012 04:07 PM, Jacob Shin wrote:
> >>>
> >>> From what I remember, accessing memory around the memory hole (not
> >>> just the HT hole, but e03800 ~ 100 on our mentioned system
> >>> ) generated prefetches because the memory hole was marked as WB in PAT.
> >>>
> >>> I'll take a look at the system again, try the blanket MTRR covering
> >>> 0xe0 ~ 1TB, and talk to our BIOS guys.
> >>>
> >>
> >> Yes, but do they all #MC (as opposed to, say, fetching all FFs)?
> > 
> > Yes, MCE every time and it was fatal.
> > 
> 
> OK, one more question... there is something odd with the memory ranges here:
> 
>  BIOS-e820: [mem 0x0001-0x00e037ff] usable
>  BIOS-e820: [mem 0x00e03800-0x00fc] reserved
>  BIOS-e820: [mem 0x0100-0x011ffeff] usable
> 
> The first usable range here is 4G to 896G + 896M which is an awfully
> strange number.  Similarly, the second range is 1T to 1T + 128G - 16M.
> The little fiddly bits imply that there is either overshoot of some sort
> going on -- possibly reserved memory -- or these are fairly arbitrary
> sizes that don't match any physical bank sizes in which case it should
> be possible to shuffle it differently...

Not exactly sure why the weird boundaries, I'll have to ask the BIOS
side folks to be sure. But if I were to guess ..

Here is the NUMA spew out, physically there is 128 GB connected to
each memory controller node. The PCI MMIO region starts at 0xc8000000.
4 GB - 0xc8000000 = 0x38000000 (896 MB). So we lose 896 MB due to PCI
MMIO hole, so the first node ends at 128 GB + 896 MB to talk to all of
128 GB off of the first memory controller, and hence the weird 896 MB
offset.

[0.00] SRAT: Node 0 PXM 0 0-a
[0.00] SRAT: Node 0 PXM 0 10-c800
[0.00] SRAT: Node 0 PXM 0 1-203800
[0.00] SRAT: Node 1 PXM 1 203800-403800
[0.00] SRAT: Node 2 PXM 2 403800-603800
[0.00] SRAT: Node 3 PXM 3 603800-803800
[0.00] SRAT: Node 4 PXM 4 803800-a03800
[0.00] SRAT: Node 5 PXM 5 a03800-c03800
[0.00] SRAT: Node 6 PXM 6 c03800-e03800
[0.00] SRAT: Node 7 PXM 7 100-11fff00
[0.00] NUMA: Initialized distance table, cnt=8
[0.00] NUMA: Node 0 [0,a) + [10,c800) -> [0,c800)
[0.00] NUMA: Node 0 [0,c800) + [1,203800) -> 
[0,203800)
[0.00] Initmem setup node 0 -00203800
[0.00]   NODE_DATA [002037ff5000 - 002037ff]
[0.00] Initmem setup node 1 00203800-00403800
[0.00]   NODE_DATA [004037ff5000 - 004037ff]
[0.00] Initmem setup node 2 00403800-00603800
[0.00]   NODE_DATA [006037ff5000 - 006037ff]
[0.00] Initmem setup node 3 00603800-00803800
[0.00]   NODE_DATA [008037ff5000 - 008037ff]
[0.00] Initmem setup node 4 00803800-00a03800
[0.00]   NODE_DATA [00a037ff5000 - 00a037ff]
[0.00] Initmem setup node 5 00a03800-00c03800
[0.00]   NODE_DATA [00c037ff5000 - 00c037ff]
[0.00] Initmem setup node 6 00c03800-00e03800
[0.00]   NODE_DATA [00e037ff2000 - 00e037ffcfff]
[0.00] Initmem setup node 7 0100-011fff00
[0.00]   NODE_DATA [011ffeff1000 - 011ffeffbfff]
[0.00] Zone PFN ranges:
[0.00]   DMA  0x0010 -> 0x1000
[0.00]   DMA320x1000 -> 0x0010
[0.00]   Normal   0x0010 -> 0x11fff000
[0.00] Movable zone start PFN for each node
[0.00] early_node_map[10] active PFN ranges
[0.00] 0: 0x0010 -> 0x0099
[0.00] 0: 0x0100 -> 0x000c7ec0
[0.00] 0: 0x0010 -> 0x02038000
[0.00] 1: 0x02038000 -> 0x04038000
[0.00] 2: 0x04038000 -> 0x06038000
[0.00] 3: 0x06038000 -> 0x08038000
[0.00] 4: 0x08038000 -> 0x0a038000
[0.00] 5: 0x0a038000 -> 0x0c038000
[0.00] 6: 0x0c038000 -> 0x0e038000
[0.00] 7: 0x1000 -> 0x11fff000
[0.00] On node 0 totalpages: 33553993
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 5 pages reserved
[0.00]   DMA zone: 3916 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 800504 pages, LIFO batch:31
[0.00]   Normal zone: 447552 pages used for memmap
[0.00]   Normal zone: 32287680 pages, LIFO batch:31
[0.00] On node 1 totalpages: 33554432
[0.00]   Normal zone: 458752 

Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 04:29 PM, Jacob Shin wrote:
> On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
>> On 12/19/2012 04:07 PM, Jacob Shin wrote:
>>>
>>> From what I remember, accessing memory around the memory hole (not
>>> just the HT hole, but e03800 ~ 100 on our mentioned system
>>> ) generated prefetches because the memory hole was marked as WB in PAT.
>>>
>>> I'll take a look at the system again, try the blanket MTRR covering
>>> 0xe0 ~ 1TB, and talk to our BIOS guys.
>>>
>>
>> Yes, but do they all #MC (as opposed to, say, fetching all FFs)?
> 
> Yes, MCE every time and it was fatal.
> 

OK, one more question... there is something odd with the memory ranges here:

 BIOS-e820: [mem 0x0001-0x00e037ff] usable
 BIOS-e820: [mem 0x00e03800-0x00fc] reserved
 BIOS-e820: [mem 0x0100-0x011ffeff] usable

The first usable range here is 4G to 896G + 896M which is an awfully
strange number.  Similarly, the second range is 1T to 1T + 128G - 16M.
The little fiddly bits imply that there is either overshoot of some sort
going on -- possibly reserved memory -- or these are fairly arbitrary
sizes that don't match any physical bank sizes in which case it should
be possible to shuffle it differently...

-hpa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 04:29 PM, Jacob Shin wrote:
> On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
>> On 12/19/2012 04:07 PM, Jacob Shin wrote:
>>>
>>> From what I remember, accessing memory around the memory hole (not
>>> just the HT hole, but e03800 ~ 100 on our mentioned system
>>> ) generated prefetches because the memory hole was marked as WB in PAT.
>>>
>>> I'll take a look at the system again, try the blanket MTRR covering
>>> 0xe0 ~ 1TB, and talk to our BIOS guys.
>>>
>>
>> Yes, but do they all #MC (as opposed to, say, fetching all FFs)?
> 
> Yes, MCE every time and it was fatal.
> 

So regardless of address.  Bother.

-hpa


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 04:07 PM, Jacob Shin wrote:
> > 
> > From what I remember, accessing memory around the memory hole (not
> > just the HT hole, but e03800 ~ 100 on our mentioned system
> > ) generated prefetches because the memory hole was marked as WB in PAT.
> > 
> > I'll take a look at the system again, try the blanket MTRR covering
> > 0xe0 ~ 1TB, and talk to our BIOS guys.
> > 
> 
> Yes, but do they all #MC (as opposed to, say, fetching all FFs)?

Yes, MCE every time and it was fatal.

> 
>   -hpa
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 04:07 PM, Jacob Shin wrote:
> 
> From what I remember, accessing memory around the memory hole (not
> just the HT hole, but e03800 ~ 100 on our mentioned system
> ) generated prefetches because the memory hole was marked as WB in PAT.
> 
> I'll take a look at the system again, try the blanket MTRR covering
> 0xe0 ~ 1TB, and talk to our BIOS guys.
> 

Yes, but do they all #MC (as opposed to, say, fetching all FFs)?

-hpa


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 04:10 PM, Borislav Petkov wrote:

On Wed, Dec 19, 2012 at 04:02:25PM -0800, H. Peter Anvin wrote:

The goal should be to have this into -tip and -next by the middle of
January in order to make the 3.9 merge window, I think.


...and an easy back-out strategy in case there are too many issues while
testing. Maybe don't merge it into tip/master so that it can be removed
easily, or something to that effect.



We keep everything in topic branches; tip:master is a synthetic branch 
which can be regenerated as needed.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 04:02:25PM -0800, H. Peter Anvin wrote:
> The goal should be to have this into -tip and -next by the middle of
> January in order to make the 3.9 merge window, I think.

...and an easy back-out strategy in case there are too many issues while
testing. Maybe don't merge it into tip/master so that it can be removed
easily, or something to that effect.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 03:50:14PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 03:40 PM, Jacob Shin wrote:
> >>
> >>Just make the hole a bit bigger, so it starts at 0xfc, then you
> >>only need one MTRR.  This is the correct BIOS-level fix, and it really
> >>needs to happen.
> >>
> >>Do these systems actually exist in the field or are they engineering
> >>prototypes?  In the latter case, we might be done at that point.
> >
> >Yes, HP is shipping (or will ship soon) such systems.
> >
> 
> Can you get them to fix the BIOS first, or at least ship a BIOS
> update?  Otherwise there will be a probabilistic failure, and it
> sounds like it is your (AMD's) fault.
> 
> >>The other bit is that building the real kernel page tables iteratively
> >>(ignoring the early page tables here) is safer, since the real page
> >>table builder is fully aware of the memory map.  This means any
> >>"spillover" from the early page tables gets minimized to regions where
> >>there are data objects that have to be accessed early.  Since Yinghai
> >>already had iterative page table building working, I don't see any
> >>reason to not use that capability.
> >
> >Yes, I'll test again with latest, but Yinghai's patchset mapping only
> >RAM from top down solved our problem.
> 
> Please don't make me go Steve Ballmer on you.
> 
> We're talking about two different things... the early page tables
> versus the permanent page tables.  The permanent page tables we can
> handle because the page table creation at that point is aware of the
> memory map.

Ah okay,

> 
> The early page tables are what is used before we get to that point.
> Creating them on demand means that if there are no early-needed data
> structures near the hole, there will be no access and everything
> will be okay, but as the early page table creation *is not and
> cannot be* aware of the memory map.  Right now that simply cannot
> happen, because all such data structures are confined to 32-bit
> addresses, however *THAT WILL CHANGE AND WILL CHANGE SOON*, exactly
> because these kinds of large-memory system needs that to happen.
> You may start seeing failures at that time, and there isn't a huge
> lot we can do about it.
> 
> We are trying to discuss mitigation strategies with you, but you
> haven't really given us any useful information, e.g. what happens
> near the various boundaries of the hole, what could trigger
> prefetching into the range, and what it would take to fix the BIOSes.

From what I remember, accessing memory around the memory hole (not
just the HT hole, but e03800 ~ 100 on our mentioned system
) generated prefetches because the memory hole was marked as WB in PAT.

I'll take a look at the system again, try the blanket MTRR covering
0xe0 ~ 1TB, and talk to our BIOS guys.

> 
>   -hpa
> 
> -- 
> H. Peter Anvin, Intel Open Source Technology Center
> I work for Intel.  I don't speak on their behalf.
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:40 PM, Borislav Petkov wrote:


This is done on the BSP, right? So we can measure it how long it takes
by taking TSC values of start and end.



Yes, and we can count the number of #PF traps cheaply enough.  It would 
be interesting to put a counter on the number of #PFs and the number of 
resets and read them out on a large-system boot.




Sounds doable but we should take a hard look at the patches so that we
don't miss anything.

Also, I don't know how stuff like that would be approached for a wider
testing - I mean, it is a serious change in x86 boot code and there will
be issues.



The goal should be to have this into -tip and -next by the middle of 
January in order to make the 3.9 merge window, I think.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:55 PM, Borislav Petkov wrote:

On Wed, Dec 19, 2012 at 03:50:14PM -0800, H. Peter Anvin wrote:

We are trying to discuss mitigation strategies with you, but you
haven't really given us any useful information, e.g. what happens near
the various boundaries of the hole, what could trigger prefetching into
the range, and what it would take to fix the BIOSes.


Another thing we could do (I admit it is ugly) is to add a quirk to the
#MC handler and detect that specific condition by looking at the address
reported in MCi_ADDR and exit early by not panicking the system.

Again, this is ugly but a possibility, still.



I would really, really hate to have to deal with an early MCE handler, too.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:21 PM, Jacob Shin wrote:

On Thu, Dec 20, 2012 at 12:03:29AM +0100, Borislav Petkov wrote:

On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:

I can check but right, they might be used up. But even if we had slots
available, the memory range that needs to be covered is in large
enough address and aligned in such a way that you cannot cover it with
variable range MTRRs.


Actually, if I'm not mistaken, you only need to cover the HT hole with
one MTRR - the rest remains WB. And in order the mask bits to work, we
could make it a little bigger - we waste some memory but that's nothing
in comparison to the MCE.


Actually all memory hole above 4GB and under TOM2 needs to be marked
as UC, if the kernel just blanket calls init_memory_mapping from 4GB
to top of memory.

Right we would be losing memory, and I think depending on the
alignment of the boundary and how many MTRRs you have available to use,
significant chunks of memory could be lost. I need to go refresh on
how variable range MTRRs are programmed, it has been a while.



In this particular case an MTRR at 0xe000000000 would lose 896 MB of 
RAM, or just under 0.1% of the total.


If it is only the HT region that causes trouble and not the rest of the 
hole you could just plant an MTRR at 0xfc and not lose any 
memory at all.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 03:50:14PM -0800, H. Peter Anvin wrote:
> We are trying to discuss mitigation strategies with you, but you
> haven't really given us any useful information, e.g. what happens near
> the various boundaries of the hole, what could trigger prefetching into
> the range, and what it would take to fix the BIOSes.

Another thing we could do (I admit it is ugly) is to add a quirk to the
#MC handler and detect that specific condition by looking at the address
reported in MCi_ADDR and exit early by not panicking the system.

Again, this is ugly but a possibility, still.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:40 PM, Jacob Shin wrote:


Just make the hole a bit bigger, so it starts at 0xfc00000000, then you
only need one MTRR.  This is the correct BIOS-level fix, and it really
needs to happen.

Do these systems actually exist in the field or are they engineering
prototypes?  In the latter case, we might be done at that point.


Yes, HP is shipping (or will ship soon) such systems.



Can you get them to fix the BIOS first, or at least ship a BIOS update? 
 Otherwise there will be a probabilistic failure, and it sounds like it 
is your (AMD's) fault.



The other bit is that building the real kernel page tables iteratively
(ignoring the early page tables here) is safer, since the real page
table builder is fully aware of the memory map.  This means any
"spillover" from the early page tables gets minimized to regions where
there are data objects that have to be accessed early.  Since Yinghai
already had iterative page table building working, I don't see any
reason to not use that capability.


Yes, I'll test again with latest, but Yinghai's patchset mapping only
RAM from top down solved our problem.


Please don't make me go Steve Ballmer on you.

We're talking about two different things... the early page tables versus 
the permanent page tables.  The permanent page tables we can handle 
because the page table creation at that point is aware of the memory map.


The early page tables are what is used before we get to that point. 
Creating them on demand means that if there are no early-needed data 
structures near the hole, there will be no access and everything will be 
okay, but as the early page table creation *is not and cannot be* aware 
of the memory map.  Right now that simply cannot happen, because all 
such data structures are confined to 32-bit addresses, however *THAT 
WILL CHANGE AND WILL CHANGE SOON*, exactly because these kinds of 
large-memory systems need that to happen.  You may start seeing failures 
at that time, and there isn't a huge lot we can do about it.


We are trying to discuss mitigation strategies with you, but you haven't 
really given us any useful information, e.g. what happens near the 
various boundaries of the hole, what could trigger prefetching into the 
range, and what it would take to fix the BIOSes.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Yinghai Lu
On Wed, Dec 19, 2012 at 3:43 PM, H. Peter Anvin  wrote:
> On 12/19/2012 03:40 PM, Yinghai Lu wrote:
>>
>> On Wed, Dec 19, 2012 at 3:22 PM, H. Peter Anvin  wrote:
>>>
>>> The other bit is that building the real kernel page tables iteratively
>>> (ignoring the early page tables here) is safer, since the real page
>>> table builder is fully aware of the memory map.  This means any
>>> "spillover" from the early page tables gets minimized to regions where
>>> there are data objects that have to be accessed early.  Since Yinghai
>>> already had iterative page table building working, I don't see any
>>> reason to not use that capability.
>>
>>
>> that is v6, right?
>>
>> including that patch
>>
>
> No, that's just a different way to create the early page tables (and it
> doesn't solve anything, quite on the contrary.)  I'm talking about the
> strategy for creating the *permanent* page tables
>

i'm confused. permanent one is in tip/x86/mm2 right?

for for-x86-boot:
so you want v7 plus attached patch ? that change to 2M per PF.

Yinghai


fix_hpa_pe_pgt.patch
Description: Binary data


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Yinghai Lu
On Wed, Dec 19, 2012 at 3:40 PM, Jacob Shin  wrote:
> On Wed, Dec 19, 2012 at 03:22:13PM -0800, H. Peter Anvin wrote:
>> The other bit is that building the real kernel page tables iteratively
>> (ignoring the early page tables here) is safer, since the real page
>> table builder is fully aware of the memory map.  This means any
>> "spillover" from the early page tables gets minimized to regions where
>> there are data objects that have to be accessed early.  Since Yinghai
>> already had iterative page table building working, I don't see any
>> reason to not use that capability.
>
> Yes, I'll test again with latest, but Yinghai's patchset mapping only
> RAM from top down solved our problem.

that is for-x86-mm or tip:x86/mm2

we are talking about for-x86-boot, and it will allow kernel to be loaded above 4G
to solve the kdump problem.

so early map will have two way
1. extend head_64.S to cover kernel instead of just [0, 1G)
2. or peter's #PF handler version patch to set pg table dynamically.
it could cover 1G when PF happens.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:40 PM, Yinghai Lu wrote:

On Wed, Dec 19, 2012 at 3:22 PM, H. Peter Anvin  wrote:

The other bit is that building the real kernel page tables iteratively
(ignoring the early page tables here) is safer, since the real page
table builder is fully aware of the memory map.  This means any
"spillover" from the early page tables gets minimized to regions where
there are data objects that have to be accessed early.  Since Yinghai
already had iterative page table building working, I don't see any
reason to not use that capability.


that is v6, right?

including that patch



No, that's just a different way to create the early page tables (and it 
doesn't solve anything, quite on the contrary.)  I'm talking about the 
strategy for creating the *permanent* page tables


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 03:22:13PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 03:03 PM, Borislav Petkov wrote:
> > On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
> >> I can check but right, they might be used up. But even if we had slots
> >> available, the memory range that needs to be covered is in large
> >> enough address and aligned in such a way that you cannot cover it with
> >> variable range MTRRs.
> > 
> > Actually, if I'm not mistaken, you only need to cover the HT hole with
> > one MTRR - the rest remains WB. And in order the mask bits to work, we
> > could make it a little bigger - we waste some memory but that's nothing
> > in comparison to the MCE.
> > 
> > You might need to talk to hw guys about the feasibility of this deal
> > though.
> > 
> 
> Just make the hole a bit bigger, so it starts at 0xfc00000000, then you
> only need one MTRR.  This is the correct BIOS-level fix, and it really
> needs to happen.
> 
> Do these systems actually exist in the field or are they engineering
> prototypes?  In the latter case, we might be done at that point.

Yes, HP is shipping (or will ship soon) such systems.

> 
> Really, though, AMD should have added a TOM3 for memory above the 1T
> mark since they should have been able to see a 1T hole coming from the
> design of HyperTransport.  This would be the correct hardware-level fix,
> but I don't expect that to happen.
> 

I'll feed this conversation back to our hardware folks, but yes we
still need to handle today's systems.

> Now, calming down a little bit, we are definitely dealing with BIOS
> engineers and so f*ckups are going to happen, again and again.  The
> question is what to do about it.
> 
> The only truly "safe" option is to limit early mappings to 4K pages.
> This is highly undesirable for a bunch of reasons.  Reducing mapping
> granularity to 2M rather than 1G (what Yinghai is proposing) does reduce
> the exposure somewhat; it would be interesting to gather trap statistics
> and try to get a feel for if this actually changes the boot time
> measurably or not.
> 
> The other bit is that building the real kernel page tables iteratively
> (ignoring the early page tables here) is safer, since the real page
> table builder is fully aware of the memory map.  This means any
> "spillover" from the early page tables gets minimized to regions where
> there are data objects that have to be accessed early.  Since Yinghai
> already had iterative page table building working, I don't see any
> reason to not use that capability.

Yes, I'll test again with latest, but Yinghai's patchset mapping only
RAM from top down solved our problem.

Thanks,

> 
> Thoughts?
> 
>   -hpa
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Yinghai Lu
On Wed, Dec 19, 2012 at 3:22 PM, H. Peter Anvin  wrote:
> The other bit is that building the real kernel page tables iteratively
> (ignoring the early page tables here) is safer, since the real page
> table builder is fully aware of the memory map.  This means any
> "spillover" from the early page tables gets minimized to regions where
> there are data objects that have to be accessed early.  Since Yinghai
> already had iterative page table building working, I don't see any
> reason to not use that capability.

that is v6, right?

including that patch

---

Subject: [PATCH] x86, 64bit: Set extra ident mapping for whole kernel range

Current when kernel is loaded above 1G, only [_text, _text+2M] is set
up with extra ident page table.
That is not enough, some variables that could be used early are out of
that range, like BRK for early page table.
Need to set map for [_text, _end] include text/data/bss/brk...

Also current kernel is not allowed to be loaded above 512g, it thinks
that address is too big.
We need to add one extra spare page for level3 to point that 512g range.
Need to check _text range and set level4 pg with that spare level3 page,
and set level3 with level2 page to cover [_text, _end] with extra mapping.

At last, to handle crossing GB boundary, we need to add another
level2 spare page. To handle crossing 512GB boundary, we need to
add another level3 spare page to next 512G range.

Test on with kexec-tools with local test code to force loading kernel
cross 1G, 5G, 512g, 513g.

We need this to put relocatable 64bit bzImage high above 1g.

-v4: add crossing GB boundary handling.
-v5: use spare pages from BRK, so could save pages when kernel is not
loaded above 1GB.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 03:22:13PM -0800, H. Peter Anvin wrote:

[ … ]

> Now, calming down a little bit, we are definitely dealing with BIOS
> engineers and so f*ckups are going to happen, again and again.

Yeppers.

> The only truly "safe" option is to limit early mappings to 4K pages.
> This is highly undesirable for a bunch of reasons.  Reducing mapping
> granularity to 2M rather than 1G (what Yinghai is proposing) does reduce
> the exposure somewhat; it would be interesting to gather trap statistics
> and try to get a feel for if this actually changes the boot time
> measurably or not.

This is done on the BSP, right? So we can measure it how long it takes
by taking TSC values of start and end.

> The other bit is that building the real kernel page tables iteratively
> (ignoring the early page tables here) is safer, since the real page
> table builder is fully aware of the memory map.  This means any
> "spillover" from the early page tables gets minimized to regions where
> there are data objects that have to be accessed early.

That shouldn't be a "lot", relatively speaking.

> Since Yinghai already had iterative page table building working, I
> don't see any reason to not use that capability.
> 
> Thoughts?

Sounds doable but we should take a hard look at the patches so that we
don't miss anything.

Also, I don't know how stuff like that would be approached for a wider
testing - I mean, it is a serious change in x86 boot code and there will
be issues.

Hmm.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:30 PM, Borislav Petkov wrote:

On Wed, Dec 19, 2012 at 03:17:59PM -0800, H. Peter Anvin wrote:

I presume with "too big" he really means "oddly shaped".


Yeah, that's why it could be enlarged a little in order to adjust it to
the MTRR scheme. This is what the BKDG says about it:



Yes, they should just cap the hole a few megabytes short and put an UC 
MTRR at 0xfc00000000.  That should happen regardless... this system is 
dangerous without it.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 03:17:59PM -0800, H. Peter Anvin wrote:
> I presume with "too big" he really means "oddly shaped".

Yeah, that's why it could be enlarged a little in order to adjust it to
the MTRR scheme. This is what the BKDG says about it:

PhysMask and PhysBase are used together to determine whether a target
physical-address falls within the specified address range. PhysMask
is logically ANDed with PhysBase and separately ANDed with the upper
40 bits of the target physical-address. If the results of the two
operations are identical, the target physical-address falls within the
specified memory range. The pseudo-code for the operation is:

MaskBase = PhysMask AND PhysBase
MaskTarget = PhysMask AND Target_Address[51:12]
IF MaskBase == MaskTarget
target address is in range
ELSE
target address is not in range

And then there are the alignment requirements:

* The boundary on which a variable range is aligned must be equal to the
range size. For example, a memory range of 16 Mbytes must be aligned on
a 16-Mbyte boundary.

* The range size must be a power of 2 (2^n, 52 > n > 11), with a minimum
allowable size of 4 Kbytes. For example, 4 Mbytes and 8 Mbytes are
allowable memory range sizes, but 6 Mbytes is not allowable.

and then some examples about how to calculate those values.

Jacob, if you still have the system, you might try to experiment with
that, provided there are some variable MTRRs free, of course. And also
provided, there's nothing else in the hw stopping us from doing that.

Thanks.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 02:55 PM, Jacob Shin wrote:
> 
> Well, really the problem is with any memory hole above 4GB that is too
> big to be covered by variable range MTRRs as UC. Because the kernel
> use to just simply do init_memory_mapping for 4GB ~ top of memory,
> any memory hole above 4GB are marked as WB in PATs.
> 
> How is this handled in Intel architecture? If there are memory holes
> that are too big to be covered by variable range MTRRs as UC, are
> there other MTRR like CPU registers that the BIOS programs?
> 

Intel CPUs don't have the TOM augmentation to the MTRR mechanism, and so
MTRRs need to explicitly enable caching of memory rather than the other
way around.

-hpa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 03:03 PM, Borislav Petkov wrote:
> On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
>> I can check but right, they might be used up. But even if we had slots
>> available, the memory range that needs to be covered is in large
>> enough address and aligned in such a way that you cannot cover it with
>> variable range MTRRs.
> 
> Actually, if I'm not mistaken, you only need to cover the HT hole with
> one MTRR - the rest remains WB. And in order the mask bits to work, we
> could make it a little bigger - we waste some memory but that's nothing
> in comparison to the MCE.
> 
> You might need to talk to hw guys about the feasibility of this deal
> though.
> 

Just make the hole a bit bigger, so it starts at 0xfc00000000, then you
only need one MTRR.  This is the correct BIOS-level fix, and it really
needs to happen.

Do these systems actually exist in the field or are they engineering
prototypes?  In the latter case, we might be done at that point.

Really, though, AMD should have added a TOM3 for memory above the 1T
mark since they should have been able to see a 1T hole coming from the
design of HyperTransport.  This would be the correct hardware-level fix,
but I don't expect that to happen.

Now, calming down a little bit, we are definitely dealing with BIOS
engineers and so f*ckups are going to happen, again and again.  The
question is what to do about it.

The only truly "safe" option is to limit early mappings to 4K pages.
This is highly undesirable for a bunch of reasons.  Reducing mapping
granularity to 2M rather than 1G (what Yinghai is proposing) does reduce
the exposure somewhat; it would be interesting to gather trap statistics
and try to get a feel for if this actually changes the boot time
measurably or not.

The other bit is that building the real kernel page tables iteratively
(ignoring the early page tables here) is safer, since the real page
table builder is fully aware of the memory map.  This means any
"spillover" from the early page tables gets minimized to regions where
there are data objects that have to be accessed early.  Since Yinghai
already had iterative page table building working, I don't see any
reason to not use that capability.

Thoughts?

-hpa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Thu, Dec 20, 2012 at 12:03:29AM +0100, Borislav Petkov wrote:
> On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
> > I can check but right, they might be used up. But even if we had slots
> > available, the memory range that needs to be covered is in large
> > enough address and aligned in such a way that you cannot cover it with
> > variable range MTRRs.
> 
> Actually, if I'm not mistaken, you only need to cover the HT hole with
> one MTRR - the rest remains WB. And in order the mask bits to work, we
> could make it a little bigger - we waste some memory but that's nothing
> in comparison to the MCE.

Actually all memory hole above 4GB and under TOM2 needs to be marked
as UC, if the kernel just blanket calls init_memory_mapping from 4GB
to top of memory.

Right we would be losing memory, and I think depending on the
alignment of the boundary and how many MTRRs you have available to use,
significant chunks of memory could be lost. I need to go refresh on
how variable range MTRRs are programmed, it has been a while.

> 
> You might need to talk to hw guys about the feasibility of this deal
> though.
> 
> Thanks.
> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 03:00 PM, Borislav Petkov wrote:
> On Wed, Dec 19, 2012 at 04:55:06PM -0600, Jacob Shin wrote:
>> Well, really the problem is with any memory hole above 4GB that is too
>> big to be covered by variable range MTRRs as UC.
> 
> Why, their PhysBase field is the 40 MSB bits of the physical address.
> That should be more than TB.
> 

I presume with "too big" he really means "oddly shaped".

-hpa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
> I can check but right, they might be used up. But even if we had slots
> available, the memory range that needs to be covered is in large
> enough address and aligned in such a way that you cannot cover it with
> variable range MTRRs.

Actually, if I'm not mistaken, you only need to cover the HT hole with
one MTRR - the rest remains WB. And in order the mask bits to work, we
could make it a little bigger - we waste some memory but that's nothing
in comparison to the MCE.

You might need to talk to hw guys about the feasibility of this deal
though.

Thanks.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 04:55:06PM -0600, Jacob Shin wrote:
> Well, really the problem is with any memory hole above 4GB that is too
> big to be covered by variable range MTRRs as UC.

Why, their PhysBase field is the 40 MSB bits of the physical address.
That should be more than TB.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 11:51:55PM +0100, Borislav Petkov wrote:
> On Wed, Dec 19, 2012 at 02:25:44PM -0800, H. Peter Anvin wrote:
> > The real question is what we can do to mitigate the damage.
> 
> Let's try the first thing that comes to mind: waste a variable MTRR on
> it:
> 
> [    0.000000] MTRR variable ranges enabled:
> [    0.000000]   0 base 000000000000 mask FFFF80000000 write-back
> [    0.000000]   1 base 000080000000 mask FFFFC0000000 write-back
> [    0.000000]   2 base 0000C0000000 mask FFFFF0000000 write-back
> [    0.000000]   3 base 000100000000 mask FFFF00000000 write-back
> [    0.000000]   4 base 000200000000 mask FFFFE0000000 write-back
> [    0.000000]   5 base 000220000000 mask FFFFF0000000 write-back
> [    0.000000]   6 disabled
> [    0.000000]   7 disabled
> 
> one of those last two. This is a small box though so I'm guessing on 1T
> boxes those last two won't be disabled. Jacob?

I can check but right, they might be used up. But even if we had slots
available, the memory range that needs to be covered is in large
enough address and aligned in such a way that you cannot cover it with
variable range MTRRs.

> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 02:47 PM, Yinghai Lu wrote:
> 
> on demand to only map 2M will help ?
> or have to return to v6 version for-x86-boot ?
> 

Why would 2M be inherently better than 1G?  I realize it works for the
*one particular system* that you have a specimen for, but that is not a
sensible approach for architecture.

The problem remains no matter how you slice it; we need a general
solution.  The fact that this system was ever built reflects a number of
critical failures that should be surprising but sadly are not.

-hpa

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 02:25:44PM -0800, H. Peter Anvin wrote:
> On 12/19/2012 02:05 PM, Jacob Shin wrote:
> >On Wed, Dec 19, 2012 at 01:48:33PM -0800, H. Peter Anvin wrote:
> >>There are a few very serious problems we need to figure out related to 
> >>generalizing very early boot.  If this range gets mapped, will the CPU 
> >>treat it as WB?  If so, with what consequences for either the HT region or 
> >>the hole below it?
> >
> >Hm .. I guess I need to read the whole email thread .. but if you can
> >explain it in short, what are the problems?
> >
> >Yes the CPU treats it as WB because the region is under TOM2, so by
> >default it is WB, and also when you create direct mapping page tables,
> >the PATs mark them as WB.
> >
> >What we have seen is that even though the kernel never generate memory
> >accesses in the hole (since E820 says that it is not RAM) when kernel
> >read/writes memory near the hole, the CPU was prefetching into the
> >hole because PATs say that it is WB. This resulted in MCE because
> >there is no physical RAM there.
> >
> 
> IOW, epic f*ckup.
> 
> The problem is that before we have awareness of the memory map, we
> need to map things in order to access them.  This is a big problem
> and right now there are ridiculous heuristics.  I have been working
> on mapping on demand, but there are concerns about the boundaries
> (i.e. what happens if the mapping spill over into a pit like this.)
> 
> This kind of stuff is really not acceptable.  A region which will
> cause malfunction if prefetched should not be WB in the MTRR system
> (I include TOM* in that.)  The real question is what we can do to
> mitigate the damage.

Well, really the problem is with any memory hole above 4GB that is too
big to be covered by variable range MTRRs as UC. Because the kernel
used to simply do init_memory_mapping for 4GB ~ top of memory,
any memory holes above 4GB are marked as WB in PATs.

How is this handled on Intel architecture? If there are memory holes
that are too big to be covered by variable range MTRRs as UC, are
there other MTRR-like CPU registers that the BIOS programs?


Thanks,

-Jacob

> 
>   -hpa
> 
> -- 
> H. Peter Anvin, Intel Open Source Technology Center
> I work for Intel.  I don't speak on their behalf.
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 02:25:44PM -0800, H. Peter Anvin wrote:
> The real question is what we can do to mitigate the damage.

Let's try the first thing that comes to mind: waste a variable MTRR on
it:

[    0.000000] MTRR variable ranges enabled:
[    0.000000]   0 base 000000000000 mask FFFF80000000 write-back
[    0.000000]   1 base 000080000000 mask FFFFC0000000 write-back
[    0.000000]   2 base 0000C0000000 mask FFFFF0000000 write-back
[    0.000000]   3 base 000100000000 mask FFFF00000000 write-back
[    0.000000]   4 base 000200000000 mask FFFFE0000000 write-back
[    0.000000]   5 base 000220000000 mask FFFFF0000000 write-back
[    0.000000]   6 disabled
[    0.000000]   7 disabled

one of those last two. This is a small box though so I'm guessing on 1T
boxes those last two won't be disabled. Jacob?

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Yinghai Lu
On Wed, Dec 19, 2012 at 2:25 PM, H. Peter Anvin  wrote:
>
> The problem is that before we have awareness of the memory map, we need to
> map things in order to access them.  This is a big problem and right now
> there are ridiculous heuristics.  I have been working on mapping on demand,
> but there are concerns about the boundaries (i.e. what happens if the
> mapping spill over into a pit like this.)
>
> This kind of stuff is really not acceptable.  A region which will cause
> malfunction if prefetched should not be WB in the MTRR system (I include
> TOM* in that.)  The real question is what we can do to mitigate the damage.

Would mapping only 2M on demand help?
Or do we have to return to the v6 version of for-x86-boot?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 02:05 PM, Jacob Shin wrote:

On Wed, Dec 19, 2012 at 01:48:33PM -0800, H. Peter Anvin wrote:

There are a few very serious problems we need to figure out related to 
generalizing very early boot.  If this range gets mapped, will the CPU treat it 
as WB?  If so, with what consequences for either the HT region or the hole 
below it?


Hm .. I guess I need to read the whole email thread .. but if you can
explain it in short, what are the problems?

Yes the CPU treats it as WB because the region is under TOM2, so by
default it is WB, and also when you create direct mapping page tables,
the PATs mark them as WB.

What we have seen is that even though the kernel never generate memory
accesses in the hole (since E820 says that it is not RAM) when kernel
read/writes memory near the hole, the CPU was prefetching into the
hole because PATs say that it is WB. This resulted in MCE because
there is no physical RAM there.



IOW, epic f*ckup.

The problem is that before we have awareness of the memory map, we need 
to map things in order to access them.  This is a big problem and right 
now there are ridiculous heuristics.  I have been working on mapping on 
demand, but there are concerns about the boundaries (i.e. what happens 
if the mapping spill over into a pit like this.)


This kind of stuff is really not acceptable.  A region which will cause 
malfunction if prefetched should not be WB in the MTRR system (I include 
TOM* in that.)  The real question is what we can do to mitigate the damage.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 01:48:33PM -0800, H. Peter Anvin wrote:
> There are a few very serious problems we need to figure out related to 
> generalizing very early boot.  If this range gets mapped, will the CPU treat 
> it as WB?  If so, with what consequences for either the HT region or the hole 
> below it?

Hm .. I guess I need to read the whole email thread .. but if you can
explain it in short, what are the problems?

Yes the CPU treats it as WB because the region is under TOM2, so by
default it is WB, and also when you create direct mapping page tables,
the PATs mark them as WB.

What we have seen is that even though the kernel never generates memory
accesses in the hole (since E820 says that it is not RAM), when the kernel
reads/writes memory near the hole, the CPU was prefetching into the
hole because the PATs say that it is WB. This resulted in an MCE because
there is no physical RAM there.

-Jacob

> 
> Jacob Shin  wrote:
> 
> >On Wed, Dec 19, 2012 at 09:37:51PM +0100, Borislav Petkov wrote:
> >> On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
> >> > On 12/15/2012 03:15 PM, Yinghai Lu wrote:
> >> > >>
> >> > >>That is for the kernel region itself (that code is actually
> >unchanged from
> >> > >>the current code), and yes, we could cap that one to _end if
> >there are
> >> > >>systems which have bugs in that area.  The dynamic page tables
> >map 1G
> >> > >>aligned at a time.
> >> > >
> >> > >dynamic should be 2M too.
> >> > >
> >> > >AMD system:
> >> > >
> >> >
> >>http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
> >> > >
> >> > >  BIOS-e820: [mem 0x0001-0x00e037ff] usable
> >> > >  BIOS-e820: [mem 0x00e03800-0x00fc] reserved
> >> > >  BIOS-e820: [mem 0x0100-0x011ffeff] usable
> >> > >
> >> > >the hole is not 1G aligned.
> >> > >
> >> > >or HT region is from e04000 ?
> >> > >
> >> > 
> >> > The HT region starts at 0xfd -- after that reserved region,
> >> > so I have no idea what that particular system is trying to do or
> >> > what is requirements are (nor what its MTRR setup is, since you
> >> > didn't post it.)
> >> 
> >> This is something that Jacob should be able to answer since he's been
> >> dealing with the 1T support.
> >> 
> >> Jacob, how is the HT hole marked on AMD? I know hazily that we do say
> >> "all memory regions cacheable by default if not explicitly marked"
> >but
> >> we need to exclude the HT hole from that, right?
> >> 
> >> So how are we doing that, MTRRs?
> >
> >HT hole is architectural, I guess in manuals somewhere and is:
> >0xfd ~ 0x100. CPU cannot generate memory read/write in
> >that region.
> >
> >On that above particular system, there is 1TB of total RAM, and since
> >we do not want to loose memory around the HT hole, what BIOS has done
> >is programmed the DRAM controller to move the last 128 GB of memory
> >to above the HT region. There are 8 memory nodes, the last DRAM
> >address of the 7th node is 0xe03800. Then there is a hole and the
> >first address of the last memory node starts at 1TB.
> >
> >MTRRs only cover under 4GB, and does not cover the HT hole.
> >
> >Yinghai's mm patchset to only direct map regions backed by RAM solves
> >our memory hole around HT area.
> >
> >I've tested Yinghai's patchset (several of early versions)
> >successfully on our above 1TB system. I'll try the latest tip/mm2
> >again sometime later today, but I'm pretty sure it should be fine.
> >
> >Thanks,
> >
> >-Jacob
> >
> >> 
> >> Thanks.
> >> 
> >> -- 
> >> Regards/Gruss,
> >> Boris.
> >> 
> >> Sent from a fat crate under my desk. Formatting is fine.
> >> --
> >> 
> 
> -- 
> Sent from my mobile phone. Please excuse brevity and lack of formatting.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
There are a few very serious problems we need to figure out related to 
generalizing very early boot.  If this range gets mapped, will the CPU treat it 
as WB?  If so, with what consequences for either the HT region or the hole 
below it?

Jacob Shin  wrote:

>On Wed, Dec 19, 2012 at 09:37:51PM +0100, Borislav Petkov wrote:
>> On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
>> > On 12/15/2012 03:15 PM, Yinghai Lu wrote:
>> > >>
>> > >>That is for the kernel region itself (that code is actually
>unchanged from
>> > >>the current code), and yes, we could cap that one to _end if
>there are
>> > >>systems which have bugs in that area.  The dynamic page tables
>map 1G
>> > >>aligned at a time.
>> > >
>> > >dynamic should be 2M too.
>> > >
>> > >AMD system:
>> > >
>> >
>>http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
>> > >
>> > >  BIOS-e820: [mem 0x0001-0x00e037ff] usable
>> > >  BIOS-e820: [mem 0x00e03800-0x00fc] reserved
>> > >  BIOS-e820: [mem 0x0100-0x011ffeff] usable
>> > >
>> > >the hole is not 1G aligned.
>> > >
>> > >or HT region is from e04000 ?
>> > >
>> > 
>> > The HT region starts at 0xfd -- after that reserved region,
>> > so I have no idea what that particular system is trying to do or
>> > what is requirements are (nor what its MTRR setup is, since you
>> > didn't post it.)
>> 
>> This is something that Jacob should be able to answer since he's been
>> dealing with the 1T support.
>> 
>> Jacob, how is the HT hole marked on AMD? I know hazily that we do say
>> "all memory regions cacheable by default if not explicitly marked"
>but
>> we need to exclude the HT hole from that, right?
>> 
>> So how are we doing that, MTRRs?
>
>HT hole is architectural, I guess in manuals somewhere and is:
>0xfd ~ 0x100. CPU cannot generate memory read/write in
>that region.
>
>On that above particular system, there is 1TB of total RAM, and since
>we do not want to loose memory around the HT hole, what BIOS has done
>is programmed the DRAM controller to move the last 128 GB of memory
>to above the HT region. There are 8 memory nodes, the last DRAM
>address of the 7th node is 0xe03800. Then there is a hole and the
>first address of the last memory node starts at 1TB.
>
>MTRRs only cover under 4GB, and does not cover the HT hole.
>
>Yinghai's mm patchset to only direct map regions backed by RAM solves
>our memory hole around HT area.
>
>I've tested Yinghai's patchset (several of early versions)
>successfully on our above 1TB system. I'll try the latest tip/mm2
>again sometime later today, but I'm pretty sure it should be fine.
>
>Thanks,
>
>-Jacob
>
>> 
>> Thanks.
>> 
>> -- 
>> Regards/Gruss,
>> Boris.
>> 
>> Sent from a fat crate under my desk. Formatting is fine.
>> --
>> 

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 09:37:51PM +0100, Borislav Petkov wrote:
> On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
> > On 12/15/2012 03:15 PM, Yinghai Lu wrote:
> > >>
> > >>That is for the kernel region itself (that code is actually unchanged from
> > >>the current code), and yes, we could cap that one to _end if there are
> > >>systems which have bugs in that area.  The dynamic page tables map 1G
> > >>aligned at a time.
> > >
> > >dynamic should be 2M too.
> > >
> > >AMD system:
> > >
> > >http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
> > >
> > >  BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
> > >  BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
> > >  BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable
> > >
> > >the hole is not 1G aligned.
> > >
> > >or HT region is from e040000000 ?
> > >
> > 
> > The HT region starts at 0xfd00000000 -- after that reserved region,
> > so I have no idea what that particular system is trying to do or
> > what its requirements are (nor what its MTRR setup is, since you
> > didn't post it.)
> 
> This is something that Jacob should be able to answer since he's been
> dealing with the 1T support.
> 
> Jacob, how is the HT hole marked on AMD? I know hazily that we do say
> "all memory regions cacheable by default if not explicitly marked" but
> we need to exclude the HT hole from that, right?
> 
> So how are we doing that, MTRRs?

The HT hole is architectural (documented in the manuals somewhere, I
guess) and is: 0xfd00000000 ~ 0x10000000000. The CPU cannot generate
memory reads/writes in that region.

On that above particular system, there is 1TB of total RAM, and since
we do not want to lose memory around the HT hole, what the BIOS has done
is program the DRAM controller to move the last 128 GB of memory
to above the HT region. There are 8 memory nodes; the last DRAM
address of the 7th node is 0xe038000000. Then there is a hole and the
first address of the last memory node starts at 1TB.

MTRRs only cover under 4GB, and do not cover the HT hole.

Yinghai's mm patchset to only direct map regions backed by RAM solves
our memory hole around HT area.

I've tested Yinghai's patchset (several of early versions)
successfully on our above 1TB system. I'll try the latest tip/mm2
again sometime later today, but I'm pretty sure it should be fine.

Thanks,

-Jacob

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
> Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
> On 12/15/2012 03:15 PM, Yinghai Lu wrote:
> >>
> >>That is for the kernel region itself (that code is actually unchanged from
> >>the current code), and yes, we could cap that one to _end if there are
> >>systems which have bugs in that area.  The dynamic page tables map 1G
> >>aligned at a time.
> >
> >dynamic should be 2M too.
> >
> >AMD system:
> >
> >http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
> >
> >  BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
> >  BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
> >  BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable
> >
> >the hole is not 1G aligned.
> >
> >or HT region is from e040000000 ?
> >
> 
> The HT region starts at 0xfd00000000 -- after that reserved region,
> so I have no idea what that particular system is trying to do or
> what its requirements are (nor what its MTRR setup is, since you
> didn't post it.)

This is something that Jacob should be able to answer since he's been
dealing with the 1T support.

Jacob, how is the HT hole marked on AMD? I know hazily that we do say
"all memory regions cacheable by default if not explicitly marked" but
we need to exclude the HT hole from that, right?

So how are we doing that, MTRRs?

Thanks.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
 On 12/15/2012 03:15 PM, Yinghai Lu wrote:
 
 That is for the kernel region itself (that code is actually unchanged from
 the current code), and yes, we could cap that one to _end if there are
 systems which have bugs in that area.  The dynamic page tables map 1G
 aligned at a time.
 
 dynamic should be 2M too.
 
 AMD system:
 
 http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
 
   BIOS-e820: [mem 0x0001-0x00e037ff] usable
   BIOS-e820: [mem 0x00e03800-0x00fc] reserved
   BIOS-e820: [mem 0x0100-0x011ffeff] usable
 
 the hole is not 1G aligned.
 
 or HT region is from e04000 ?
 
 
 The HT region starts at 0xfd -- after that reserved region,
 so I have no idea what that particular system is trying to do or
 what is requirements are (nor what its MTRR setup is, since you
 didn't post it.)

This is something that Jacob should be able to answer since he's been
dealing with the 1T support.

Jacob, how is the HT hole marked on AMD? I know hazily that we do say
"all memory regions cacheable by default if not explicitly marked" but
we need to exclude the HT hole from that, right?

So how are we doing that, MTRRs?

Thanks.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 09:37:51PM +0100, Borislav Petkov wrote:
 On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
  On 12/15/2012 03:15 PM, Yinghai Lu wrote:
  
  That is for the kernel region itself (that code is actually unchanged from
  the current code), and yes, we could cap that one to _end if there are
  systems which have bugs in that area.  The dynamic page tables map 1G
  aligned at a time.
  
  dynamic should be 2M too.
  
  AMD system:
  
  http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
  
BIOS-e820: [mem 0x0001-0x00e037ff] usable
BIOS-e820: [mem 0x00e03800-0x00fc] reserved
BIOS-e820: [mem 0x0100-0x011ffeff] usable
  
  the hole is not 1G aligned.
  
  or HT region is from e04000 ?
  
  
  The HT region starts at 0xfd -- after that reserved region,
  so I have no idea what that particular system is trying to do or
  what is requirements are (nor what its MTRR setup is, since you
  didn't post it.)
 
 This is something that Jacob should be able to answer since he's been
 dealing with the 1T support.
 
 Jacob, how is the HT hole marked on AMD? I know hazily that we do say
 all memory regions cacheable by default if not explicitly marked but
 we need to exclude the HT hole from that, right?
 
 So how are we doing that, MTRRs?

The HT hole is architectural (documented in the manuals somewhere, I
guess) and is: 0xfd00000000 ~ 0x10000000000. The CPU cannot generate
memory reads/writes in that region.

On that above particular system, there is 1TB of total RAM, and since
we do not want to lose memory around the HT hole, what the BIOS has done
is program the DRAM controller to move the last 128 GB of memory
to above the HT region. There are 8 memory nodes; the last DRAM
address of the 7th node is 0xe038000000. Then there is a hole and the
first address of the last memory node starts at 1TB.

MTRRs only cover under 4GB, and do not cover the HT hole.

Yinghai's mm patchset to only direct map regions backed by RAM solves
our memory hole around HT area.

I've tested Yinghai's patchset (several of early versions)
successfully on our above 1TB system. I'll try the latest tip/mm2
again sometime later today, but I'm pretty sure it should be fine.

Thanks,

-Jacob

 
 Thanks.
 
 -- 
 Regards/Gruss,
 Boris.
 
 Sent from a fat crate under my desk. Formatting is fine.
 --
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
There are a few very serious problems we need to figure out related to 
generalizing very early boot.  If this range gets mapped, will the CPU treat it 
as WB?  If so, with what consequences for either the HT region or the hole 
below it?

Jacob Shin <jacob.s...@amd.com> wrote:

On Wed, Dec 19, 2012 at 09:37:51PM +0100, Borislav Petkov wrote:
 On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
  On 12/15/2012 03:15 PM, Yinghai Lu wrote:
  
  That is for the kernel region itself (that code is actually
unchanged from
  the current code), and yes, we could cap that one to _end if
there are
  systems which have bugs in that area.  The dynamic page tables
map 1G
  aligned at a time.
  
  dynamic should be 2M too.
  
  AMD system:
  
 
http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
  
BIOS-e820: [mem 0x0001-0x00e037ff] usable
BIOS-e820: [mem 0x00e03800-0x00fc] reserved
BIOS-e820: [mem 0x0100-0x011ffeff] usable
  
  the hole is not 1G aligned.
  
  or HT region is from e04000 ?
  
  
  The HT region starts at 0xfd -- after that reserved region,
  so I have no idea what that particular system is trying to do or
  what is requirements are (nor what its MTRR setup is, since you
  didn't post it.)
 
 This is something that Jacob should be able to answer since he's been
 dealing with the 1T support.
 
 Jacob, how is the HT hole marked on AMD? I know hazily that we do say
 all memory regions cacheable by default if not explicitly marked
but
 we need to exclude the HT hole from that, right?
 
 So how are we doing that, MTRRs?

HT hole is architectural, I guess in manuals somewhere and is:
0xfd ~ 0x100. CPU cannot generate memory read/write in
that region.

On that above particular system, there is 1TB of total RAM, and since
we do not want to loose memory around the HT hole, what BIOS has done
is programmed the DRAM controller to move the last 128 GB of memory
to above the HT region. There are 8 memory nodes, the last DRAM
address of the 7th node is 0xe03800. Then there is a hole and the
first address of the last memory node starts at 1TB.

MTRRs only cover under 4GB, and does not cover the HT hole.

Yinghai's mm patchset to only direct map regions backed by RAM solves
our memory hole around HT area.

I've tested Yinghai's patchset (several of early versions)
successfully on our above 1TB system. I'll try the latest tip/mm2
again sometime later today, but I'm pretty sure it should be fine.

Thanks,

-Jacob

 
 Thanks.
 
 -- 
 Regards/Gruss,
 Boris.
 
 Sent from a fat crate under my desk. Formatting is fine.
 --
 

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 01:48:33PM -0800, H. Peter Anvin wrote:
 There are a few very serious problems we need to figure out related to 
 generalizing very early boot.  If this range gets mapped, will the CPU treat 
 it as WB?  If so, with what consequences for either the HT region or the hole 
 below it?

Hm .. I guess I need to read the whole email thread .. but if you can
explain it in short, what are the problems?

Yes the CPU treats it as WB because the region is under TOM2, so by
default it is WB, and also when you create direct mapping page tables,
the PATs mark them as WB.

What we have seen is that even though the kernel never generate memory
accesses in the hole (since E820 says that it is not RAM) when kernel
read/writes memory near the hole, the CPU was prefetching into the
hole because PATs say that it is WB. This resulted in MCE because
there is no physical RAM there.

-Jacob

 
 Jacob Shin jacob.s...@amd.com wrote:
 
 On Wed, Dec 19, 2012 at 09:37:51PM +0100, Borislav Petkov wrote:
  On Sat, Dec 15, 2012 at 03:17:05PM -0800, H. Peter Anvin wrote:
   On 12/15/2012 03:15 PM, Yinghai Lu wrote:
   
   That is for the kernel region itself (that code is actually
 unchanged from
   the current code), and yes, we could cap that one to _end if
 there are
   systems which have bugs in that area.  The dynamic page tables
 map 1G
   aligned at a time.
   
   dynamic should be 2M too.
   
   AMD system:
   
  
 http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
   
 BIOS-e820: [mem 0x0001-0x00e037ff] usable
 BIOS-e820: [mem 0x00e03800-0x00fc] reserved
 BIOS-e820: [mem 0x0100-0x011ffeff] usable
   
   the hole is not 1G aligned.
   
   or HT region is from e04000 ?
   
   
   The HT region starts at 0xfd -- after that reserved region,
   so I have no idea what that particular system is trying to do or
   what is requirements are (nor what its MTRR setup is, since you
   didn't post it.)
  
  This is something that Jacob should be able to answer since he's been
  dealing with the 1T support.
  
  Jacob, how is the HT hole marked on AMD? I know hazily that we do say
  all memory regions cacheable by default if not explicitly marked
 but
  we need to exclude the HT hole from that, right?
  
  So how are we doing that, MTRRs?
 
 HT hole is architectural, I guess in manuals somewhere and is:
 0xfd ~ 0x100. CPU cannot generate memory read/write in
 that region.
 
 On that above particular system, there is 1TB of total RAM, and since
 we do not want to loose memory around the HT hole, what BIOS has done
 is programmed the DRAM controller to move the last 128 GB of memory
 to above the HT region. There are 8 memory nodes, the last DRAM
 address of the 7th node is 0xe03800. Then there is a hole and the
 first address of the last memory node starts at 1TB.
 
 MTRRs only cover under 4GB, and does not cover the HT hole.
 
 Yinghai's mm patchset to only direct map regions backed by RAM solves
 our memory hole around HT area.
 
 I've tested Yinghai's patchset (several of early versions)
 successfully on our above 1TB system. I'll try the latest tip/mm2
 again sometime later today, but I'm pretty sure it should be fine.
 
 Thanks,
 
 -Jacob
 
  
  Thanks.
  
  -- 
  Regards/Gruss,
  Boris.
  
  Sent from a fat crate under my desk. Formatting is fine.
  --
  
 
 -- 
 Sent from my mobile phone. Please excuse brevity and lack of formatting.
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 02:05 PM, Jacob Shin wrote:

On Wed, Dec 19, 2012 at 01:48:33PM -0800, H. Peter Anvin wrote:

There are a few very serious problems we need to figure out related to 
generalizing very early boot.  If this range gets mapped, will the CPU treat it 
as WB?  If so, with what consequences for either the HT region or the hole 
below it?


Hm .. I guess I need to read the whole email thread .. but if you can
explain it in short, what are the problems?

Yes the CPU treats it as WB because the region is under TOM2, so by
default it is WB, and also when you create direct mapping page tables,
the PATs mark them as WB.

What we have seen is that even though the kernel never generate memory
accesses in the hole (since E820 says that it is not RAM) when kernel
read/writes memory near the hole, the CPU was prefetching into the
hole because PATs say that it is WB. This resulted in MCE because
there is no physical RAM there.



IOW, epic f*ckup.

The problem is that before we have awareness of the memory map, we need 
to map things in order to access them.  This is a big problem and right 
now there are ridiculous heuristics.  I have been working on mapping on 
demand, but there are concerns about the boundaries (i.e. what happens 
if the mapping spill over into a pit like this.)


This kind of stuff is really not acceptable.  A region which will cause 
malfunction if prefetched should not be WB in the MTRR system (I include 
TOM* in that.)  The real question is what we can do to mitigate the damage.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Yinghai Lu
On Wed, Dec 19, 2012 at 2:25 PM, H. Peter Anvin h...@zytor.com wrote:

 The problem is that before we have awareness of the memory map, we need to
 map things in order to access them.  This is a big problem and right now
 there are ridiculous heuristics.  I have been working on mapping on demand,
 but there are concerns about the boundaries (i.e. what happens if the
 mapping spill over into a pit like this.)

 This kind of stuff is really not acceptable.  A region which will cause
 malfunction if prefetched should not be WB in the MTRR system (I include
 TOM* in that.)  The real question is what we can do to mitigate the damage.

on demand to only map 2M will help ?
or have to return to v6 version for-x86-boot ?
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 02:25:44PM -0800, H. Peter Anvin wrote:
 The real question is what we can do to mitigate the damage.

Let's try the first thing that comes to mind: waste a variable MTRR on
it:

[    0.000000] MTRR variable ranges enabled:
[    0.000000]   0 base 000000000000 mask FFFF80000000 write-back
[    0.000000]   1 base 000080000000 mask FFFFC0000000 write-back
[    0.000000]   2 base 0000C0000000 mask FFFFF0000000 write-back
[    0.000000]   3 base 000100000000 mask FFFF00000000 write-back
[    0.000000]   4 base 000200000000 mask FFFFE0000000 write-back
[    0.000000]   5 base 000220000000 mask FFFFF0000000 write-back
[    0.000000]   6 disabled
[    0.000000]   7 disabled

one of those last two. This is a small box though so I'm guessing on 1T
boxes those last two won't be disabled. Jacob?

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 02:25:44PM -0800, H. Peter Anvin wrote:
 On 12/19/2012 02:05 PM, Jacob Shin wrote:
 On Wed, Dec 19, 2012 at 01:48:33PM -0800, H. Peter Anvin wrote:
 There are a few very serious problems we need to figure out related to 
 generalizing very early boot.  If this range gets mapped, will the CPU 
 treat it as WB?  If so, with what consequences for either the HT region or 
 the hole below it?
 
 Hm .. I guess I need to read the whole email thread .. but if you can
 explain it in short, what are the problems?
 
 Yes the CPU treats it as WB because the region is under TOM2, so by
 default it is WB, and also when you create direct mapping page tables,
 the PATs mark them as WB.
 
 What we have seen is that even though the kernel never generate memory
 accesses in the hole (since E820 says that it is not RAM) when kernel
 read/writes memory near the hole, the CPU was prefetching into the
 hole because PATs say that it is WB. This resulted in MCE because
 there is no physical RAM there.
 
 
 IOW, epic f*ckup.
 
 The problem is that before we have awareness of the memory map, we
 need to map things in order to access them.  This is a big problem
 and right now there are ridiculous heuristics.  I have been working
 on mapping on demand, but there are concerns about the boundaries
 (i.e. what happens if the mapping spill over into a pit like this.)
 
 This kind of stuff is really not acceptable.  A region which will
 cause malfunction if prefetched should not be WB in the MTRR system
 (I include TOM* in that.)  The real question is what we can do to
 mitigate the damage.

Well, really the problem is with any memory hole above 4GB that is too
big to be covered by variable range MTRRs as UC. Because the kernel
used to just simply do init_memory_mapping for 4GB ~ top of memory,
any memory hole above 4GB is marked as WB in PATs.

How is this handled in Intel architecture? If there are memory holes
that are too big to be covered by variable range MTRRs as UC, are
there other MTRR like CPU registers that the BIOS programs?


Thanks,

-Jacob

 
   -hpa
 
 -- 
 H. Peter Anvin, Intel Open Source Technology Center
 I work for Intel.  I don't speak on their behalf.
 
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 02:47 PM, Yinghai Lu wrote:
 
 on demand to only map 2M will help ?
 or have to return to v6 version for-x86-boot ?
 

Why would 2M be inherently better than 1G?  I realize it works for the
*one particular system* that you have a specimen for, but that is not a
sensible approach for architecture.

The problem remains no matter how you slice it; we need a general
solution.  The fact that this system was ever built reflects a number of
critical failures that should be surprising but sadly are not.

-hpa

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 11:51:55PM +0100, Borislav Petkov wrote:
 On Wed, Dec 19, 2012 at 02:25:44PM -0800, H. Peter Anvin wrote:
  The real question is what we can do to mitigate the damage.
 
 Let's try the first thing that comes to mind: waste a variable MTRR on
 it:
 
 [0.00] MTRR variable ranges enabled:
 [0.00]   0 base  mask 8000 write-back
 [0.00]   1 base 8000 mask C000 write-back
 [0.00]   2 base C000 mask F000 write-back
 [0.00]   3 base 0001 mask  write-back
 [0.00]   4 base 0002 mask E000 write-back
 [0.00]   5 base 00022000 mask F000 write-back
 [0.00]   6 disabled
 [0.00]   7 disabled
 
 one of those last two. This is a small box though so I'm guessing on 1T
 boxes those last two won't be disabled. Jacob?

I can check but right, they might be used up. But even if we had slots
available, the memory range that needs to be covered is in large
enough address and aligned in such a way that you cannot cover it with
variable range MTRRs.

 
 -- 
 Regards/Gruss,
 Boris.
 
 Sent from a fat crate under my desk. Formatting is fine.
 --
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 04:55:06PM -0600, Jacob Shin wrote:
 Well, really the problem is with any memory hole above 4GB that is too
 big to be covered by variable range MTRRs as UC.

Why, their PhysBase field is the 40 MSB bits of the physical address.
That should be more than TB.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
 I can check but right, they might be used up. But even if we had slots
 available, the memory range that needs to be covered is in large
 enough address and aligned in such a way that you cannot cover it with
 variable range MTRRs.

Actually, if I'm not mistaken, you only need to cover the HT hole with
one MTRR - the rest remains WB. And in order the mask bits to work, we
could make it a little bigger - we waste some memory but that's nothing
in comparison to the MCE.

You might need to talk to hw guys about the feasibility of this deal
though.

Thanks.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 03:00 PM, Borislav Petkov wrote:
 On Wed, Dec 19, 2012 at 04:55:06PM -0600, Jacob Shin wrote:
 Well, really the problem is with any memory hole above 4GB that is too
 big to be covered by variable range MTRRs as UC.
 
 Why, their PhysBase field is the 40 MSB bits of the physical address.
 That should be more than TB.
 

I presume with too big he really means oddly shaped.

-hpa

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Thu, Dec 20, 2012 at 12:03:29AM +0100, Borislav Petkov wrote:
 On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
  I can check but right, they might be used up. But even if we had slots
  available, the memory range that needs to be covered is in large
  enough address and aligned in such a way that you cannot cover it with
  variable range MTRRs.
 
 Actually, if I'm not mistaken, you only need to cover the HT hole with
 one MTRR - the rest remains WB. And in order the mask bits to work, we
 could make it a little bigger - we waste some memory but that's nothing
 in comparison to the MCE.

Actually all memory hole above 4GB and under TOM2 needs to be marked
as UC, if the kernel just blanket calls init_memory_mapping from 4GB
to top of memory.

Right, we would be losing memory, and I think depending on the
alignment of the boundary and how many MTRRs you have available to use,
significant chunks of memory could be lost. I need to go refresh on
how variable range MTRRs are programmed, it has been a while.

 
 You might need to talk to hw guys about the feasibility of this deal
 though.
 
 Thanks.
 
 -- 
 Regards/Gruss,
 Boris.
 
 Sent from a fat crate under my desk. Formatting is fine.
 --
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 03:03 PM, Borislav Petkov wrote:
 On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
 I can check but right, they might be used up. But even if we had slots
 available, the memory range that needs to be covered is in large
 enough address and aligned in such a way that you cannot cover it with
 variable range MTRRs.
 
 Actually, if I'm not mistaken, you only need to cover the HT hole with
 one MTRR - the rest remains WB. And in order the mask bits to work, we
 could make it a little bigger - we waste some memory but that's nothing
 in comparison to the MCE.
 
 You might need to talk to hw guys about the feasibility of this deal
 though.
 

Just make the hole a bit bigger, so it starts at 0xfc00000000, then you
only need one MTRR.  This is the correct BIOS-level fix, and it really
needs to happen.

Do these systems actually exist in the field or are they engineering
prototypes?  In the latter case, we might be done at that point.

Really, though, AMD should have added a TOM3 for memory above the 1T
mark since they should have been able to see a 1T hole coming from the
design of HyperTransport.  This would be the correct hardware-level fix,
but I don't expect that to happen.

Now, calming down a little bit, we are definitely dealing with BIOS
engineers and so f*ckups are going to happen, again and again.  The
question is what to do about it.

The only truly safe option is to limit early mappings to 4K pages.
This is highly undesirable for a bunch of reasons.  Reducing mapping
granularity to 2M rather than 1G (what Yinghai is proposing) does reduce
the exposure somewhat; it would be interesting to gather trap statistics
and try to get a feel for if this actually changes the boot time
measurably or not.

The other bit is that building the real kernel page tables iteratively
(ignoring the early page tables here) is safer, since the real page
table builder is fully aware of the memory map.  This means any
spillover from the early page tables gets minimized to regions where
there are data objects that have to be accessed early.  Since Yinghai
already had iterative page table building working, I don't see any
reason to not use that capability.

Thoughts?

-hpa

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 02:55 PM, Jacob Shin wrote:
 
 Well, really the problem is with any memory hole above 4GB that is too
 big to be covered by variable range MTRRs as UC. Because the kernel
 use to just simply do init_memory_mapping for 4GB ~ top of memory,
 any memory hole above 4GB are marked as WB in PATs.
 
 How is this handled in Intel architecture? If there are memory holes
 that are too big to be covered by variable range MTRRs as UC, are
 there other MTRR like CPU registers that the BIOS programs?
 

Intel CPUs don't have the TOM augmentation to the MTRR mechanism, and so
MTRRs need to explicitly enable caching of memory rather than the other
way around.

-hpa

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 03:17:59PM -0800, H. Peter Anvin wrote:
 I presume with too big he really means oddly shaped.

Yeah, that's why it could be enlarged a little in order to adjust it to
the MTRR scheme. This is what the BKDG says about it:

PhysMask and PhysBase are used together to determine whether a target
physical-address falls within the specified address range. PhysMask
is logically ANDed with PhysBase and separately ANDed with the upper
40 bits of the target physical-address. If the results of the two
operations are identical, the target physical-address falls within the
specified memory range. The pseudo-code for the operation is:

MaskBase = PhysMask AND PhysBase
MaskTarget = PhysMask AND Target_Address[51:12]
IF MaskBase == MaskTarget
target address is in range
ELSE
target address is not in range

And then there are the alignment requirements:

* The boundary on which a variable range is aligned must be equal to the
range size. For example, a memory range of 16 Mbytes must be aligned on
a 16-Mbyte boundary.

* The range size must be a power of 2 (2^n, 52 > n > 11), with a minimum
allowable size of 4 Kbytes. For example, 4 Mbytes and 8 Mbytes are
allowable memory range sizes, but 6 Mbytes is not allowable.

and then some examples about how to calculate those values.

Jacob, if you still have the system, you might try to experiment with
that, provided there are some variable MTRRs free, of course. And also
provided, there's nothing else in the hw stopping us from doing that.

Thanks.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:30 PM, Borislav Petkov wrote:

On Wed, Dec 19, 2012 at 03:17:59PM -0800, H. Peter Anvin wrote:

I presume with too big he really means oddly shaped.


Yeah, that's why it could be enlarged a little in order to adjust it to
the MTRR scheme. This is what the BKDG says about it:



Yes, they should just cap the hole a few megabytes short and put a UC 
MTRR at 0xfc00000000.  That should happen regardless... this system is 
dangerous without it.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 03:22:13PM -0800, H. Peter Anvin wrote:

[ … ]

 Now, calming down a little bit, we are definitely dealing with BIOS
 engineers and so f*ckups are going to happen, again and again.

Yeppers.

 The only truly safe option is to limit early mappings to 4K pages.
 This is highly undesirable for a bunch of reasons.  Reducing mapping
 granularity to 2M rather than 1G (what Yinghai is proposing) does reduce
 the exposure somewhat; it would be interesting to gather trap statistics
 and try to get a feel for if this actually changes the boot time
 measurably or not.

This is done on the BSP, right? So we can measure how long it takes
by taking TSC values of start and end.

 The other bit is that building the real kernel page tables iteratively
 (ignoring the early page tables here) is safer, since the real page
 table builder is fully aware of the memory map.  This means any
 spillover from the early page tables gets minimized to regions where
 there are data objects that have to be accessed early.

That shouldn't be a lot, relatively speaking.

 Since Yinghai already had iterative page table building working, I
 don't see any reason to not use that capability.
 
 Thoughts?

Sounds doable but we should take a hard look at the patches so that we
don't miss anything.

Also, I don't know how stuff like that would be approached for a wider
testing - I mean, it is a serious change in x86 boot code and there will
be issues.

Hmm.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Yinghai Lu
On Wed, Dec 19, 2012 at 3:22 PM, H. Peter Anvin h...@zytor.com wrote:
 The other bit is that building the real kernel page tables iteratively
 (ignoring the early page tables here) is safer, since the real page
 table builder is fully aware of the memory map.  This means any
 spillover from the early page tables gets minimized to regions where
 there are data objects that have to be accessed early.  Since Yinghai
 already had iterative page table building working, I don't see any
 reason to not use that capability.

that is v6, right?

including that patch

---

Subject: [PATCH] x86, 64bit: Set extra ident mapping for whole kernel range

Currently, when the kernel is loaded above 1G, only [_text, _text+2M] is set
up with extra ident page table.
That is not enough, some variables that could be used early are out of
that range, like BRK for early page table.
Need to set map for [_text, _end] include text/data/bss/brk...

Also current kernel is not allowed to be loaded above 512g, it thinks
that address is too big.
We need to add one extra spare page for level3 to point that 512g range.
Need to check _text range and set level4 pg with that spare level3 page,
and set level3 with level2 page to cover [_text, _end] with extra mapping.

At last, to handle crossing GB boundary, we need to add another
level2 spare page. To handle crossing 512GB boundary, we need to
add another level3 spare page to next 512G range.

Test on with kexec-tools with local test code to force loading kernel
cross 1G, 5G, 512g, 513g.

We need this to put relocatable 64bit bzImage high above 1g.

-v4: add crossing GB boundary handling.
-v5: use spare pages from BRK, so could save pages when kernel is not
loaded above 1GB.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 03:22:13PM -0800, H. Peter Anvin wrote:
 On 12/19/2012 03:03 PM, Borislav Petkov wrote:
  On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:
  I can check but right, they might be used up. But even if we had slots
  available, the memory range that needs to be covered is in large
  enough address and aligned in such a way that you cannot cover it with
  variable range MTRRs.
  
  Actually, if I'm not mistaken, you only need to cover the HT hole with
  one MTRR - the rest remains WB. And in order the mask bits to work, we
  could make it a little bigger - we waste some memory but that's nothing
  in comparison to the MCE.
  
  You might need to talk to hw guys about the feasibility of this deal
  though.
  
 
 Just make the hole a bit bigger, so it starts at 0xfc00000000, then you
 only need one MTRR.  This is the correct BIOS-level fix, and it really
 needs to happen.
 
 Do these systems actually exist in the field or are they engineering
 prototypes?  In the latter case, we might be done at that point.

Yes, HP is shipping (or will ship soon) such systems.

 
 Really, though, AMD should have added a TOM3 for memory above the 1T
 mark since they should have been able to see a 1T hole coming from the
 design of HyperTransport.  This would be the correct hardware-level fix,
 but I don't expect that to happen.
 

I'll feed this conversation back to our hardware folks, but yes we
still need to handle today's systems.

 Now, calming down a little bit, we are definitely dealing with BIOS
 engineers and so f*ckups are going to happen, again and again.  The
 question is what to do about it.
 
 The only truly safe option is to limit early mappings to 4K pages.
 This is highly undesirable for a bunch of reasons.  Reducing mapping
 granularity to 2M rather than 1G (what Yinghai is proposing) does reduce
 the exposure somewhat; it would be interesting to gather trap statistics
 and try to get a feel for if this actually changes the boot time
 measurably or not.
 
 The other bit is that building the real kernel page tables iteratively
 (ignoring the early page tables here) is safer, since the real page
 table builder is fully aware of the memory map.  This means any
 spillover from the early page tables gets minimized to regions where
 there are data objects that have to be accessed early.  Since Yinghai
 already had iterative page table building working, I don't see any
 reason to not use that capability.

Yes, I'll test again with latest, but Yinghai's patchset mapping only
RAM from top down solved our problem.

Thanks,

 
 Thoughts?
 
   -hpa
 
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:40 PM, Yinghai Lu wrote:

On Wed, Dec 19, 2012 at 3:22 PM, H. Peter Anvin h...@zytor.com wrote:

The other bit is that building the real kernel page tables iteratively
(ignoring the early page tables here) is safer, since the real page
table builder is fully aware of the memory map.  This means any
spillover from the early page tables gets minimized to regions where
there are data objects that have to be accessed early.  Since Yinghai
already had iterative page table building working, I don't see any
reason to not use that capability.


that is v6, right?

including that patch



No, that's just a different way to create the early page tables (and it 
doesn't solve anything, quite on the contrary.)  I'm talking about the 
strategy for creating the *permanent* page tables


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Yinghai Lu
On Wed, Dec 19, 2012 at 3:40 PM, Jacob Shin jacob.s...@amd.com wrote:
 On Wed, Dec 19, 2012 at 03:22:13PM -0800, H. Peter Anvin wrote:
 The other bit is that building the real kernel page tables iteratively
 (ignoring the early page tables here) is safer, since the real page
 table builder is fully aware of the memory map.  This means any
 spillover from the early page tables gets minimized to regions where
 there are data objects that have to be accessed early.  Since Yinghai
 already had iterative page table building working, I don't see any
 reason to not use that capability.

 Yes, I'll test again with latest, but Yinghai's patchset mapping only
 RAM from top down solved our problem.

that is for-x86-mm or tip:x86/mm2

we are talking about for-x86-boot, and it will allow kernel to be loaded above 4G
to solve the kdump problem.

so early map will have two way
1. extend head_64.S to cover kernel instead of just [0, 1G)
2. or peter's #PF handler version patch to set pg table dynamically.
it could cover 1G when PF happens.

Yinghai
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Yinghai Lu
On Wed, Dec 19, 2012 at 3:43 PM, H. Peter Anvin h...@zytor.com wrote:
 On 12/19/2012 03:40 PM, Yinghai Lu wrote:

 On Wed, Dec 19, 2012 at 3:22 PM, H. Peter Anvin h...@zytor.com wrote:

 The other bit is that building the real kernel page tables iteratively
 (ignoring the early page tables here) is safer, since the real page
 table builder is fully aware of the memory map.  This means any
 spillover from the early page tables gets minimized to regions where
 there are data objects that have to be accessed early.  Since Yinghai
 already had iterative page table building working, I don't see any
 reason to not use that capability.


 that is v6, right?

 including that patch


 No, that's just a different way to create the early page tables (and it
 doesn't solve anything, quite on the contrary.)  I'm talking about the
 strategy for creating the *permanent* page tables


i'm confused. permanent one is in tip/x86/mm2 right?

for for-x86-boot:
so you want v7 plus attached patch ? that change to 2M per PF.

Yinghai


fix_hpa_pe_pgt.patch
Description: Binary data


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:40 PM, Jacob Shin wrote:


Just make the hole a bit bigger, so it starts at 0xfc00000000, then you
only need one MTRR.  This is the correct BIOS-level fix, and it really
needs to happen.

Do these systems actually exist in the field or are they engineering
prototypes?  In the latter case, we might be done at that point.


Yes, HP is shipping (or will ship soon) such systems.



Can you get them to fix the BIOS first, or at least ship a BIOS update? 
 Otherwise there will be a probabilistic failure, and it sounds like it 
is your (AMD's) fault.



The other bit is that building the real kernel page tables iteratively
(ignoring the early page tables here) is safer, since the real page
table builder is fully aware of the memory map.  This means any
spillover from the early page tables gets minimized to regions where
there are data objects that have to be accessed early.  Since Yinghai
already had iterative page table building working, I don't see any
reason to not use that capability.


Yes, I'll test again with latest, but Yinghai's patchset mapping only
RAM from top down solved our problem.


Please don't make me go Steve Ballmer on you.

We're talking about two different things... the early page tables versus 
the permanent page tables.  The permanent page tables we can handle 
because the page table creation at that point is aware of the memory map.


The early page tables are what is used before we get to that point. 
Creating them on demand means that if there are no early-needed data 
structures near the hole, there will be no access and everything will be 
okay, but the early page table creation *is not and cannot be* aware 
of the memory map.  Right now that simply cannot happen, because all 
such data structures are confined to 32-bit addresses, however *THAT 
WILL CHANGE AND WILL CHANGE SOON*, exactly because these kinds of 
large-memory system needs that to happen.  You may start seeing failures 
at that time, and there isn't a huge lot we can do about it.


We are trying to discuss mitigation strategies with you, but you haven't 
really given us any useful information, e.g. what happens near the 
various boundaries of the hole, what could trigger prefetching into the 
range, and what it would take to fix the BIOSes.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 03:50:14PM -0800, H. Peter Anvin wrote:
 We are trying to discuss mitigation strategies with you, but you
 haven't really given us any useful information, e.g. what happens near
 the various boundaries of the hole, what could trigger prefetching into
 the range, and what it would take to fix the BIOSes.

Another thing we could do (I admit it is ugly) is to add a quirk to the
#MC handler and detect that specific condition by looking at the address
reported in MCi_ADDR and exit early by not panicking the system.

Again, this is ugly but a possibility, still.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:21 PM, Jacob Shin wrote:

On Thu, Dec 20, 2012 at 12:03:29AM +0100, Borislav Petkov wrote:

On Wed, Dec 19, 2012 at 04:59:41PM -0600, Jacob Shin wrote:

I can check but right, they might be used up. But even if we had slots
available, the memory range that needs to be covered is in large
enough address and aligned in such a way that you cannot cover it with
variable range MTRRs.


Actually, if I'm not mistaken, you only need to cover the HT hole with
one MTRR - the rest remains WB. And in order the mask bits to work, we
could make it a little bigger - we waste some memory but that's nothing
in comparison to the MCE.


Actually all memory hole above 4GB and under TOM2 needs to be marked
as UC, if the kernel just blanket calls init_memory_mapping from 4GB
to top of memory.

Right, we would be losing memory, and I think depending on the
alignment of the boundary and how many MTRRs you have available to use,
significant chunks of memory could be lost. I need to go refresh on
how variable range MTRRs are programmed, it has been a while.



In this particular case an MTRR at 0xe000000000 would lose 896 MB of 
RAM, or just under 0.1% of the total.


If it is only the HT region that causes trouble and not the rest of the 
hole you could just plant an MTRR at 0xfc00000000 and not lose any 
memory at all.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:55 PM, Borislav Petkov wrote:

On Wed, Dec 19, 2012 at 03:50:14PM -0800, H. Peter Anvin wrote:

We are trying to discuss mitigation strategies with you, but you
haven't really given us any useful information, e.g. what happens near
the various boundaries of the hole, what could trigger prefetching into
the range, and what it would take to fix the BIOSes.


Another thing we could do (I admit it is ugly) is to add a quirk to the
#MC handler and detect that specific condition by looking at the address
reported in MCi_ADDR and exit early by not panicking the system.

Again, this is ugly but a possibility, still.



I would really, really hate to have to deal with an early MCE handler, too.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 03:40 PM, Borislav Petkov wrote:


This is done on the BSP, right? So we can measure it how long it takes
by taking TSC values of start and end.



Yes, and we can count the number of #PF traps cheaply enough.  It would 
be interesting to put a counter on the number of #PFs and the number of 
resets and read them out on a large-system boot.




Sounds doable but we should take a hard look at the patches so that we
don't miss anything.

Also, I don't know how stuff like that would be approached for a wider
testing - I mean, it is a serious change in x86 boot code and there will
be issues.



The goal should be to have this into -tip and -next by the middle of 
January in order to make the 3.9 merge window, I think.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 03:50:14PM -0800, H. Peter Anvin wrote:
 On 12/19/2012 03:40 PM, Jacob Shin wrote:
 
 Just make the hole a bit bigger, so it starts at 0xfc00000000, then you
 only need one MTRR.  This is the correct BIOS-level fix, and it really
 needs to happen.
 
 Do these systems actually exist in the field or are they engineering
 prototypes?  In the latter case, we might be done at that point.
 
 Yes, HP is shipping (or will ship soon) such systems.
 
 
 Can you get them to fix the BIOS first, or at least ship a BIOS
 update?  Otherwise there will be a probabilistic failure, and it
 sounds like it is your (AMD's) fault.
 
 The other bit is that building the real kernel page tables iteratively
 (ignoring the early page tables here) is safer, since the real page
 table builder is fully aware of the memory map.  This means any
 spillover from the early page tables gets minimized to regions where
 there are data objects that have to be accessed early.  Since Yinghai
 already had iterative page table building working, I don't see any
 reason to not use that capability.
 
 Yes, I'll test again with latest, but Yinghai's patchset mapping only
 RAM from top down solved our problem.
 
 Please don't make me go Steve Ballmer on you.
 
 We're talking about two different things... the early page tables
 versus the permanent page tables.  The permanent page tables we can
 handle because the page table creation at that point is aware of the
 memory map.

Ah okay,

 
 The early page tables are what is used before we get to that point.
 Creating them on demand means that if there are no early-needed data
 structures near the hole, there will be no access and everything
 will be okay, but as the early page table creation *is not and
 cannot be* aware of the memory map.  Right now that simply cannot
 happen, because all such data structures are confined to 32-bit
 addresses, however *THAT WILL CHANGE AND WILL CHANGE SOON*, exactly
 because these kinds of large-memory system needs that to happen.
 You may start seeing failures at that time, and there isn't a huge
 lot we can do about it.
 
 We are trying to discuss mitigation strategies with you, but you
 haven't really given us any useful information, e.g. what happens
 near the various boundaries of the hole, what could trigger
 prefetching into the range, and what it would take to fix the BIOSes.

From what I remember, accessing memory around the memory hole (not
just the HT hole, but e038000000 ~ 10000000000 on our mentioned system)
generated prefetches because the memory hole was marked as WB in PAT.

I'll take a look at the system again, try the blanket MTRR covering
0xe000000000 ~ 1TB, and talk to our BIOS guys.

 
   -hpa
 
 -- 
 H. Peter Anvin, Intel Open Source Technology Center
 I work for Intel.  I don't speak on their behalf.
 
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Borislav Petkov
On Wed, Dec 19, 2012 at 04:02:25PM -0800, H. Peter Anvin wrote:
 The goal should be to have this into -tip and -next by the middle of
 January in order to make the 3.9 merge window, I think.

...and an easy back-out strategy in case there are too many issues while
testing. Maybe don't merge it into tip/master so that it can be removed
easily, or something to that effect.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin

On 12/19/2012 04:10 PM, Borislav Petkov wrote:

On Wed, Dec 19, 2012 at 04:02:25PM -0800, H. Peter Anvin wrote:

The goal should be to have this into -tip and -next by the middle of
January in order to make the 3.9 merge window, I think.


...and an easy back-out strategy in case there are too many issues while
testing. Maybe don't merge it into tip/master so that it can be removed
easily, or something to that effect.



We keep everything in topic branches; tip:master is a synthetic branch 
which can be regenerated as needed.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 04:07 PM, Jacob Shin wrote:
 
 From what I remember, accessing memory around the memory hole (not
 just the HT hole, but e038000000 ~ 10000000000 on our mentioned system)
 generated prefetches because the memory hole was marked as WB in PAT.
 
 I'll take a look at the system again, try the blanket MTRR covering
 0xe000000000 ~ 1TB, and talk to our BIOS guys.
 

Yes, but do they all #MC (as opposed to, say, fetching all FFs)?

-hpa


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
 On 12/19/2012 04:07 PM, Jacob Shin wrote:
  
  From what I remember, accessing memory around the memory hole (not
  just the HT hole, but e038000000 ~ 10000000000 on our mentioned system)
  generated prefetches because the memory hole was marked as WB in PAT.
  
  I'll take a look at the system again, try the blanket MTRR covering
  0xe000000000 ~ 1TB, and talk to our BIOS guys.
  
 
 Yes, but do they all #MC (as opposed to, say, fetching all FFs)?

Yes, MCE every time and it was fatal.

 
   -hpa
 
 
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 04:29 PM, Jacob Shin wrote:
 On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
 On 12/19/2012 04:07 PM, Jacob Shin wrote:

 From what I remember, accessing memory around the memory hole (not
 just the HT hole, but e038000000 ~ 10000000000 on our mentioned system)
 generated prefetches because the memory hole was marked as WB in PAT.

 I'll take a look at the system again, try the blanket MTRR covering
 0xe000000000 ~ 1TB, and talk to our BIOS guys.


 Yes, but do they all #MC (as opposed to, say, fetching all FFs)?
 
 Yes, MCE every time and it was fatal.
 

So regardless of address.  Bother.

-hpa


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 04:29 PM, Jacob Shin wrote:
 On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
 On 12/19/2012 04:07 PM, Jacob Shin wrote:

 From what I remember, accessing memory around the memory hole (not
 just the HT hole, but e038000000 ~ 10000000000 on our mentioned system)
 generated prefetches because the memory hole was marked as WB in PAT.

 I'll take a look at the system again, try the blanket MTRR covering
 0xe000000000 ~ 1TB, and talk to our BIOS guys.


 Yes, but do they all #MC (as opposed to, say, fetching all FFs)?
 
 Yes, MCE every time and it was fatal.
 

OK, one more question... there is something odd with the memory ranges here:

 BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
 BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
 BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable

The first usable range here is 4G to 896G + 896M which is an awfully
strange number.  Similarly, the second range is 1T to 1T + 128G - 16M.
The little fiddly bits imply that there is either overshoot of some sort
going on -- possibly reserved memory -- or these are fairly arbitrary
sizes that don't match any physical bank sizes in which case it should
be possible to shuffle it differently...

-hpa

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread Jacob Shin
On Wed, Dec 19, 2012 at 06:37:45PM -0800, H. Peter Anvin wrote:
 On 12/19/2012 04:29 PM, Jacob Shin wrote:
  On Wed, Dec 19, 2012 at 04:24:09PM -0800, H. Peter Anvin wrote:
  On 12/19/2012 04:07 PM, Jacob Shin wrote:
 
  From what I remember, accessing memory around the memory hole (not
  just the HT hole, but e038000000 ~ 10000000000 on our mentioned system)
  generated prefetches because the memory hole was marked as WB in PAT.
 
  I'll take a look at the system again, try the blanket MTRR covering
  0xe000000000 ~ 1TB, and talk to our BIOS guys.
 
 
  Yes, but do they all #MC (as opposed to, say, fetching all FFs)?
  
  Yes, MCE every time and it was fatal.
  
 
 OK, one more question... there is something odd with the memory ranges here:
 
  BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
  BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
  BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable
 
 The first usable range here is 4G to 896G + 896M which is an awfully
 strange number.  Similarly, the second range is 1T to 1T + 128G - 16M.
 The little fiddly bits imply that there is either overshoot of some sort
 going on -- possibly reserved memory -- or these are fairly arbitrary
 sizes that don't match any physical bank sizes in which case it should
 be possible to shuffle it differently...

Not exactly sure why the weird boundaries, I'll have to ask the BIOS
side folks to be sure. But if I were to guess ..

Here is the NUMA spew out, physically there is 128 GB connected to
each memory controller node. The PCI MMIO region starts at 0xc8000000.
4 GB - 0xc8000000 = 0x38000000 (896 MB). So we lose 896 MB due to PCI
MMIO hole, so the first node ends at 128 GB + 896 MB to talk to all of
128 GB off of the first memory controller, and hence the weird 896 MB
offset.

[0.00] SRAT: Node 0 PXM 0 0-a
[0.00] SRAT: Node 0 PXM 0 10-c800
[0.00] SRAT: Node 0 PXM 0 1-203800
[0.00] SRAT: Node 1 PXM 1 203800-403800
[0.00] SRAT: Node 2 PXM 2 403800-603800
[0.00] SRAT: Node 3 PXM 3 603800-803800
[0.00] SRAT: Node 4 PXM 4 803800-a03800
[0.00] SRAT: Node 5 PXM 5 a03800-c03800
[0.00] SRAT: Node 6 PXM 6 c03800-e03800
[0.00] SRAT: Node 7 PXM 7 100-11fff00
[0.00] NUMA: Initialized distance table, cnt=8
[0.00] NUMA: Node 0 [0,a) + [10,c800) - [0,c800)
[0.00] NUMA: Node 0 [0,c800) + [1,203800) - 
[0,203800)
[0.00] Initmem setup node 0 -00203800
[0.00]   NODE_DATA [002037ff5000 - 002037ff]
[0.00] Initmem setup node 1 00203800-00403800
[0.00]   NODE_DATA [004037ff5000 - 004037ff]
[0.00] Initmem setup node 2 00403800-00603800
[0.00]   NODE_DATA [006037ff5000 - 006037ff]
[0.00] Initmem setup node 3 00603800-00803800
[0.00]   NODE_DATA [008037ff5000 - 008037ff]
[0.00] Initmem setup node 4 00803800-00a03800
[0.00]   NODE_DATA [00a037ff5000 - 00a037ff]
[0.00] Initmem setup node 5 00a03800-00c03800
[0.00]   NODE_DATA [00c037ff5000 - 00c037ff]
[0.00] Initmem setup node 6 00c03800-00e03800
[0.00]   NODE_DATA [00e037ff2000 - 00e037ffcfff]
[0.00] Initmem setup node 7 0100-011fff00
[0.00]   NODE_DATA [011ffeff1000 - 011ffeffbfff]
[0.00] Zone PFN ranges:
[0.00]   DMA  0x0010 - 0x1000
[0.00]   DMA320x1000 - 0x0010
[0.00]   Normal   0x0010 - 0x11fff000
[0.00] Movable zone start PFN for each node
[0.00] early_node_map[10] active PFN ranges
[0.00] 0: 0x0010 - 0x0099
[0.00] 0: 0x0100 - 0x000c7ec0
[0.00] 0: 0x0010 - 0x02038000
[0.00] 1: 0x02038000 - 0x04038000
[0.00] 2: 0x04038000 - 0x06038000
[0.00] 3: 0x06038000 - 0x08038000
[0.00] 4: 0x08038000 - 0x0a038000
[0.00] 5: 0x0a038000 - 0x0c038000
[0.00] 6: 0x0c038000 - 0x0e038000
[0.00] 7: 0x1000 - 0x11fff000
[0.00] On node 0 totalpages: 33553993
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 5 pages reserved
[0.00]   DMA zone: 3916 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 800504 pages, LIFO batch:31
[0.00]   Normal zone: 447552 pages used for memmap
[0.00]   Normal zone: 32287680 pages, LIFO batch:31
[0.00] On node 1 totalpages: 33554432
[0.00]   Normal zone: 458752 pages used for memmap
[0.00]   Normal zone: 33095680 pages, LIFO 

Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-19 Thread H. Peter Anvin
On 12/19/2012 08:16 PM, Jacob Shin wrote:
 
 Not exactly sure why the weird boundaries, I'll have to ask the BIOS
 side folks to be sure. But if I were to guess ..
 
 Here is the NUMA spew out, physically there is 128 GB connected to
 each memory controller node. The PCI MMIO region starts at 0xc8000000.
 4 GB - 0xc8000000 = 0x38000000 (896 MB). So we lose 896 MB due to PCI
 MMIO hole, so the first node ends at 128 GB + 896 MB to talk to all of
 128 GB off of the first memory controller, and hence the weird 896 MB
 offset.
 

It would obviously be better if the slack were at the end of the total
memory, instead of at the end of the < 1T range.  If the PCI MMIO hole were a
power of 2 (e.g. 1G) that would also reduce the likelihood of problems
and reduce MTRR pressure.

-hpa

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
Jan,

Can you check if attached patch is going to break KGDB?

Thanks

Yinghai


move_down_early_trap_init.patch
Description: Binary data


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
On Mon, Dec 17, 2012 at 5:11 PM, Yinghai Lu  wrote:
> On Mon, Dec 17, 2012 at 3:26 PM, Yinghai Lu  wrote:
>> On Mon, Dec 17, 2012 at 3:11 PM, H. Peter Anvin  wrote:
>>> On 12/17/2012 02:47 PM, Yinghai Lu wrote:


 Peter, can you check that branch again?

 I moved the early_trap_init after init_mem_mapping.
 so for 64bit native, init_mem_mapping will setup page table for ram from
 blank.

>>>
>>> Looks better, at first glance at least.  There are a couple of unnecessary
>>> changes (the counter in head_64.S cannot exceed 32 bits once computed, so
>>> the change from %rcx to %ecx change is pointless.)
>>
>> ok,  return to use %ecx
>>
>>>
>>> There is another bug in my patch: it either needs to mask off the NX bit if
>>> we are running on non-NX-enabled hardware, or it needs to not set the NX bit
>>> (which is mostly okay that early on, I suspect.)
>>
>> i test that in kvm guest, and westmere, current version seem ok.
>>
>> will repost the patchset to list to get more review.
>>
>
> not sure if i could move that early_trap_init down.
>
> jason,
>
> We need to move down early_trap_init after init_memory_mapping to use
> early #PF handler to set page table.
>
> So can we do that? for kgdb it is that ok to move it down?

adding to Jan.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
On Mon, Dec 17, 2012 at 3:26 PM, Yinghai Lu  wrote:
> On Mon, Dec 17, 2012 at 3:11 PM, H. Peter Anvin  wrote:
>> On 12/17/2012 02:47 PM, Yinghai Lu wrote:
>>>
>>>
>>> Peter, can you check that branch again?
>>>
>>> I moved the early_trap_init after init_mem_mapping.
>>> so for 64bit native, init_mem_mapping will setup page table for ram from
>>> blank.
>>>
>>
>> Looks better, at first glance at least.  There are a couple of unnecessary
>> changes (the counter in head_64.S cannot exceed 32 bits once computed, so
>> the change from %rcx to %ecx change is pointless.)
>
> ok,  return to use %ecx
>
>>
>> There is another bug in my patch: it either needs to mask off the NX bit if
>> we are running on non-NX-enabled hardware, or it needs to not set the NX bit
>> (which is mostly okay that early on, I suspect.)
>
> i test that in kvm guest, and westmere, current version seem ok.
>
> will repost the patchset to list to get more review.
>

not sure if i could move that early_trap_init down.

jason,

We need to move down early_trap_init after init_memory_mapping to use
early #PF handler to set page table.

So can we do that? for kgdb it is that ok to move it down?

or can we just move
set_intr_gate(X86_TRAP_PF, &page_fault)
back to trap_init?

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
On Mon, Dec 17, 2012 at 3:11 PM, H. Peter Anvin  wrote:
> On 12/17/2012 02:47 PM, Yinghai Lu wrote:
>>
>>
>> Peter, can you check that branch again?
>>
>> I moved the early_trap_init after init_mem_mapping.
>> so for 64bit native, init_mem_mapping will setup page table for ram from
>> blank.
>>
>
> Looks better, at first glance at least.  There are a couple of unnecessary
> changes (the counter in head_64.S cannot exceed 32 bits once computed, so
> the change from %rcx to %ecx change is pointless.)

ok,  return to use %ecx

>
> There is another bug in my patch: it either needs to mask off the NX bit if
> we are running on non-NX-enabled hardware, or it needs to not set the NX bit
> (which is mostly okay that early on, I suspect.)

i test that in kvm guest, and westmere, current version seem ok.

will repost the patchset to list to get more review.

:
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread H. Peter Anvin

On 12/17/2012 02:47 PM, Yinghai Lu wrote:


Peter, can you check that branch again?

I moved the early_trap_init after init_mem_mapping.
so for 64bit native, init_mem_mapping will setup page table for ram from blank.



Looks better, at first glance at least.  There are a couple of 
unnecessary changes (the counter in head_64.S cannot exceed 32 bits once 
computed, so the change from %rcx to %ecx change is pointless.)


There is another bug in my patch: it either needs to mask off the NX bit 
if we are running on non-NX-enabled hardware, or it needs to not set the 
NX bit (which is mostly okay that early on, I suspect.)


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
On Sun, Dec 16, 2012 at 12:50 AM, Yinghai Lu  wrote:
> On Sat, Dec 15, 2012 at 9:17 PM, Yinghai Lu  wrote:
>> On Sat, Dec 15, 2012 at 6:09 PM, Yinghai Lu  wrote:
>>> On Sat, Dec 15, 2012 at 1:40 PM, H. Peter Anvin  wrote:
 On 12/15/2012 12:55 PM, Yinghai Lu wrote:
>
> BTW, did you look at smp boot problem with early_level4_pgt version?


 No, I have been busy with non-Linux stuff today.

>>>
>>> ok, i sorted it out. I will split it to small pieces and post them.
>>
>> I updated for-x86-boot branch with it, and it is based on
>> linus:master
>> tip:x86/mm
>> tip:x86/urgent
>> tip:x86/mm2.
>>
>> also attach 7 new ones are just added to that branch.
>>
> just updated the branch to fix compiling problem that was found by
> Fengguang's kbuild test robot.
>
> git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
> for-x86-boot

Peter, can you check that branch again?

I moved the early_trap_init after init_mem_mapping.
so for 64bit native, init_mem_mapping will setup page table for ram from blank.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
On Sun, Dec 16, 2012 at 12:50 AM, Yinghai Lu ying...@kernel.org wrote:
 On Sat, Dec 15, 2012 at 9:17 PM, Yinghai Lu ying...@kernel.org wrote:
 On Sat, Dec 15, 2012 at 6:09 PM, Yinghai Lu ying...@kernel.org wrote:
 On Sat, Dec 15, 2012 at 1:40 PM, H. Peter Anvin h...@zytor.com wrote:
 On 12/15/2012 12:55 PM, Yinghai Lu wrote:

 BTW, did you look at smp boot problem with early_level4_pgt version?


 No, I have been busy with non-Linux stuff today.


 ok, i sorted it out. I will split it to small pieces and post them.

 I updated for-x86-boot branch with it, and it is based on
 linus:master
 tip:x86/mm
 tip:x86/urgent
 tip:x86/mm2.

 also attach 7 new ones are just added to that branch.

 just updated the branch to fix compiling problem that was found by
 Fengguang's kbuild test robot.

 git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
 for-x86-boot

Peter, can you check that branch again?

I moved the early_trap_init after init_mem_mapping.
so for 64bit native, init_mem_mapping will setup page table for ram from blank.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread H. Peter Anvin

On 12/17/2012 02:47 PM, Yinghai Lu wrote:


Peter, can you check that branch again?

I moved the early_trap_init after init_mem_mapping.
so for 64bit native, init_mem_mapping will setup page table for ram from blank.



Looks better, at first glance at least.  There are a couple of 
unnecessary changes (the counter in head_64.S cannot exceed 32 bits once 
computed, so the change from %rcx to %ecx change is pointless.)


There is another bug in my patch: it either needs to mask off the NX bit 
if we are running on non-NX-enabled hardware, or it needs to not set the 
NX bit (which is mostly okay that early on, I suspect.)


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
On Mon, Dec 17, 2012 at 3:11 PM, H. Peter Anvin h...@zytor.com wrote:
 On 12/17/2012 02:47 PM, Yinghai Lu wrote:


 Peter, can you check that branch again?

 I moved the early_trap_init after init_mem_mapping.
 so for 64bit native, init_mem_mapping will setup page table for ram from
 blank.


 Looks better, at first glance at least.  There are a couple of unnecessary
 changes (the counter in head_64.S cannot exceed 32 bits once computed, so
 the change from %rcx to %ecx change is pointless.)

ok,  return to use %ecx


 There is another bug in my patch: it either needs to mask off the NX bit if
 we are running on non-NX-enabled hardware, or it needs to not set the NX bit
 (which is mostly okay that early on, I suspect.)

i test that in kvm guest, and westmere, current version seem ok.

will repost the patchset to list to get more review.

:
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
On Mon, Dec 17, 2012 at 3:26 PM, Yinghai Lu ying...@kernel.org wrote:
 On Mon, Dec 17, 2012 at 3:11 PM, H. Peter Anvin h...@zytor.com wrote:
 On 12/17/2012 02:47 PM, Yinghai Lu wrote:


 Peter, can you check that branch again?

 I moved the early_trap_init after init_mem_mapping.
 so for 64bit native, init_mem_mapping will setup page table for ram from
 blank.


 Looks better, at first glance at least.  There are a couple of unnecessary
 changes (the counter in head_64.S cannot exceed 32 bits once computed, so
 the change from %rcx to %ecx change is pointless.)

 ok,  return to use %ecx


 There is another bug in my patch: it either needs to mask off the NX bit if
 we are running on non-NX-enabled hardware, or it needs to not set the NX bit
 (which is mostly okay that early on, I suspect.)

 i test that in kvm guest, and westmere, current version seem ok.

 will repost the patchset to list to get more review.


not sure if i could move that early_trap_init down.

jason,

We need to move down early_trap_init after init_memory_mapping to use
early #PF handler to set page table.

So can we do that? for kgdb it is that ok to move it down?

or can we just move
set_intr_gate(X86_TRAP_PF, &page_fault)
back to trap_init?

Thanks

Yinghai
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
On Mon, Dec 17, 2012 at 5:11 PM, Yinghai Lu ying...@kernel.org wrote:
 On Mon, Dec 17, 2012 at 3:26 PM, Yinghai Lu ying...@kernel.org wrote:
 On Mon, Dec 17, 2012 at 3:11 PM, H. Peter Anvin h...@zytor.com wrote:
 On 12/17/2012 02:47 PM, Yinghai Lu wrote:


 Peter, can you check that branch again?

 I moved the early_trap_init after init_mem_mapping.
 so for 64bit native, init_mem_mapping will setup page table for ram from
 blank.


 Looks better, at first glance at least.  There are a couple of unnecessary
 changes (the counter in head_64.S cannot exceed 32 bits once computed, so
 the change from %rcx to %ecx change is pointless.)

 ok,  return to use %ecx


 There is another bug in my patch: it either needs to mask off the NX bit if
 we are running on non-NX-enabled hardware, or it needs to not set the NX bit
 (which is mostly okay that early on, I suspect.)

 i test that in kvm guest, and westmere, current version seem ok.

 will repost the patchset to list to get more review.


 not sure if i could move that early_trap_init down.

 jason,

 We need to move down early_trap_init after init_memory_mapping to use
 early #PF handler to set page table.

 So can we do that? for kgdb it is that ok to move it down?

adding to Jan.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-17 Thread Yinghai Lu
Jan,

Can you check if attached patch is going to break KGDB?

Thanks

Yinghai


move_down_early_trap_init.patch
Description: Binary data


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-16 Thread Yinghai Lu
On Sat, Dec 15, 2012 at 9:17 PM, Yinghai Lu  wrote:
> On Sat, Dec 15, 2012 at 6:09 PM, Yinghai Lu  wrote:
>> On Sat, Dec 15, 2012 at 1:40 PM, H. Peter Anvin  wrote:
>>> On 12/15/2012 12:55 PM, Yinghai Lu wrote:

 BTW, did you look at smp boot problem with early_level4_pgt version?
>>>
>>>
>>> No, I have been busy with non-Linux stuff today.
>>>
>>
>> ok, i sorted it out. I will split it to small pieces and post them.
>
> I updated for-x86-boot branch with it, and it is based on
> linus:master
> tip:x86/mm
> tip:x86/urgent
> tip:x86/mm2.
>
> also attach 7 new ones are just added to that branch.
>
just updated the branch to fix compiling problem that was found by
Fengguang's kbuild test robot.

git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
for-x86-boot
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-16 Thread Yinghai Lu
On Sat, Dec 15, 2012 at 9:17 PM, Yinghai Lu <ying...@kernel.org> wrote:
> On Sat, Dec 15, 2012 at 6:09 PM, Yinghai Lu <ying...@kernel.org> wrote:
>> On Sat, Dec 15, 2012 at 1:40 PM, H. Peter Anvin <h...@zytor.com> wrote:
>>> On 12/15/2012 12:55 PM, Yinghai Lu wrote:
>>>>
>>>> BTW, did you look at smp boot problem with early_level4_pgt version?
>>>
>>>
>>> No, I have been busy with non-Linux stuff today.
>>>
>>
>> ok, i sorted it out. I will split it to small pieces and post them.
>
> I updated for-x86-boot branch with it, and it is based on
> linus:master
> tip:x86/mm
> tip:x86/urgent
> tip:x86/mm2.
>
> also attach 7 new ones are just added to that branch.

just updated the branch to fix compiling problem that was found by
Fengguang's kbuild test robot.

git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
for-x86-boot
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-15 Thread Yinghai Lu
On Sat, Dec 15, 2012 at 6:09 PM, Yinghai Lu  wrote:
> On Sat, Dec 15, 2012 at 1:40 PM, H. Peter Anvin  wrote:
>> On 12/15/2012 12:55 PM, Yinghai Lu wrote:
>>>
>>> BTW, did you look at smp boot problem with early_level4_pgt version?
>>
>>
>> No, I have been busy with non-Linux stuff today.
>>
>
> ok, i sorted it out. I will split it to small pieces and post them.

I updated for-x86-boot branch with it, and it is based on
linus:master
tip:x86/mm
tip:x86/urgent
tip:x86/mm2.

Also attached are the 7 new patches that were just added to that branch.

Thanks

Yinghai


0003-x86-call-copy_bootdata-early.patch
Description: Binary data


0004-x86-mm-add-early-kernel-mapping-in-c.patch
Description: Binary data


0005-x86-realmode-use-init_level4_pgt-to-set-trapmoline_p.patch
Description: Binary data


0006-x86-mm-increase-BRK-area-for-early-page-table.patch
Description: Binary data


0007-x86-64bit-early-PF-handler-set-page-table.patch
Description: Binary data


0008-x86-64bit-PF-handler-set-page-to-cover-2M-only.patch
Description: Binary data


0009-x86-64bit-Print-init-kernel-lowmap-correctly.patch
Description: Binary data


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-15 Thread Yinghai Lu
On Sat, Dec 15, 2012 at 1:40 PM, H. Peter Anvin  wrote:
> On 12/15/2012 12:55 PM, Yinghai Lu wrote:
>>
>> BTW, did you look at smp boot problem with early_level4_pgt version?
>
>
> No, I have been busy with non-Linux stuff today.
>

ok, i sorted it out. I will split it to small pieces and post them.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-15 Thread H. Peter Anvin

On 12/15/2012 03:15 PM, Yinghai Lu wrote:


>> That is for the kernel region itself (that code is actually unchanged from
>> the current code), and yes, we could cap that one to _end if there are
>> systems which have bugs in that area.  The dynamic page tables map 1G
>> aligned at a time.
>
>
> dynamic should be 2M too.
>
> AMD system:
>
> http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
>
>  BIOS-e820: [mem 0x0001-0x00e037ff] usable
>  BIOS-e820: [mem 0x00e03800-0x00fc] reserved
>  BIOS-e820: [mem 0x0100-0x011ffeff] usable
>
> the hole is not 1G aligned.
>
> or HT region is from e04000 ?



The HT region starts at 0xfd -- after that reserved region, so I
have no idea what that particular system is trying to do or what its
requirements are (nor what its MTRR setup is, since you didn't post it.)


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-15 Thread Yinghai Lu
On Sat, Dec 15, 2012 at 2:17 PM, H. Peter Anvin  wrote:
> On 12/15/2012 02:13 PM, Yinghai Lu wrote:
>>
>>
>> AMD system could have all mem between TOLM and TOHM all WB, and don
>> need to set them in MTRRs entries.
>>
>
> I include the TOM2 mechanism in the overall umbrella of MTRRs for this
> purpose.
>
>
>> and also your switchover change that handle cross 1G, and 512g, and it
>> is not 1G aligned.
>> for example, if kernel at 4095G+512M, it will map from 4095G+512M to
>> 4096G + 512M.
>
>
> That is for the kernel region itself (that code is actually unchanged from
> the current code), and yes, we could cap that one to _end if there are
> systems which have bugs in that area.  The dynamic page tables map 1G
> aligned at a time.

dynamic should be 2M too.

AMD system:

http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf

 BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
 BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
 BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable

the hole is not 1G aligned.

or HT region is from e04000 ?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-15 Thread H. Peter Anvin

On 12/15/2012 02:13 PM, Yinghai Lu wrote:


> AMD system could have all mem between TOLM and TOHM all WB, and don
> need to set them in MTRRs entries.



I include the TOM2 mechanism in the overall umbrella of MTRRs for this 
purpose.



> and also your switchover change that handle cross 1G, and 512g, and it
> is not 1G aligned.
> for example, if kernel at 4095G+512M, it will map from 4095G+512M to
> 4096G + 512M.


That is for the kernel region itself (that code is actually unchanged 
from the current code), and yes, we could cap that one to _end if there 
are systems which have bugs in that area.  The dynamic page tables map 
1G aligned at a time.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-15 Thread Yinghai Lu
On Sat, Dec 15, 2012 at 1:40 PM, H. Peter Anvin  wrote:
> On 12/15/2012 12:55 PM, Yinghai Lu wrote:
>> Also if we set map too large, could have chance to cover mem hole near
>> 1T for AMD HT system.
>
>
> Again, should not be cachable in the MTRRs, and even so, is 1G aligned
> already.

An AMD system could have all memory between TOLM and TOHM be WB, and it
doesn't need to set that up in MTRR entries.

Also, your switchover change handles crossing 1G and 512G boundaries, and
that mapping is not 1G aligned.
For example, if the kernel is at 4095G+512M, it will map from 4095G+512M to
4096G+512M.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-15 Thread H. Peter Anvin

On 12/15/2012 12:55 PM, Yinghai Lu wrote:

> On Sat, Dec 15, 2012 at 11:30 AM, H. Peter Anvin  wrote:
>>
>> What is the point of only managing 2M at a time?  Now you have to have
>> more conditionals and you don't get any more memory efficiency.
>
>
> We don't need to, because real_data is less than 2M, and ramdisk is about 16M.



In other words, you make magic assumptions (some of which are very wrong 
in many real-life scenarios -- people can and do use gigabyte-plus 
initramfs).  That is exactly the wrong thing to do.  Furthermore it 
doesn't buy you anything, because you still have to allocate the PMDs.



> Also if we set map too large, could have chance to cover mem hole near
> 1T for AMD HT system.


Again, should not be cachable in the MTRRs, and even so, is 1G aligned 
already.



>> Filling arbitrarily into the brk is not acceptable... the brk is an O(1)
>> area and all brk allocations need to be reserved at compile time, so the
>> overflow handling is still necessary.
>
>
> if run out of BRK, we will get panic, because early_make_pgtable will return -1.


And you consider that panic an acceptable failure mode


> and current BRK already have 64 slop space.
>
> BTW, did you look at smp boot problem with early_level4_pgt version?


No, I have been busy with non-Linux stuff today.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

2012-12-15 Thread H. Peter Anvin
The mem hole at 1T should not be marked cachable in the MTRRs.

Yinghai Lu  wrote:

>On Sat, Dec 15, 2012 at 11:30 AM, H. Peter Anvin wrote:
>> What is the point of only managing 2M at a time?  Now you have to have
>> more conditionals and you don't get any more memory efficiency.
>
>We don't need to, because real_data is less than 2M, and ramdisk is
>about 16M.
>
>Also if we set map too large, could have chance to cover mem hole near
>1T for AMD HT system.
>
>>
>> Filling arbitrarily into the brk is not acceptable... the brk is an O(1)
>> area and all brk allocations need to be reserved at compile time, so the
>> overflow handling is still necessary.
>
>if run out of BRK, we will get panic, because early_make_pgtable will
>return -1.
>
>and current BRK already have 64 slop space.
>
>BTW, did you look at smp boot problem with early_level4_pgt version?
>
>Yinghai

-- 
Sent from my mobile phone. Please excuse brevity and lack of formatting.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   3   >