Hello
After working with Jonathan Chew on the subject (by emails),
I discovered that the problem came from the wrong initialization
of the lgrp kernel internal objects
when the ACPI SRAT table is used to populate these objects (this applies to my
system HP proliant with at least 2 opteron processors).
On my server, in the ACPI SRAT table, the proximity domain ids start at 1 (not
0).
when this happens, the lgrp_plat_node_memory records are swapped.
ex :
lgrp_plat_node_memory[]
{
start = 0x80000
end = 0xfffff
exists = 0x1
prox_domain = 0x2
}
{
start = 0
end = 0x7ffff
exists = 0x1
prox_domain = 0x1
}
and mem_node_config[] :
{
exists = 0x1
physbase = 0x80000
physmax = 0xf57f5
}
{
exists = 0x1
physbase = 0
physmax = 0x7ffff
}
When the SRAT table is not processed (which was the case before snv_88),
these objects are initialized using pci registers and give :
lgrp_plat_node_memory[]
{
start = 0
end = 0x7ffff
exists = 0x1
prox_domain = 0x1
}
{
start = 0x80000
end = 0xfffff
exists = 0x1
prox_domain = 0x2
}
and :
mem_node_config[] :
{
exists = 0x1
physbase = 0
physmax = 0x7ffff
}
{
exists = 0x1
physbase = 0x80000
physmax = 0xf57f5
}
With this configuration, the system works properly.
The problem is that access to this table is done with a hash function which is
prox-domain-id modulo number of nodes in system.
On my system, since prox-domain start at 1 :
1%2 = 1
2%2 = 0
So it reverses the order of the node in the table.
I tried the folowing fix in lgrpplat.c :
I changed the macro :
NODE_DOMAIN_HASH(domain, node_cnt) ((domain) % node_cnt)
into :
NODE_DOMAIN_HASH(domain, node_cnt, first_domain) ((domain - first_domain) %
node_cnt)
(full patch in attached file)
With this corrected code, the lgrp objects are initialized in the correct way
(like when the objects are initialized from pci regs).
I don't think this fix has side effect outside lgrpplat.c, since changes are
internal to this file, no interface modified.
If you have a system impacted by this bug, can you test this fix (I can provide
a bfu binary archive if you need it) ?
Thank you
Guy
--
This message posted from opensolaris.org
*** lgrpplat.c_orig Mon Feb 23 19:01:40 2009
--- usr/src/uts/i86pc/os/lgrpplat.c Thu Mar 12 17:09:50 2009
***************
*** 143,149 ****
--- 143,152 ----
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>
+ #include <sys/kobj.h>
+ #include <sys/kobj_lex.h>
+
#include "acpi_fw.h" /* for SRAT and SLIT */
***************
*** 151,156 ****
--- 154,168 ----
#define NLGRP (MAX_NODES * (MAX_NODES - 1) + 1)
/*
+ * boot argument string
+ * allows to disable srat and/or slit acpi tables processing
+ * ( workaround usefull for some class of NUMA server, when acpi data are
unusual )
+ */
+ #define BP_LGRP_SRAT_ENABLE "lgrp-srat-enable"
+ #define BP_LGRP_SLIT_ENABLE "lgrp-slit-enable"
+ #define BP_MAX_STRLEN 80
+
+ /*
* Constants for configuring probing
*/
#define LGRP_PLAT_PROBE_NROUNDS 64 /* default laps for
probing */
***************
*** 168,174 ****
* Hash proximity domain ID into node to domain mapping table using to
minimize
* span of entries used
*/
! #define NODE_DOMAIN_HASH(domain, node_cnt) ((domain) % node_cnt)
/*
--- 180,186 ----
* Hash proximity domain ID into node to domain mapping table using to
minimize
* span of entries used
*/
! #define NODE_DOMAIN_HASH(domain, node_cnt, first_domain)
((domain - first_domain) % node_cnt)
/*
***************
*** 294,299 ****
--- 306,316 ----
uint_t lgrp_plat_node_cnt = 1;
/*
+ * First domain number in system
+ */
+ uint32_t lgrp_plat_min_domain = 0;
+
+ /*
* Configuration Parameters for Probing
* - lgrp_plat_probe_flags Flags to specify enabling probing, probe
* operation, etc.
***************
*** 368,379 ****
static int lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
! uint32_t domain);
static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node);
static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
! int node_cnt, uint32_t domain);
static void lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
lgrp_plat_latency_stats_t *lat_stats,
--- 385,396 ----
static int lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
! uint32_t domain, uint32_t min_domain);
static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node);
static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
! int node_cnt, uint32_t domain, uint32_t min_domain);
static void lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
lgrp_plat_latency_stats_t *lat_stats,
***************
*** 385,395 ****
static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t,
lgrp_mem_query_t);
static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
! int node_cnt, uint32_t domain);
static int lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start,
! uint64_t end, uint32_t domain);
static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
lgrp_plat_probe_mem_config_t *probe_mem_config,
--- 402,412 ----
static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t,
lgrp_mem_query_t);
static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
! int node_cnt, uint32_t domain, uint32_t min_domain);
static int lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start,
! uint64_t end, uint32_t domain, uint32_t min_domain);
static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
lgrp_plat_probe_mem_config_t *probe_mem_config,
***************
*** 683,691 ****
--- 700,731 ----
*/
lgrp_plat_node_cnt = max_mem_nodes = 1;
#else /* __xpv */
+ int boot_prop_len;
uint_t probe_op;
+ char str[BP_MAX_STRLEN];
+ u_longlong_t value;
/*
+ * Get boot property for enabling/disabling SRAT
+ */
+ boot_prop_len=BOP_GETPROPLEN(bootops, BP_LGRP_SRAT_ENABLE);
+ if (boot_prop_len> 0 && boot_prop_len < sizeof (str)) {
+ if (BOP_GETPROP(bootops, BP_LGRP_SRAT_ENABLE, str) >= 0 &&
+ kobj_getvalue(str, &value) != -1) {
+ lgrp_plat_srat_enable = 0;
+ }
+ }
+
+ boot_prop_len=BOP_GETPROPLEN(bootops, BP_LGRP_SLIT_ENABLE);
+ if (boot_prop_len> 0 && boot_prop_len < sizeof (str)) {
+ if (BOP_GETPROP(bootops, BP_LGRP_SLIT_ENABLE, str) >= 0 &&
+ kobj_getvalue(str, &value) != -1) {
+ lgrp_plat_slit_enable = 0;
+ }
+ }
+
+
+ /*
* Initialize as a UMA machine
*/
if (lgrp_topo_ht_limit() == 1) {
***************
*** 703,708 ****
--- 743,749 ----
/*
* Determine which CPUs and memory are local to each other and number
* of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
+ if (lgrp_plat_srat_enable && lgrp_plat_apic_ncpus > 0) {
*/
if (lgrp_plat_apic_ncpus > 0) {
int retval;
***************
*** 1190,1196 ****
*/
static int
lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
! cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
{
uint_t i;
int node;
--- 1231,1237 ----
*/
static int
lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
! cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain,
uint32_t min_domain)
{
uint_t i;
int node;
***************
*** 1198,1207 ****
/*
* Get node number for proximity domain
*/
! node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
if (node == -1) {
node = lgrp_plat_node_domain_update(node_domain, node_cnt,
! domain);
if (node == -1)
return (-1);
}
--- 1239,1248 ----
/*
* Get node number for proximity domain
*/
! node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain,
min_domain);
if (node == -1) {
node = lgrp_plat_node_domain_update(node_domain, node_cnt,
! domain, min_domain);
if (node == -1)
return (-1);
}
***************
*** 1283,1289 ****
*/
static int
lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
! uint32_t domain)
{
uint_t node;
uint_t start;
--- 1324,1330 ----
*/
static int
lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
! uint32_t domain, uint32_t min_domain)
{
uint_t node;
uint_t start;
***************
*** 1293,1304 ****
* search for entry with matching proximity domain ID, and return index
* of matching entry as node ID.
*/
! node = start = NODE_DOMAIN_HASH(domain, node_cnt);
do {
if (node_domain[node].prox_domain == domain &&
node_domain[node].exists)
return (node);
! node = NODE_DOMAIN_HASH(node + 1, node_cnt);
} while (node != start);
return (-1);
}
--- 1334,1345 ----
* search for entry with matching proximity domain ID, and return index
* of matching entry as node ID.
*/
! node = start = NODE_DOMAIN_HASH(domain, node_cnt, min_domain);
do {
if (node_domain[node].prox_domain == domain &&
node_domain[node].exists)
return (node);
! node = NODE_DOMAIN_HASH(node + 1, node_cnt, min_domain);
} while (node != start);
return (-1);
}
***************
*** 1699,1705 ****
*/
static int
lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
! uint32_t domain)
{
uint_t node;
uint_t start;
--- 1740,1746 ----
*/
static int
lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
! uint32_t domain, uint32_t min_domain)
{
uint_t node;
uint_t start;
***************
*** 1708,1714 ****
* Hash proximity domain ID into node to domain mapping table (array)
* and add entry for it into first non-existent or matching entry found
*/
! node = start = NODE_DOMAIN_HASH(domain, node_cnt);
do {
/*
* Entry doesn't exist yet, so create one for this proximity
--- 1749,1755 ----
* Hash proximity domain ID into node to domain mapping table (array)
* and add entry for it into first non-existent or matching entry found
*/
! node = start = NODE_DOMAIN_HASH(domain, node_cnt, min_domain);
do {
/*
* Entry doesn't exist yet, so create one for this proximity
***************
*** 1726,1732 ****
*/
if (node_domain[node].prox_domain == domain)
return (node);
! node = NODE_DOMAIN_HASH(node + 1, node_cnt);
} while (node != start);
/*
--- 1767,1773 ----
*/
if (node_domain[node].prox_domain == domain)
return (node);
! node = NODE_DOMAIN_HASH(node + 1, node_cnt, min_domain);
} while (node != start);
/*
***************
*** 1745,1751 ****
static int
lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt,
node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
! uint32_t domain)
{
int node;
--- 1786,1792 ----
static int
lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt,
node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
! uint32_t domain, uint32_t min_domain)
{
int node;
***************
*** 1752,1761 ****
/*
* Get node number for proximity domain
*/
! node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
if (node == -1) {
node = lgrp_plat_node_domain_update(node_domain, node_cnt,
! domain);
if (node == -1)
return (-1);
}
--- 1793,1802 ----
/*
* Get node number for proximity domain
*/
! node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain,
min_domain);
if (node == -1) {
node = lgrp_plat_node_domain_update(node_domain, node_cnt,
! domain, min_domain);
if (node == -1)
return (-1);
}
***************
*** 2061,2066 ****
--- 2102,2108 ----
* Determine number of nodes by counting number of proximity domains in
* SRAT and return if number of nodes is 1 or less since don't need to
* read SRAT then
+ * initialize lgrp_plat_min_domain
*/
node_cnt = lgrp_plat_srat_domains(tp);
if (node_cnt == 1)
***************
*** 2100,2106 ****
apic_id = item->i.p.apic_id;
if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
! cpu_node, cpu_count, apic_id, domain) < 0)
return (-3);
proc_entry_count++;
--- 2142,2148 ----
apic_id = item->i.p.apic_id;
if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
! cpu_node, cpu_count, apic_id, domain,
lgrp_plat_min_domain) < 0)
return (-3);
proc_entry_count++;
***************
*** 2121,2127 ****
end = start + length - 1;
if (lgrp_plat_node_memory_update(node_domain, node_cnt,
! node_memory, start, end, domain) < 0)
return (-4);
break;
case SRAT_X2APIC: /* x2apic CPU entry */
--- 2163,2169 ----
end = start + length - 1;
if (lgrp_plat_node_memory_update(node_domain, node_cnt,
! node_memory, start, end, domain,
lgrp_plat_min_domain) < 0)
return (-4);
break;
case SRAT_X2APIC: /* x2apic CPU entry */
***************
*** 2137,2143 ****
apic_id = item->i.xp.x2apic_id;
if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
! cpu_node, cpu_count, apic_id, domain) < 0)
return (-3);
proc_entry_count++;
--- 2179,2185 ----
apic_id = item->i.xp.x2apic_id;
if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
! cpu_node, cpu_count, apic_id, domain,
lgrp_plat_min_domain) < 0)
return (-3);
proc_entry_count++;
***************
*** 2161,2167 ****
--- 2203,2273 ----
/*
+ * Return min proximity domain given in ACPI SRAT
+ */
+ static uint32_t
+ lgrp_plat_srat_min_domain(struct srat *tp)
+ {
+ struct srat_item *end;
+ int i;
+ struct srat_item *item;
+ uint32_t min_domain;
+
+
+ if (tp == NULL || !lgrp_plat_srat_enable)
+ return ((uint32_t)(-1));
+
+ /*
+ * Walk through SRAT, examining each CPU and memory entry to determine
+ * proximity domain ID for each.
+ */
+ item = tp->list;
+ end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
+ while (item < end) {
+ uint32_t domain;
+ boolean_t overflow;
+ uint_t start;
+
+ switch (item->type) {
+ case SRAT_PROCESSOR: /* CPU entry */
+ if (!(item->i.p.flags & SRAT_ENABLED))
+ break;
+ domain = item->i.p.domain1;
+ for (i = 0; i < 3; i++) {
+ domain += item->i.p.domain2[i] <<
+ ((i + 1) * 8);
+ }
+ break;
+
+ case SRAT_MEMORY: /* memory entry */
+ if (!(item->i.m.flags & SRAT_ENABLED))
+ break;
+ domain = item->i.m.domain;
+ break;
+
+ case SRAT_X2APIC: /* x2apic CPU entry */
+ if (!(item->i.xp.flags & SRAT_ENABLED))
+ break;
+ domain = item->i.xp.domain;
+ break;
+
+ default:
+ break;
+ }
+
+ if (domain < min_domain) {
+ min_domain=domain;
+ }
+
+ item = (struct srat_item *)((uintptr_t)item + item->len);
+ }
+ return min_domain;
+ }
+
+
+ /*
* Return number of proximity domains given in ACPI SRAT
+ * also initialize lgrp_plat_min_domain
*/
static int
lgrp_plat_srat_domains(struct srat *tp)
***************
*** 2171,2176 ****
--- 2277,2283 ----
int i;
struct srat_item *item;
node_domain_map_t node_domain[MAX_NODES];
+ uint32_t min_domain;
if (tp == NULL || !lgrp_plat_srat_enable)
***************
*** 2178,2183 ****
--- 2285,2300 ----
/*
* Walk through SRAT, examining each CPU and memory entry to determine
+ * min proximity domain ID.
+ */
+
+ if ((min_domain=lgrp_plat_srat_min_domain(tp)) == (uint32_t)(-1)) {
+ return(1);
+ }
+ lgrp_plat_min_domain=min_domain;
+
+ /*
+ * Walk through SRAT, examining each CPU and memory entry to determine
* proximity domain ID for each.
*/
domain_cnt = 0;
***************
*** 2219,2225 ****
/*
* Count and keep track of which proximity domain IDs seen
*/
! start = i = domain % MAX_NODES;
overflow = B_TRUE;
do {
/*
--- 2336,2342 ----
/*
* Count and keep track of which proximity domain IDs seen
*/
! start = i = (domain - min_domain) % MAX_NODES;
overflow = B_TRUE;
do {
/*
_______________________________________________
opensolaris-code mailing list
opensolaris-code@opensolaris.org
http://mail.opensolaris.org/mailman/listinfo/opensolaris-code