Hello

After working with Jonathan Chew on the subject (by emails),
I discovered that the problem came from the wrong initialization 
of the lgrp kernel internal objects
when the ACPI SRAT table is used to populate these objects (this applies to my 
system HP proliant with at least 2 opteron processors).

On my server, in the ACPI SRAT table, the proximity domain ids start at 1 (not 
0).
when this happens, the lgrp_plat_node_memory records are swapped.
ex :

lgrp_plat_node_memory[]
    {
        start = 0x80000
        end = 0xfffff
        exists = 0x1
        prox_domain = 0x2
    }
    {
        start = 0
        end = 0x7ffff
        exists = 0x1
        prox_domain = 0x1
    }

and mem_node_config[] :
    {
        exists = 0x1
        physbase = 0x80000
        physmax = 0xf57f5
    }
    {
        exists = 0x1
        physbase = 0
        physmax = 0x7ffff
    }


When the SRAT table is not processed (which was the case before snv_88),
these objects are initialized using pci registers and give :
lgrp_plat_node_memory[]
    {
        start = 0
        end = 0x7ffff
        exists = 0x1
        prox_domain = 0x1
    }
    {
        start = 0x80000
        end = 0xfffff
        exists = 0x1
        prox_domain = 0x2
    }
and :
mem_node_config[] :
    {
        exists = 0x1
        physbase = 0
        physmax = 0x7ffff
    }
    {
        exists = 0x1
        physbase = 0x80000
        physmax = 0xf57f5
    }
With this configuration, the system works properly.


The problem is that access to this table is done with a hash function which is 
prox-domain-id modulo number of nodes in system.
On my system, since prox-domain start at 1 :
1%2 = 1
2%2 = 0
So it reverses the order of the node in the table.

I tried the folowing fix in lgrpplat.c :
I changed the macro :
NODE_DOMAIN_HASH(domain, node_cnt)    ((domain) % node_cnt)
into :
NODE_DOMAIN_HASH(domain, node_cnt, first_domain)    ((domain - first_domain) % 
node_cnt)

(full patch in attached file)

With this corrected code, the lgrp objects are initialized in the correct way 
(like when the objects are initialized from pci regs).

I don't think this fix has side effect outside lgrpplat.c, since changes are 
internal to this file, no interface modified.

If you have a system impacted by this bug, can you test this fix (I can provide 
a bfu binary archive if you need it) ?

Thank you

Guy
-- 
This message posted from opensolaris.org
*** lgrpplat.c_orig     Mon Feb 23 19:01:40 2009
--- usr/src/uts/i86pc/os/lgrpplat.c     Thu Mar 12 17:09:50 2009
***************
*** 143,149 ****
--- 143,152 ----
  #include <vm/hat_i86.h>
  #include <vm/seg_kmem.h>
  #include <vm/vm_dep.h>
+ #include <sys/kobj.h>
+ #include <sys/kobj_lex.h>
  
+ 
  #include "acpi_fw.h"          /* for SRAT and SLIT */
  
  
***************
*** 151,156 ****
--- 154,168 ----
  #define       NLGRP                   (MAX_NODES * (MAX_NODES - 1) + 1)
  
  /*
+  * boot argument string
+  * allows to disable srat and/or slit acpi tables processing
+  * ( workaround usefull for some class of NUMA server, when acpi data are 
unusual )
+  */
+ #define BP_LGRP_SRAT_ENABLE           "lgrp-srat-enable"
+ #define BP_LGRP_SLIT_ENABLE           "lgrp-slit-enable"
+ #define BP_MAX_STRLEN                 80
+ 
+ /*
   * Constants for configuring probing
   */
  #define       LGRP_PLAT_PROBE_NROUNDS         64      /* default laps for 
probing */
***************
*** 168,174 ****
   * Hash proximity domain ID into node to domain mapping table using to 
minimize
   * span of entries used
   */
! #define       NODE_DOMAIN_HASH(domain, node_cnt)      ((domain) % node_cnt)
  
  
  /*
--- 180,186 ----
   * Hash proximity domain ID into node to domain mapping table using to 
minimize
   * span of entries used
   */
! #define       NODE_DOMAIN_HASH(domain, node_cnt, first_domain)        
((domain - first_domain) % node_cnt)
  
  
  /*
***************
*** 294,299 ****
--- 306,316 ----
  uint_t                        lgrp_plat_node_cnt = 1;
  
  /*
+  * First domain number in system
+  */
+ uint32_t              lgrp_plat_min_domain = 0;
+ 
+ /*
   * Configuration Parameters for Probing
   * - lgrp_plat_probe_flags    Flags to specify enabling probing, probe
   *                            operation, etc.
***************
*** 368,379 ****
  
  static int    lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
      int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
!     uint32_t domain);
  
  static int    lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node);
  
  static int    lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
!     int node_cnt, uint32_t domain);
  
  static void   lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
      lgrp_plat_latency_stats_t *lat_stats,
--- 385,396 ----
  
  static int    lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
      int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
!     uint32_t domain, uint32_t min_domain);
  
  static int    lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node);
  
  static int    lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
!     int node_cnt, uint32_t domain, uint32_t min_domain);
  
  static void   lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
      lgrp_plat_latency_stats_t *lat_stats,
***************
*** 385,395 ****
  static pgcnt_t        lgrp_plat_mem_size_default(lgrp_handle_t, 
lgrp_mem_query_t);
  
  static int    lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
!     int node_cnt, uint32_t domain);
  
  static int    lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
      int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start,
!     uint64_t end, uint32_t domain);
  
  static hrtime_t       lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
      lgrp_plat_probe_mem_config_t *probe_mem_config,
--- 402,412 ----
  static pgcnt_t        lgrp_plat_mem_size_default(lgrp_handle_t, 
lgrp_mem_query_t);
  
  static int    lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
!     int node_cnt, uint32_t domain, uint32_t min_domain);
  
  static int    lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
      int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start,
!     uint64_t end, uint32_t domain, uint32_t min_domain);
  
  static hrtime_t       lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
      lgrp_plat_probe_mem_config_t *probe_mem_config,
***************
*** 683,691 ****
--- 700,731 ----
         */
        lgrp_plat_node_cnt = max_mem_nodes = 1;
  #else /* __xpv */
+       int     boot_prop_len;
        uint_t  probe_op;
+       char    str[BP_MAX_STRLEN];
+       u_longlong_t    value;
  
        /*
+        * Get boot property for enabling/disabling SRAT
+        */
+       boot_prop_len=BOP_GETPROPLEN(bootops, BP_LGRP_SRAT_ENABLE);
+       if (boot_prop_len> 0 && boot_prop_len < sizeof (str)) {
+               if (BOP_GETPROP(bootops, BP_LGRP_SRAT_ENABLE, str) >= 0 &&
+                       kobj_getvalue(str, &value) != -1) {
+                        lgrp_plat_srat_enable = 0;
+               }
+       }
+ 
+       boot_prop_len=BOP_GETPROPLEN(bootops, BP_LGRP_SLIT_ENABLE);
+       if (boot_prop_len> 0 && boot_prop_len < sizeof (str)) {
+               if (BOP_GETPROP(bootops, BP_LGRP_SLIT_ENABLE, str) >= 0 &&
+                       kobj_getvalue(str, &value) != -1) {
+                        lgrp_plat_slit_enable = 0;
+               }
+       }
+ 
+ 
+       /*
         * Initialize as a UMA machine
         */
        if (lgrp_topo_ht_limit() == 1) {
***************
*** 703,708 ****
--- 743,749 ----
        /*
         * Determine which CPUs and memory are local to each other and number
         * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
+       if (lgrp_plat_srat_enable && lgrp_plat_apic_ncpus > 0) {
         */
        if (lgrp_plat_apic_ncpus > 0) {
                int     retval;
***************
*** 1190,1196 ****
   */
  static int
  lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
!     cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
  {
        uint_t  i;
        int     node;
--- 1231,1237 ----
   */
  static int
  lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
!     cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain, 
uint32_t min_domain)
  {
        uint_t  i;
        int     node;
***************
*** 1198,1207 ****
        /*
         * Get node number for proximity domain
         */
!       node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
        if (node == -1) {
                node = lgrp_plat_node_domain_update(node_domain, node_cnt,
!                   domain);
                if (node == -1)
                        return (-1);
        }
--- 1239,1248 ----
        /*
         * Get node number for proximity domain
         */
!       node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain, 
min_domain);
        if (node == -1) {
                node = lgrp_plat_node_domain_update(node_domain, node_cnt,
!                   domain, min_domain);
                if (node == -1)
                        return (-1);
        }
***************
*** 1283,1289 ****
   */
  static int
  lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
!     uint32_t domain)
  {
        uint_t  node;
        uint_t  start;
--- 1324,1330 ----
   */
  static int
  lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
!     uint32_t domain, uint32_t min_domain)
  {
        uint_t  node;
        uint_t  start;
***************
*** 1293,1304 ****
         * search for entry with matching proximity domain ID, and return index
         * of matching entry as node ID.
         */
!       node = start = NODE_DOMAIN_HASH(domain, node_cnt);
        do {
                if (node_domain[node].prox_domain == domain &&
                    node_domain[node].exists)
                        return (node);
!               node = NODE_DOMAIN_HASH(node + 1, node_cnt);
        } while (node != start);
        return (-1);
  }
--- 1334,1345 ----
         * search for entry with matching proximity domain ID, and return index
         * of matching entry as node ID.
         */
!       node = start = NODE_DOMAIN_HASH(domain, node_cnt, min_domain);
        do {
                if (node_domain[node].prox_domain == domain &&
                    node_domain[node].exists)
                        return (node);
!               node = NODE_DOMAIN_HASH(node + 1, node_cnt, min_domain);
        } while (node != start);
        return (-1);
  }
***************
*** 1699,1705 ****
   */
  static int
  lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
!     uint32_t domain)
  {
        uint_t  node;
        uint_t  start;
--- 1740,1746 ----
   */
  static int
  lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
!     uint32_t domain, uint32_t min_domain)
  {
        uint_t  node;
        uint_t  start;
***************
*** 1708,1714 ****
         * Hash proximity domain ID into node to domain mapping table (array)
         * and add entry for it into first non-existent or matching entry found
         */
!       node = start = NODE_DOMAIN_HASH(domain, node_cnt);
        do {
                /*
                 * Entry doesn't exist yet, so create one for this proximity
--- 1749,1755 ----
         * Hash proximity domain ID into node to domain mapping table (array)
         * and add entry for it into first non-existent or matching entry found
         */
!       node = start = NODE_DOMAIN_HASH(domain, node_cnt, min_domain);
        do {
                /*
                 * Entry doesn't exist yet, so create one for this proximity
***************
*** 1726,1732 ****
                 */
                if (node_domain[node].prox_domain == domain)
                        return (node);
!               node = NODE_DOMAIN_HASH(node + 1, node_cnt);
        } while (node != start);
  
        /*
--- 1767,1773 ----
                 */
                if (node_domain[node].prox_domain == domain)
                        return (node);
!               node = NODE_DOMAIN_HASH(node + 1, node_cnt, min_domain);
        } while (node != start);
  
        /*
***************
*** 1745,1751 ****
  static int
  lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt,
      node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
!     uint32_t domain)
  {
        int     node;
  
--- 1786,1792 ----
  static int
  lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt,
      node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
!     uint32_t domain, uint32_t min_domain)
  {
        int     node;
  
***************
*** 1752,1761 ****
        /*
         * Get node number for proximity domain
         */
!       node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
        if (node == -1) {
                node = lgrp_plat_node_domain_update(node_domain, node_cnt,
!                   domain);
                if (node == -1)
                        return (-1);
        }
--- 1793,1802 ----
        /*
         * Get node number for proximity domain
         */
!       node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain, 
min_domain);
        if (node == -1) {
                node = lgrp_plat_node_domain_update(node_domain, node_cnt,
!                   domain, min_domain);
                if (node == -1)
                        return (-1);
        }
***************
*** 2061,2066 ****
--- 2102,2108 ----
         * Determine number of nodes by counting number of proximity domains in
         * SRAT and return if number of nodes is 1 or less since don't need to
         * read SRAT then
+        * initialize lgrp_plat_min_domain
         */
        node_cnt = lgrp_plat_srat_domains(tp);
        if (node_cnt == 1)
***************
*** 2100,2106 ****
                        apic_id = item->i.p.apic_id;
  
                        if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
!                           cpu_node, cpu_count, apic_id, domain) < 0)
                                return (-3);
  
                        proc_entry_count++;
--- 2142,2148 ----
                        apic_id = item->i.p.apic_id;
  
                        if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
!                           cpu_node, cpu_count, apic_id, domain, 
lgrp_plat_min_domain) < 0)
                                return (-3);
  
                        proc_entry_count++;
***************
*** 2121,2127 ****
                        end = start + length - 1;
  
                        if (lgrp_plat_node_memory_update(node_domain, node_cnt,
!                           node_memory, start, end, domain) < 0)
                                return (-4);
                        break;
                case SRAT_X2APIC:       /* x2apic CPU entry */
--- 2163,2169 ----
                        end = start + length - 1;
  
                        if (lgrp_plat_node_memory_update(node_domain, node_cnt,
!                           node_memory, start, end, domain, 
lgrp_plat_min_domain) < 0)
                                return (-4);
                        break;
                case SRAT_X2APIC:       /* x2apic CPU entry */
***************
*** 2137,2143 ****
                        apic_id = item->i.xp.x2apic_id;
  
                        if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
!                           cpu_node, cpu_count, apic_id, domain) < 0)
                                return (-3);
  
                        proc_entry_count++;
--- 2179,2185 ----
                        apic_id = item->i.xp.x2apic_id;
  
                        if (lgrp_plat_cpu_node_update(node_domain, node_cnt,
!                           cpu_node, cpu_count, apic_id, domain, 
lgrp_plat_min_domain) < 0)
                                return (-3);
  
                        proc_entry_count++;
***************
*** 2161,2167 ****
--- 2203,2273 ----
  
  
  /*
+  * Return min proximity domain given in ACPI SRAT
+  */
+ static uint32_t
+ lgrp_plat_srat_min_domain(struct srat *tp)
+ {
+       struct srat_item        *end;
+       int                     i;
+       struct srat_item        *item;
+       uint32_t                min_domain;
+ 
+ 
+       if (tp == NULL || !lgrp_plat_srat_enable)
+               return ((uint32_t)(-1));
+ 
+       /*
+        * Walk through SRAT, examining each CPU and memory entry to determine
+        * proximity domain ID for each.
+        */
+       item = tp->list;
+       end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
+       while (item < end) {
+               uint32_t        domain;
+               boolean_t       overflow;
+               uint_t          start;
+ 
+               switch (item->type) {
+               case SRAT_PROCESSOR:    /* CPU entry */
+                       if (!(item->i.p.flags & SRAT_ENABLED))
+                               break;
+                       domain = item->i.p.domain1;
+                       for (i = 0; i < 3; i++) {
+                               domain += item->i.p.domain2[i] <<
+                                   ((i + 1) * 8);
+                       }
+                       break;
+ 
+               case SRAT_MEMORY:       /* memory entry */
+                       if (!(item->i.m.flags & SRAT_ENABLED))
+                               break;
+                       domain = item->i.m.domain;
+                       break;
+ 
+               case SRAT_X2APIC:       /* x2apic CPU entry */
+                       if (!(item->i.xp.flags & SRAT_ENABLED))
+                               break;
+                       domain = item->i.xp.domain;
+                       break;
+ 
+               default:
+                       break;
+               }
+ 
+               if (domain < min_domain) {
+                       min_domain=domain;
+               }
+ 
+               item = (struct srat_item *)((uintptr_t)item + item->len);
+       }
+       return min_domain;
+ }
+ 
+ 
+ /*
   * Return number of proximity domains given in ACPI SRAT
+  * also initialize lgrp_plat_min_domain
   */
  static int
  lgrp_plat_srat_domains(struct srat *tp)
***************
*** 2171,2176 ****
--- 2277,2283 ----
        int                     i;
        struct srat_item        *item;
        node_domain_map_t       node_domain[MAX_NODES];
+       uint32_t                min_domain;
  
  
        if (tp == NULL || !lgrp_plat_srat_enable)
***************
*** 2178,2183 ****
--- 2285,2300 ----
  
        /*
         * Walk through SRAT, examining each CPU and memory entry to determine
+        * min proximity domain ID.
+        */
+       
+       if ((min_domain=lgrp_plat_srat_min_domain(tp))  == (uint32_t)(-1)) {
+               return(1);
+               }
+       lgrp_plat_min_domain=min_domain;
+ 
+       /*
+        * Walk through SRAT, examining each CPU and memory entry to determine
         * proximity domain ID for each.
         */
        domain_cnt = 0;
***************
*** 2219,2225 ****
                /*
                 * Count and keep track of which proximity domain IDs seen
                 */
!               start = i = domain % MAX_NODES;
                overflow = B_TRUE;
                do {
                        /*
--- 2336,2342 ----
                /*
                 * Count and keep track of which proximity domain IDs seen
                 */
!               start = i = (domain - min_domain) % MAX_NODES;
                overflow = B_TRUE;
                do {
                        /*
_______________________________________________
opensolaris-code mailing list
opensolaris-code@opensolaris.org
http://mail.opensolaris.org/mailman/listinfo/opensolaris-code

Reply via email to