I am still trying to diagnose the issues with these two systems (dual Intel 5520/Supermicro X8DT3). When I have two PC3-8500 ECC RDIMMs populated per channel (six 4 GB RDIMMs per socket), I see the following fmdump -e reports reliably every 10 seconds:

Nov 24 2009 09:34:40.410002797 ereport.cpu.intel.quickpath.mem_ce
Nov 24 2009 09:34:40.410013341 ereport.cpu.intel.quickpath.mem_ce
Nov 24 2009 09:34:40.410025191 ereport.cpu.intel.quickpath.mem_ce
Nov 24 2009 09:34:40.410060807 ereport.cpu.intel.quickpath.mem_ce
Nov 24 2009 09:34:40.410071957 ereport.cpu.intel.quickpath.mem_ce
Nov 24 2009 09:34:40.410134184 ereport.cpu.intel.quickpath.mem_ce

(I've included the details of these reports at the bottom of this email.)

While the reports seem to indicate that there is a problem on the second-row RDIMMs, swapping the memory sticks around yields the same problem. In fact, even when I have only one RDIMM in each channel, the machine will experience occasional ereports on memory.

I have tried forcing the BIOS to run the memory @ DDR-1066 and DDR-800 and still have the same behavior, so it would seem to not be related to memory quality. The memory modules appear to be reasonable quality stuff:

  Hynix

I have tried OpenSolaris b127, b126, 2009.06, and S10U8, all with variations of these same symptoms. I am really hoping that someone can either give some guidance on further diagnosing what is going on, or point me to the mailing list on which I should follow up.

Here is the detailed fmdump ereport information:

bash-3.00# cat /tmp/fmdump.eV.sample
Nov 24 2009 09:34:40.410002797 ereport.cpu.intel.quickpath.mem_ce
nvlist version: 0
        class = ereport.cpu.intel.quickpath.mem_ce
        ena = 0xb34793a6b810401
        detector = (embedded nvlist)
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])

        (end detector)

        compound_errorname = MC_CH_RD_ERR
        IA32_MCG_STATUS = 0x0
        machine_check_in_progress = 0
        bank_number = 0x8
        bank_msr_offset = 0x420
        IA32_MCi_STATUS = 0xcc0d5c800001009f
        overflow = 1
        error_uncorrected = 0
        error_enabled = 0
        processor_context_corrupt = 0
        error_code = 0x9f
        model_specific_error_code = 0x1
        threshold_based_error_status = No tracking
        IA32_MCi_ADDR = 0xc3d099c80
        IA32_MCi_MISC = 0x294e5def00015840
        ECC-syndrome = 0x294e5def
        physaddr = 0xc3d099c80
        resource = (array of embedded nvlists)
        (start resource[0])
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])
                (start hc-list[3])
                nvlist version: 0
                        hc-name = dram-channel
                        hc-id = 0
                (end hc-list[3])
                (start hc-list[4])
                nvlist version: 0
                        hc-name = dimm
                        hc-id = 1
                (end hc-list[4])
                (start hc-list[5])
                nvlist version: 0
                        hc-name = rank
                        hc-id = 5
                (end hc-list[5])

                hc-specific = (embedded nvlist)
                nvlist version: 0
                        offset = 0x7fc0c400
                (end hc-specific)

        (end resource[0])

        mem_cor_ecc_counter = 0x69f 0x797 0x0 0x0 0x3 0x0
        mem_cor_ecc_counter_last = 0x743 0x4ee 0x0 0x0 0x3 0x0
        __ttl = 0x1
        __tod = 0x4b0bfd10 0x1870256d

Nov 24 2009 09:34:40.410013341 ereport.cpu.intel.quickpath.mem_ce
nvlist version: 0
        class = ereport.cpu.intel.quickpath.mem_ce
        ena = 0xb34793d17612001
        detector = (embedded nvlist)
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])

        (end detector)

        compound_errorname = MC_CH_RD_ERR
        IA32_MCG_STATUS = 0x0
        machine_check_in_progress = 0
        bank_number = 0x8
        bank_msr_offset = 0x420
        IA32_MCi_STATUS = 0xcc0001400001009f
        overflow = 1
        error_uncorrected = 0
        error_enabled = 0
        processor_context_corrupt = 0
        error_code = 0x9f
        model_specific_error_code = 0x1
        threshold_based_error_status = No tracking
        IA32_MCi_ADDR = 0xc3e0d3300
        IA32_MCi_MISC = 0xa13dcb9300015840
        ECC-syndrome = 0xa13dcb93
        physaddr = 0xc3e0d3300
        resource = (array of embedded nvlists)
        (start resource[0])
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])
                (start hc-list[3])
                nvlist version: 0
                        hc-name = dram-channel
                        hc-id = 0
                (end hc-list[3])
                (start hc-list[4])
                nvlist version: 0
                        hc-name = dimm
                        hc-id = 1
                (end hc-list[4])
                (start hc-list[5])
                nvlist version: 0
                        hc-name = rank
                        hc-id = 5
                (end hc-list[5])

                hc-specific = (embedded nvlist)
                nvlist version: 0
                        offset = 0x7fd66b80
                (end hc-specific)

        (end resource[0])

        mem_cor_ecc_counter = 0x6a0 0x7a6 0x0 0x0 0x3 0x0
        mem_cor_ecc_counter_last = 0x69f 0x797 0x0 0x0 0x3 0x0
        __ttl = 0x1
        __tod = 0x4b0bfd10 0x18704e9d

Nov 24 2009 09:34:40.410025191 ereport.cpu.intel.quickpath.mem_ce
nvlist version: 0
        class = ereport.cpu.intel.quickpath.mem_ce
        ena = 0xb34794013414401
        detector = (embedded nvlist)
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])

        (end detector)

        compound_errorname = MC_CH_RD_ERR
        IA32_MCG_STATUS = 0x0
        machine_check_in_progress = 0
        bank_number = 0x8
        bank_msr_offset = 0x420
        IA32_MCi_STATUS = 0xcc0000800001009f
        overflow = 1
        error_uncorrected = 0
        error_enabled = 0
        processor_context_corrupt = 0
        error_code = 0x9f
        model_specific_error_code = 0x1
        threshold_based_error_status = No tracking
        IA32_MCi_ADDR = 0xc3cebadc0
        IA32_MCi_MISC = 0x7f59c35b00011280
        ECC-syndrome = 0x7f59c35b
        physaddr = 0xc3cebadc0
        resource = (array of embedded nvlists)
        (start resource[0])
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])
                (start hc-list[3])
                nvlist version: 0
                        hc-name = dram-channel
                        hc-id = 0
                (end hc-list[3])
                (start hc-list[4])
                nvlist version: 0
                        hc-name = dimm
                        hc-id = 1
                (end hc-list[4])
                (start hc-list[5])
                nvlist version: 0
                        hc-name = rank
                        hc-id = 5
                (end hc-list[5])

                hc-specific = (embedded nvlist)
                nvlist version: 0
                        offset = 0x7fbe49c0
                (end hc-specific)

        (end resource[0])

        mem_cor_ecc_counter = 0x6a0 0x7a9 0x0 0x0 0x3 0x0
        mem_cor_ecc_counter_last = 0x6a0 0x7a6 0x0 0x0 0x3 0x0
        __ttl = 0x1
        __tod = 0x4b0bfd10 0x18707ce7

Nov 24 2009 09:34:40.410060807 ereport.cpu.intel.quickpath.mem_ce
nvlist version: 0
        class = ereport.cpu.intel.quickpath.mem_ce
        ena = 0xb3479489c416401
        detector = (embedded nvlist)
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])

        (end detector)

        compound_errorname = MC_CH_RD_ERR
        IA32_MCG_STATUS = 0x0
        machine_check_in_progress = 0
        bank_number = 0x8
        bank_msr_offset = 0x420
        IA32_MCi_STATUS = 0xcc0000800001009f
        overflow = 1
        error_uncorrected = 0
        error_enabled = 0
        processor_context_corrupt = 0
        error_code = 0x9f
        model_specific_error_code = 0x1
        threshold_based_error_status = No tracking
        IA32_MCi_ADDR = 0xc3ad95680
        IA32_MCi_MISC = 0x5ac6cb0500011080
        ECC-syndrome = 0x5ac6cb05
        physaddr = 0xc3ad95680
        resource = (array of embedded nvlists)
        (start resource[0])
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])
                (start hc-list[3])
                nvlist version: 0
                        hc-name = dram-channel
                        hc-id = 0
                (end hc-list[3])
                (start hc-list[4])
                nvlist version: 0
                        hc-name = dimm
                        hc-id = 1
                (end hc-list[4])
                (start hc-list[5])
                nvlist version: 0
                        hc-name = rank
                        hc-id = 5
                (end hc-list[5])

                hc-specific = (embedded nvlist)
                nvlist version: 0
                        offset = 0x7f921200
                (end hc-specific)

        (end resource[0])

        mem_cor_ecc_counter = 0x6a2 0x7c3 0x0 0x0 0x3 0x0
        mem_cor_ecc_counter_last = 0x6a0 0x7a9 0x0 0x0 0x3 0x0
        __ttl = 0x1
        __tod = 0x4b0bfd10 0x18710807

Nov 24 2009 09:34:40.410071957 ereport.cpu.intel.quickpath.mem_ce
nvlist version: 0
        class = ereport.cpu.intel.quickpath.mem_ce
        ena = 0xb34794b6cb10001
        detector = (embedded nvlist)
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])

        (end detector)

        compound_errorname = MC_CH_RD_ERR
        IA32_MCG_STATUS = 0x0
        machine_check_in_progress = 0
        bank_number = 0x8
        bank_msr_offset = 0x420
        IA32_MCi_STATUS = 0xcc0000c00001009f
        overflow = 1
        error_uncorrected = 0
        error_enabled = 0
        processor_context_corrupt = 0
        error_code = 0x9f
        model_specific_error_code = 0x1
        threshold_based_error_status = No tracking
        IA32_MCi_ADDR = 0xc2ff7dbc0
        IA32_MCi_MISC = 0x431bdc5f00011180
        ECC-syndrome = 0x431bdc5f
        physaddr = 0xc2ff7dbc0
        resource = (array of embedded nvlists)
        (start resource[0])
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])
                (start hc-list[3])
                nvlist version: 0
                        hc-name = dram-channel
                        hc-id = 0
                (end hc-list[3])
                (start hc-list[4])
                nvlist version: 0
                        hc-name = dimm
                        hc-id = 1
                (end hc-list[4])
                (start hc-list[5])
                nvlist version: 0
                        hc-name = rank
                        hc-id = 5
                (end hc-list[5])

                hc-specific = (embedded nvlist)
                nvlist version: 0
                        offset = 0x7ea9f3c0
                (end hc-specific)

        (end resource[0])

        mem_cor_ecc_counter = 0x6a5 0x7d1 0x0 0x0 0x3 0x0
        mem_cor_ecc_counter_last = 0x6a2 0x7c3 0x0 0x0 0x3 0x0
        __ttl = 0x1
        __tod = 0x4b0bfd10 0x18713395

Nov 24 2009 09:34:40.410134184 ereport.cpu.intel.quickpath.mem_ce
nvlist version: 0
        class = ereport.cpu.intel.quickpath.mem_ce
        ena = 0xb34795aa2712401
        detector = (embedded nvlist)
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])

        (end detector)

        compound_errorname = MC_CH_RD_ERR
        IA32_MCG_STATUS = 0x0
        machine_check_in_progress = 0
        bank_number = 0x8
        bank_msr_offset = 0x420
        IA32_MCi_STATUS = 0xcc0003c00001009f
        overflow = 1
        error_uncorrected = 0
        error_enabled = 0
        processor_context_corrupt = 0
        error_code = 0x9f
        model_specific_error_code = 0x1
        threshold_based_error_status = No tracking
        IA32_MCi_ADDR = 0xc2e261440
        IA32_MCi_MISC = 0x48f99d1500015e46
        ECC-syndrome = 0x48f99d15
        physaddr = 0xc2e261440
        resource = (array of embedded nvlists)
        (start resource[0])
        nvlist version: 0
                version = 0x0
                scheme = hc
                hc-list = (array of embedded nvlists)
                (start hc-list[0])
                nvlist version: 0
                        hc-name = motherboard
                        hc-id = 0
                (end hc-list[0])
                (start hc-list[1])
                nvlist version: 0
                        hc-name = chip
                        hc-id = 1
                (end hc-list[1])
                (start hc-list[2])
                nvlist version: 0
                        hc-name = memory-controller
                        hc-id = 0
                (end hc-list[2])
                (start hc-list[3])
                nvlist version: 0
                        hc-name = dram-channel
                        hc-id = 0
                (end hc-list[3])
                (start hc-list[4])
                nvlist version: 0
                        hc-name = dimm
                        hc-id = 1
                (end hc-list[4])
                (start hc-list[5])
                nvlist version: 0
                        hc-name = rank
                        hc-id = 5
                (end hc-list[5])

                hc-specific = (embedded nvlist)
                nvlist version: 0
                        offset = 0x7e832140
                (end hc-specific)

        (end resource[0])

        mem_cor_ecc_counter = 0x6a7 0x7d8 0x0 0x0 0x3 0x0
        mem_cor_ecc_counter_last = 0x6a5 0x7d1 0x0 0x0 0x3 0x0
        __ttl = 0x1
        __tod = 0x4b0bfd10 0x187226a8

bash-3.00#


--
paul
_______________________________________________
fm-discuss mailing list
fm-discuss@opensolaris.org

Reply via email to