3.2.0-rc1 panic on PowerPC

2011-11-15 Thread Christian Kujau
Hi,

I noticed a few crashes on this PowerBook G4 lately, starting somewhere in 
3.2.0-rc1. The crashes are really rare and as I'm not on the system all 
the time I did not notice most of them. By the time I did, the screen was 
blank already and I had to hard-reset the box. But not this time:

  http://nerdbynature.de/bits/3.2.0-rc1/oops/

When the crash occurred, the system was fairly loaded (CPU and disk I/O 
wise), so that may have triggered it. I tried to type off the stack trace, 
I hope there are not too many typos, see below.

The machine is fairly old, so maybe it's just bad RAM or something, I 
wouldn't be surprised. But maybe not, the box is pretty stable most of the 
time and only now I notice these rare crashes.

If anyone could take a quick look...?

Thank you,
Christian.

Instruction dump:
92c40008 6801 0f00 8004 543c 9004 817f000c 380b
901f000c 2f09 81640018 81440014 916a0004 914b 92840014 92a49918
Kernel panic - not syncing: Fatal exception in interrupt
Call Trace:
show_stack+0x70/0x1bc (unreliable)
panic+0xc8/0x220
die+0x2ac/0x2b8
bad_page_fault+0xbc/0x104
handle_page_fault+0x7c/0x80
Exception: 300 at T.975+0x3f4/0x570
LR = T.957+0x300/0x570
kmem_cache_alloc+0x150/0x150
__aloc_skb+0x50/0x148
tcp_send_ack+0x35/0x138
tcp_delay_timer+0x140/0x244
run_timer_softirq+0x1a0/0x2ec
__do_softirq+0xf4/0x1bc
call_do_softirq+0x14/0x24
do_softirq+0xfc/0x128
irq_exit+0xa0/0xa4
timer_interrupt+0x148/0x180
ret_from_except+0x0/0x14
cpu_idle+0xa0/0x118
rest_init+0xf0/0x114
start_kernel+0x2d0/0x2f0
0x3444
Rebooting in 180 seconds..

-- 
BOFH excuse #184:

loop found in loop in redundant loopback
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v3 3/3] mtd/nand : workaround for Freescale FCM to support large-page Nand chip

2011-11-15 Thread b35362
From: Liu Shuo b35...@freescale.com

Freescale FCM controller has a 2K size limitation of buffer RAM. In order
to support the Nand flash chip whose page size is larger than 2K bytes,
we read/write 2k data repeatedly by issuing FIR_OP_RB/FIR_OP_WB and save
them to a large buffer.

Signed-off-by: Liu Shuo shuo@freescale.com
Signed-off-by: Shengzhou Liu shengzhou@freescale.com
Signed-off-by: Li Yang le...@freescale.com
---
 drivers/mtd/nand/fsl_elbc_nand.c |  216 +++---
 1 files changed, 199 insertions(+), 17 deletions(-)

diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c
index c2c231b..415f87e 100644
--- a/drivers/mtd/nand/fsl_elbc_nand.c
+++ b/drivers/mtd/nand/fsl_elbc_nand.c
@@ -55,7 +55,9 @@ struct fsl_elbc_mtd {
struct device *dev;
int bank;   /* Chip select bank number   */
u8 __iomem *vbase;  /* Chip select base virtual address  */
-   int page_size;  /* NAND page size (0=512, 1=2048)*/
+   int page_size;  /* NAND page size (0=512, 1=2048, 2=4096...),
+* the mutiple of 2048.
+*/
unsigned int fmr;   /* FCM Flash Mode Register value */
 };
 
@@ -75,6 +77,8 @@ struct fsl_elbc_fcm_ctrl {
unsigned int use_mdr;/* Non zero if the MDR is to be set  */
unsigned int oob;/* Non zero if operating on OOB data */
unsigned int counter;/* counter for the initializations   */
+
+   char *buffer;/* just be used when pagesize  2048 */
 };
 
 /* These map to the positions used by the FCM hardware ECC generator */
@@ -150,6 +154,42 @@ static struct nand_bbt_descr bbt_mirror_descr = {
 };
 
 /*=*/
+static void io_to_buffer(struct mtd_info *mtd, int subpage, int oob)
+{
+   struct nand_chip *chip = mtd-priv;
+   struct fsl_elbc_mtd *priv = chip-priv;
+   struct fsl_elbc_fcm_ctrl *elbc_fcm_ctrl = priv-ctrl-nand;
+   void *src, *dst;
+   int len = (oob ? 64 : 2048);
+
+   if (oob)
+   dst = elbc_fcm_ctrl-buffer + mtd-writesize + subpage * 64;
+   else
+   dst = elbc_fcm_ctrl-buffer + subpage * 2048;
+
+   src = elbc_fcm_ctrl-addr + (oob ? 2048 : 0);
+   memcpy_fromio(dst, src, len);
+}
+
+static void buffer_to_io(struct mtd_info *mtd, int subpage, int oob)
+{
+   struct nand_chip *chip = mtd-priv;
+   struct fsl_elbc_mtd *priv = chip-priv;
+   struct fsl_elbc_fcm_ctrl *elbc_fcm_ctrl = priv-ctrl-nand;
+   void *src, *dst;
+   int len = (oob ? 64 : 2048);
+
+   if (oob)
+   src = elbc_fcm_ctrl-buffer + mtd-writesize + subpage * 64;
+   else
+   src = elbc_fcm_ctrl-buffer + subpage * 2048;
+
+   dst = elbc_fcm_ctrl-addr + (oob ? 2048 : 0);
+   memcpy_toio(dst, src, len);
+
+   /* See the in_8() in fsl_elbc_write_buf() */
+   in_8(elbc_fcm_ctrl-addr);
+}
 
 /*
  * Set up the FCM hardware block and page address fields, and the fcm
@@ -193,7 +233,7 @@ static void set_addr(struct mtd_info *mtd, int column, int 
page_addr, int oob)
 
/* for OOB data point to the second half of the buffer */
if (oob)
-   elbc_fcm_ctrl-index += priv-page_size ? 2048 : 512;
+   elbc_fcm_ctrl-index += mtd-writesize;
 
dev_vdbg(priv-dev, set_addr: bank=%d, 
elbc_fcm_ctrl-addr=0x%p (0x%p), 
@@ -311,6 +351,7 @@ static void fsl_elbc_cmdfunc(struct mtd_info *mtd, unsigned 
int command,
struct fsl_lbc_ctrl *ctrl = priv-ctrl;
struct fsl_elbc_fcm_ctrl *elbc_fcm_ctrl = ctrl-nand;
struct fsl_lbc_regs __iomem *lbc = ctrl-regs;
+   int i;
 
elbc_fcm_ctrl-use_mdr = 0;
 
@@ -339,21 +380,63 @@ static void fsl_elbc_cmdfunc(struct mtd_info *mtd, 
unsigned int command,
 
fsl_elbc_do_read(chip, 0);
fsl_elbc_run_command(mtd);
-   return;
 
+   if (priv-page_size = 1)
+   return;
+
+   /* Continue to read the rest bytes if writesize  2048 */
+   io_to_buffer(mtd, 0, 0);
+   io_to_buffer(mtd, 0, 1);
+
+   out_be32(lbc-fir, FIR_OP_RB  FIR_OP1_SHIFT);
+
+   for (i = 1; i  priv-page_size; i++) {
+   /*
+* Maybe there are some reasons of FCM hardware timming,
+* we must insert a FIR_OP_NOP(0x00) before FIR_OP_RB.
+*/
+   fsl_elbc_run_command(mtd);
+   io_to_buffer(mtd, i, 0);
+   io_to_buffer(mtd, i, 1);
+   }
+
+   return;
/* READOOB reads only the OOB because no ECC is performed. */
case NAND_CMD_READOOB:
dev_vdbg(priv-dev,
 fsl_elbc_cmdfunc: 

[PATCH 2/3] mtd/nand : set Nand flash page address to FBAR and FPAR correctly

2011-11-15 Thread b35362
From: Liu Shuo b35...@freescale.com

If we use the Nand flash chip whose number of pages in a block is greater
than 64(for large page), we must treat the low bit of FBAR as being the
high bit of the page address due to the limitation of FCM, it simply uses
the low 6-bits (for large page) of the combined block/page address as the
FPAR component, rather than considering the actual block size.

Signed-off-by: Liu Shuo b35...@freescale.com
Signed-off-by: Jerry Huang chang-ming.hu...@freescale.com
Signed-off-by: Tang Yuantian b29...@freescale.com
Signed-off-by: Li Yang le...@freescale.com
---
 drivers/mtd/nand/fsl_elbc_nand.c |   13 ++---
 1 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c
index 1bfcdef..c2c231b 100644
--- a/drivers/mtd/nand/fsl_elbc_nand.c
+++ b/drivers/mtd/nand/fsl_elbc_nand.c
@@ -166,15 +166,22 @@ static void set_addr(struct mtd_info *mtd, int column, 
int page_addr, int oob)
 
elbc_fcm_ctrl-page = page_addr;
 
-   out_be32(lbc-fbar,
-page_addr  (chip-phys_erase_shift - chip-page_shift));
-
if (priv-page_size) {
+   /*
+* large page size chip : FPAR[PI] save the lowest 6 bits,
+*FBAR[BLK] save the other bits.
+*/
+   out_be32(lbc-fbar, page_addr  6);
out_be32(lbc-fpar,
 ((page_addr  FPAR_LP_PI_SHIFT)  FPAR_LP_PI) |
 (oob ? FPAR_LP_MS : 0) | column);
buf_num = (page_addr  1)  2;
} else {
+   /*
+* small page size chip : FPAR[PI] save the lowest 5 bits,
+*FBAR[BLK] save the other bits.
+*/
+   out_be32(lbc-fbar, page_addr  5);
out_be32(lbc-fpar,
 ((page_addr  FPAR_SP_PI_SHIFT)  FPAR_SP_PI) |
 (oob ? FPAR_SP_MS : 0) | column);
-- 
1.7.1


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/3] mtd/nand: fix coding style issue in drivers/mtd/nand/fsl_elbc.c

2011-11-15 Thread b35362
From: Liu Shuo b35...@freescale.com

Fix whitespace and tab coding style issues, and use #include <linux/io.h>
instead of <asm/io.h>, in drivers/mtd/nand/fsl_elbc.c.

Signed-off-by: Liu Shuo b35...@freescale.com
Signed-off-by: Li Yang le...@freescale.com
---
 drivers/mtd/nand/fsl_elbc_nand.c |  194 +++---
 1 files changed, 97 insertions(+), 97 deletions(-)

diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c
index eedd8ee..1bfcdef 100644
--- a/drivers/mtd/nand/fsl_elbc_nand.c
+++ b/drivers/mtd/nand/fsl_elbc_nand.c
@@ -38,7 +38,7 @@
 #include linux/mtd/nand_ecc.h
 #include linux/mtd/partitions.h
 
-#include asm/io.h
+#include linux/io.h
 #include asm/fsl_lbc.h
 
 #define MAX_BANKS 8
@@ -167,17 +167,17 @@ static void set_addr(struct mtd_info *mtd, int column, 
int page_addr, int oob)
elbc_fcm_ctrl-page = page_addr;
 
out_be32(lbc-fbar,
-page_addr  (chip-phys_erase_shift - chip-page_shift));
+page_addr  (chip-phys_erase_shift - chip-page_shift));
 
if (priv-page_size) {
out_be32(lbc-fpar,
-((page_addr  FPAR_LP_PI_SHIFT)  FPAR_LP_PI) |
-(oob ? FPAR_LP_MS : 0) | column);
+((page_addr  FPAR_LP_PI_SHIFT)  FPAR_LP_PI) |
+(oob ? FPAR_LP_MS : 0) | column);
buf_num = (page_addr  1)  2;
} else {
out_be32(lbc-fpar,
-((page_addr  FPAR_SP_PI_SHIFT)  FPAR_SP_PI) |
-(oob ? FPAR_SP_MS : 0) | column);
+((page_addr  FPAR_SP_PI_SHIFT)  FPAR_SP_PI) |
+(oob ? FPAR_SP_MS : 0) | column);
buf_num = page_addr  7;
}
 
@@ -190,10 +190,10 @@ static void set_addr(struct mtd_info *mtd, int column, 
int page_addr, int oob)
 
dev_vdbg(priv-dev, set_addr: bank=%d, 
elbc_fcm_ctrl-addr=0x%p (0x%p), 
-   index %x, pes %d ps %d\n,
+   index %x, pes %d ps %d\n,
 buf_num, elbc_fcm_ctrl-addr, priv-vbase,
 elbc_fcm_ctrl-index,
-chip-phys_erase_shift, chip-page_shift);
+chip-phys_erase_shift, chip-page_shift);
 }
 
 /*
@@ -213,13 +213,13 @@ static int fsl_elbc_run_command(struct mtd_info *mtd)
out_be32(lbc-mdr, elbc_fcm_ctrl-mdr);
 
dev_vdbg(priv-dev,
-fsl_elbc_run_command: fmr=%08x fir=%08x fcr=%08x\n,
-in_be32(lbc-fmr), in_be32(lbc-fir), in_be32(lbc-fcr));
+fsl_elbc_run_command: fmr=%08x fir=%08x fcr=%08x\n,
+in_be32(lbc-fmr), in_be32(lbc-fir), in_be32(lbc-fcr));
dev_vdbg(priv-dev,
-fsl_elbc_run_command: fbar=%08x fpar=%08x 
-fbcr=%08x bank=%d\n,
-in_be32(lbc-fbar), in_be32(lbc-fpar),
-in_be32(lbc-fbcr), priv-bank);
+fsl_elbc_run_command: fbar=%08x fpar=%08x 
+fbcr=%08x bank=%d\n,
+in_be32(lbc-fbar), in_be32(lbc-fpar),
+in_be32(lbc-fbcr), priv-bank);
 
ctrl-irq_status = 0;
/* execute special operation */
@@ -227,7 +227,7 @@ static int fsl_elbc_run_command(struct mtd_info *mtd)
 
/* wait for FCM complete flag or timeout */
wait_event_timeout(ctrl-irq_wait, ctrl-irq_status,
-  FCM_TIMEOUT_MSECS * HZ/1000);
+  FCM_TIMEOUT_MSECS * HZ/1000);
elbc_fcm_ctrl-status = ctrl-irq_status;
/* store mdr value in case it was needed */
if (elbc_fcm_ctrl-use_mdr)
@@ -237,8 +237,8 @@ static int fsl_elbc_run_command(struct mtd_info *mtd)
 
if (elbc_fcm_ctrl-status != LTESR_CC) {
dev_info(priv-dev,
-command failed: fir %x fcr %x status %x mdr %x\n,
-in_be32(lbc-fir), in_be32(lbc-fcr),
+command failed: fir %x fcr %x status %x mdr %x\n,
+in_be32(lbc-fir), in_be32(lbc-fcr),
 elbc_fcm_ctrl-status, elbc_fcm_ctrl-mdr);
return -EIO;
}
@@ -273,20 +273,20 @@ static void fsl_elbc_do_read(struct nand_chip *chip, int 
oob)
 
if (priv-page_size) {
out_be32(lbc-fir,
-(FIR_OP_CM0  FIR_OP0_SHIFT) |
-(FIR_OP_CA   FIR_OP1_SHIFT) |
-(FIR_OP_PA   FIR_OP2_SHIFT) |
-(FIR_OP_CM1  FIR_OP3_SHIFT) |
-(FIR_OP_RBW  FIR_OP4_SHIFT));
+(FIR_OP_CM0  FIR_OP0_SHIFT) |
+(FIR_OP_CA   FIR_OP1_SHIFT) |
+(FIR_OP_PA   FIR_OP2_SHIFT) |
+(FIR_OP_CM1  FIR_OP3_SHIFT) |
+(FIR_OP_RBW  FIR_OP4_SHIFT));
 
out_be32(lbc-fcr, 

RE: [PATCH 1/3] mtd/nand: fix coding style issue in drivers/mtd/nand/fsl_elbc.c

2011-11-15 Thread Jenkins, Clive
 fix whitespaces,tabs coding style issue and ...

In my opinion this code was already correct, and would display correctly
at any TAB setting. This patch changes it so that it displays
incorrectly
at all TAB settings other than 8.

Example:

Correct:
tabsfunction(arg1,
tabs9spacesarg2

Incorrect:
tabsfunction(arg1,
tabs tab   arg2

For any TAB setting other than 8, arg1 and arg2 no longer line up.

Clive
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH 1/3] mtd/nand: fix coding style issue in drivers/mtd/nand/fsl_elbc.c

2011-11-15 Thread David Woodhouse
On Tue, 2011-11-15 at 11:26 +, Jenkins, Clive wrote:
  fix whitespaces,tabs coding style issue and ...
 
 In my opinion this code was already correct, and would display correctly
 at any TAB setting. This patch changes it so that it displays
 incorrectly at all TAB settings other than 8.

Any tab setting other than 8 is incorrect and should not be used for
Linux code. If I view it in a proportionally-spaced font, it's not going
to line up right either. Fix your editor, if that's an issue for you.

-- 
dwmw2


smime.p7s
Description: S/MIME cryptographic signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

RE: [PATCH 1/3] mtd/nand: fix coding style issue in drivers/mtd/nand/fsl_elbc.c

2011-11-15 Thread Jenkins, Clive
   fix whitespaces,tabs coding style issue and ...
  
  In my opinion this code was already correct, and would display correctly
  at any TAB setting. This patch changes it so that it displays
  incorrectly at all TAB settings other than 8.

 Any tab setting other than 8 is incorrect and should not be used for
 Linux code.

This may be your (not so humble :-) opinion, and I happen to agree that
a tab setting of 8 is best, usually. However, as Linus says in his
coding style document, "Coding style is very personal, and I won't _force_
my views on anybody."

 ... Fix your editor, if that's an issue for you.

My editor has a tab setting of 8, but readers of this list have diverse
email clients, some of which do not display 8 spaces per tab.

Clive
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH 1/3] mtd/nand: fix coding style issue in drivers/mtd/nand/fsl_elbc.c

2011-11-15 Thread David Laight
 
 On Tue, 2011-11-15 at 11:26 +, Jenkins, Clive wrote:
   fix whitespaces,tabs coding style issue and ...
  
  In my opinion this code was already correct, and would display
correctly
  at any TAB setting. This patch changes it so that it displays
  incorrectly at all TAB settings other than 8.
 
 Any tab setting other than 8 is incorrect and should not be used for
 Linux code. If I view it in a proportionally-spaced font, 
 it's not going to line up right either. Fix your editor, if that's an
issue for you.

Personally I don't even attempt to line up arguments on
continuation lines - it just wastes vertical space.
I just indent them as any other continuation line, so
double-indent if using 4-char indents and 1/2 indent for
8-char indents.

Tabs have to be assumed to be 8 columns, otherwise all sorts
of tools just get it wrong.
(Or you ban tabs from source files.)

David


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH 1/3] mtd/nand: fix coding style issue in drivers/mtd/nand/fsl_elbc.c

2011-11-15 Thread David Woodhouse
On Tue, 2011-11-15 at 14:42 +, Jenkins, Clive wrote:
 This may be your (not so humble :-) opinion, and I happen to agree that
 a tab setting of 8 is best, usually. However, as Linus says in his
 coding style document Coding style is very personal, and I won't _force_
 my views on anybody. 

I refer you to the remainder of that same sentence.

-- 
dwmw2


smime.p7s
Description: S/MIME cryptographic signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: fsl_pq_mdio: fix non tbi phy access

2011-11-15 Thread Andy Fleming

On Nov 14, 2011, at 11:17 PM, Baruch Siach wrote:

 Hi Andy,
 
 On Mon, Nov 14, 2011 at 09:04:47PM +, Fleming Andy-AFLEMING wrote:
 Well, this got applied quickly, so I guess I can't NAK, but this requires 
 discussion.
 
 On Nov 14, 2011, at 0:22, Baruch Siach bar...@tkos.co.il wrote:
 
 Since 952c5ca1 (fsl_pq_mdio: Clean up tbi address configuration) .probe 
 returns
 -EBUSY when the tbi-phy node is missing. Fix this.
 
 It returns an error because it finds no tbi node. Because without the tbi 
 node, there is no way for the driver to determine which address to set.
 
 Your solution is to ignore the error, and hope. That's a broken approach.  
 The real solution for a p1010 should be to have a tbi node in the dts.
 
 Can you elaborate a bit on why this approach is broken? The PHY used to work 
 for me until 952c5ca1, and with this applied.


Yes, well, just because a problem goes away when a patch is applied does not 
mean that the patch is correct, or that it made things work.

An explanation:

In order to support certain types of serial data interfaces with external PHYs 
(like SGMII), it is necessary to translate the MAC's data signaling into the 
serialized signaling. On Freescale parts, this is done via a SerDes block, but 
the SerDes link needs a small amount of management. To perform this management, 
we have an onboard TBI PHY. This PHY is highly integrated with the MAC and 
MDIO devices. Each MAC has two relevant components:

1) a TBIPA register, which declares the address of the TBI PHY
2) an associated MDIO controller.

In order to configure the SerDes link, it is necessary to communicate via the 
local MDIO controller with the TBI PHY. For most of the MACs, this is simple: 
Choose an address for TBIPA, and then use that address to communicate with the 
TBI PHY. However, the *first* MDIO controller is also used to communicate with 
external PHYs. On this controller, we have to be fairly particular about which 
address we put in TBIPA, because all transactions to that address will go to 
the TBI PHY. On older parts, this value defaulted to 0, but it now defaults 
to 31, I believe.

Ok, so now we're at this code. The of_mdiobus_register() function will parse 
the device tree, and find all of the PHYs on the MDIO bus, and register them as 
devices. In order to ensure that all of those PHYs are accessible, we *MUST* 
set TBIPA to something that won't conflict with any existing addresses. The 
mechanism we have chosen for this is to assign the address in the device tree, 
via a tbi-phy node.

My recent patch changed the behavior, because we used to try to find a free 
address via scanning, but this was somewhat ugly, and failed (as you noticed) 
due to uninitialized mutexes.

The reason your latest patch is wrong is because it doesn't set the TBIPA 
register at all if there is no tbi-phy node. Instead, it just relies on luck, 
hoping that the TBIPA register was set to something that doesn't conflict 
already. It will work if 0x1f or 0 aren't necessary PHY addresses for your 
board, or if the firmware set it to something sensible.


 
 And looking at the p1010si.dtsi, I see that it's automatically there for 
 you.
 
 How were you breaking?
 
 Adding linuxppc to Cc.
 
 My board is P1011 based, the single core version of P1020, not P1010. In 
 p1020si.dtsi I see no tbi node. In p1020rdb.dts I see a tbi node but only for 
 mdio@25000, not mdio@24000, which is what I'm using.
 
 Am I missing something?


Well, that's a bug. In truth, the silicon dtsi trees should not have tbi nodes, 
as that's highly machine-specific. The p1020rdb is apparently relying on the 
old behavior, which is broken, and due to the fact that the first ethernet 
interface doesn't *use* the TBI PHY.

You should add this to your board tree:

mdio@24000 {

tbi0: tbi-phy@11 {
reg = <0x11>;
device_type = "tbi-phy";
};
};

And add the PHYs you use, as well as set reg (and the value after the @) to 
something that makes sense for your board.

I am going to go right now, and add tbi nodes for all of the Freescale 
platforms. I will also modify the fsl_pq_mdio code to be more explicit about 
its reason for failure.

Andy
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH v5 0/9] fadump: Firmware-assisted dump support for Powerpc.

2011-11-15 Thread Mahesh J Salgaonkar
Hi All,

Please find the version 5 of the patchset that implements firmware-assisted
dump mechanism to capture kernel crash dump for Powerpc architecture. The
firmware-assisted dump is a robust mechanism to get reliable kernel crash
dump with assistance from firmware. This approach does not use kexec, instead
firmware assists in booting the kdump kernel while preserving memory contents.

Change in v5:
-
- Added 'fadump_' prefix to all static functions defined.

patch 02/10:
- Merged patch 10/10 which introduces a config option CONFIG_FA_DUMP
  for firmware assisted dump feature on Powerpc (ppc64) architecture.
- Increased MIN_BOOT_MEM by 64M to avoid OOM issue during network
  dump capture. When kdump infrastructure is configured to save vmcore
  over network, we run into OOM issue while loading modules related to
  network setup.

Changes in v4:
--
patch 04/10:
- Move the init_elfcore_header() function and 'memblock_num_regions' macro
 from generic code to power specific code as these are used only by
 firmware assisted dump implementation which is power specific feature.

patch 05/10:
- Fixes an issue where memblock_free() is invoked from build_cpu_notes()
  function during error_out path. Invoke cpu_notes_buf_free() in error_out
  path instead of memblock_free().

Changes in v3:
-
- Re-factored the implementation to work with kdump service start/stop.
  Introduce fadump_registered sysfs control file which will be used by
  kdump init scripts to start/stop firmware assisted dump. echo 1 to
  /sys/kernel/fadump_registered file for fadump registration and
  echo 0 to /sys/kernel/fadump_registered file for fadump un-registration.
- Introduced the locking mechanism to handle simultaneous writes to
  sysfs control files fadump_registered and fadump_release_mem

  Affected patches are: 01/10, 03/10, 08/10.

Changes in v2:
-
patch 01/10:
- Modified the documentation to reflect the change of fadump_region
  file under debugfs filesystem.

patch 02/10:
- Modified to use standard pr_debug() macro.
- Modified early_init_dt_scan_fw_dump() to get the size of
  ibm,configure-kernel-dump-sizes property and use it to iterate through
  an array of dump sections.
- Introduced boot option 'fadump_reserve_mem=' to let user specify the
  fadump boot memory to be reserved.

patch 03/10:
- Removed few debug print statements.
- Moved the setup_fadump() call from setup_system() and now calling it
  subsys_initcall.
- Moved fadump_region attribute under debugfs.
- Clear the TCE entries if firmware assisted dump is active.

patch 05/10:
- Moved the crash_fadump() invocation from generic code to panic notifier.
- Introduced cpu_notes_buf_alloc() function to allocate cpu notes buffer
  using get_free_pages().

patch 08/10:
- Introduced cpu_notes_buf_free() function to free memory allocated for
  cpu notes buffer.

Most of the code implementation has been adapted from the phyp assisted dump
implementation written by Linas Vepstas and Manish Ahuja.

The first patch is a documentation that talks about firmware-assisted dump
mechanism, implementation details and TODO list.

I have tested the patches on following system configuration:
1. LPAR on Power6 with 4GB RAM and 8 CPUs
2. LPAR on Power7 with 2GB RAM and 20 CPUs
3. LPAR on Power7 with 1TB RAM and 896 CPUs

These patches cleanly apply on commit c3b92c878 in linux-2.6 git tree.

Please review the patchset and let me know your comments.

Thanks,
-Mahesh.
---

Mahesh Salgaonkar (9):
  fadump: Add documentation for firmware-assisted dump.
  fadump: Reserve the memory for firmware assisted dump.
  fadump: Register for firmware assisted dump.
  fadump: Initialize elfcore header and add PT_LOAD program headers.
  fadump: Convert firmware-assisted cpu state dump data into elf notes.
  fadump: Add PT_NOTE program header for vmcoreinfo
  fadump: Introduce cleanup routine to invalidate /proc/vmcore.
  fadump: Invalidate registration and release reserved memory for general 
use.
  fadump: Invalidate the fadump registration during machine shutdown.


 Documentation/powerpc/firmware-assisted-dump.txt |  243 
 arch/powerpc/Kconfig |   13 
 arch/powerpc/include/asm/fadump.h|  216 
 arch/powerpc/kernel/Makefile |1 
 arch/powerpc/kernel/fadump.c | 1316 ++
 arch/powerpc/kernel/iommu.c  |8 
 arch/powerpc/kernel/prom.c   |   15 
 arch/powerpc/kernel/setup-common.c   |   14 
 arch/powerpc/kernel/traps.c  |3 
 arch/powerpc/mm/hash_utils_64.c  |   11 
 fs/proc/vmcore.c |   23 
 11 files changed, 1861 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/powerpc/firmware-assisted-dump.txt
 create mode 100644 arch/powerpc/include/asm/fadump.h
 create mode 100644 

[RFC PATCH v5 1/9] fadump: Add documentation for firmware-assisted dump.

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

Documentation for firmware-assisted dump. This document is based on the
original documentation written for phyp assisted dump by Linas Vepstas
and Manish Ahuja, with few changes to reflect the current implementation.

Change in v3:
- Modified the documentation to reflect introduction of fadump_registered
  sysfs file and few minor changes.

Change in v2:
- Modified the documentation to reflect the change of fadump_region
  file under debugfs filesystem.

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 Documentation/powerpc/firmware-assisted-dump.txt |  243 ++
 1 files changed, 243 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/powerpc/firmware-assisted-dump.txt

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
new file mode 100644
index 000..3248b5d
--- /dev/null
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -0,0 +1,243 @@
+
+   Firmware-Assisted Dump
+   
+   July 2011
+
+The goal of firmware-assisted dump is to enable the dump of
+a crashed system, and to do so from a fully-reset system, and
+to minimize the total elapsed time until the system is back
+in production use.
+
+Comparing with kdump or other strategies, firmware-assisted
+dump offers several strong, practical advantages:
+
+-- Unlike kdump, the system has been reset, and loaded
+   with a fresh copy of the kernel.  In particular,
+   PCI and I/O devices have been reinitialized and are
+   in a clean, consistent state.
+-- Once the dump is copied out, the memory that held the dump
+   is immediately available to the running kernel. A further
+   reboot isn't required.
+
+The above can only be accomplished by coordination with,
+and assistance from the Power firmware. The procedure is
+as follows:
+
+-- The first kernel registers the sections of memory with the
+   Power firmware for dump preservation during OS initialization.
+   These registered sections of memory are reserved by the first
+   kernel during early boot.
+
+-- When a system crashes, the Power firmware will save
+   the low memory (boot memory of size larger of 5% of system RAM
+   or 256MB) of RAM to the previous registered region. It will
+   also save system registers, and hardware PTE's.
+
+   NOTE: The term 'boot memory' means size of the low memory chunk
+ that is required for a kernel to boot successfully when
+ booted with restricted memory. By default, the boot memory
+ size will be the larger of 5% of system RAM or 256MB.
+ Alternatively, user can also specify boot memory size
+ through boot parameter 'fadump_reserve_mem=' which will
+ override the default calculated size.
+
+-- After the low memory (boot memory) area has been saved, the
+   firmware will reset PCI and other hardware state.  It will
+   *not* clear the RAM. It will then launch the bootloader, as
+   normal.
+
+-- The freshly booted kernel will notice that there is a new
+   node (ibm,dump-kernel) in the device tree, indicating that
+   there is crash data available from a previous boot. During
+   the early boot OS will reserve rest of the memory above
+   boot memory size effectively booting with restricted memory
+   size. This will make sure that the second kernel will not
+   touch any of the dump memory area.
+
+-- User-space tools will read /proc/vmcore to obtain the contents
+   of memory, which holds the previous crashed kernel dump in ELF
+   format. The userspace tools may copy this info to disk, or
+   network, nas, san, iscsi, etc. as desired.
+
+-- Once the userspace tool is done saving dump, it will echo
+   '1' to /sys/kernel/fadump_release_mem to release the reserved
+   memory back to general use, except the memory required for
+   next firmware-assisted dump registration.
+
+   e.g.
+ # echo 1 > /sys/kernel/fadump_release_mem
+
+Please note that the firmware-assisted dump feature
+is only available on Power6 and above systems with recent
+firmware versions.
+
+Implementation details:
+--
+
+During boot, a check is made to see if firmware supports
+this feature on that particular machine. If it does, then
+we check to see if an active dump is waiting for us. If yes
+then everything but boot memory size of RAM is reserved during
+early boot (See Fig. 2). This area is released once we finish
+collecting the dump from user land scripts (e.g. kdump scripts)
+that are run. If there is dump data, then the
+/sys/kernel/fadump_release_mem file is created, and the reserved
+memory is held.
+
+If there is no waiting dump data, then only the memory required
+to hold CPU state, HPTE region, boot memory dump and elfcore
+header, is reserved at the top of memory (see Fig. 1). This area
+is *not* released: this region will be kept permanently reserved,
+so that it can act 

[RFC PATCH v5 7/9] fadump: Introduce cleanup routine to invalidate /proc/vmcore.

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

With the firmware-assisted dump support we don't require a reboot when we
are in second kernel after crash. The second kernel after crash is a normal
kernel boot and has knowledge about entire system RAM with the page tables
initialized for entire system RAM. Hence once the dump is saved to disk, we
can just release the reserved memory area for general use and continue
with second kernel as production kernel.

Hence when we release the reserved memory that contains dump data, the
'/proc/vmcore' will not be valid anymore. Hence this patch introduces
a cleanup routine that invalidates and removes the /proc/vmcore file. This
routine will be invoked before we release the reserved dump memory area.

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 fs/proc/vmcore.c |   23 +++
 1 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd99bf5..fae5526 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -699,3 +699,26 @@ static int __init vmcore_init(void)
return 0;
 }
 module_init(vmcore_init)
+
+/* Cleanup function for vmcore module. */
+void vmcore_cleanup(void)
+{
+   struct list_head *pos, *next;
+
+   if (proc_vmcore) {
+   remove_proc_entry(proc_vmcore-name, proc_vmcore-parent);
+   proc_vmcore = NULL;
+   }
+
+   /* clear the vmcore list. */
+   list_for_each_safe(pos, next, vmcore_list) {
+   struct vmcore *m;
+
+   m = list_entry(pos, struct vmcore, list);
+   list_del(m-list);
+   kfree(m);
+   }
+   kfree(elfcorebuf);
+   elfcorebuf = NULL;
+}
+EXPORT_SYMBOL_GPL(vmcore_cleanup);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH v5 8/9] fadump: Invalidate registration and release reserved memory for general use.

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

This patch introduces a sysfs interface '/sys/kernel/fadump_release_mem' to
invalidate the last fadump registration, invalidate '/proc/vmcore', release
the reserved memory for general use and re-register for future kernel dump.
Once the dump is copied to the disk, the userspace tool will echo 1 to
'/sys/kernel/fadump_release_mem'.

Release the reserved memory region excluding the size of the memory required
for future kernel dump registration.

Change in v3:
- Synchronize the fadump invalidation step to handle simultaneous writes to
  /sys/kernel/fadump_release_mem.

Change in v2:
- Introduced cpu_notes_buf_free() function to free memory allocated for
  cpu notes buffer.

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/fadump.h |3 +
 arch/powerpc/kernel/fadump.c  |  158 -
 2 files changed, 157 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 72908e3..ede7dc9 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -206,6 +206,9 @@ extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
 extern void crash_fadump(struct pt_regs *, const char *);
+extern void fadump_cleanup(void);
+
+extern void vmcore_cleanup(void);
 #else  /* CONFIG_FA_DUMP */
 static inline int is_fadump_active(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 20ea849..73c670e 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -33,6 +33,8 @@
 #include linux/debugfs.h
 #include linux/seq_file.h
 #include linux/crash_dump.h
+#include linux/kobject.h
+#include linux/sysfs.h
 
 #include asm/page.h
 #include asm/prom.h
@@ -988,6 +990,132 @@ static int fadump_unregister_dump(struct 
fadump_mem_struct *fdm)
return 0;
 }
 
+static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+{
+   int rc = 0;
+   unsigned int wait_time;
+
+   pr_debug(Invalidating firmware-assisted dump registration\n);
+
+   /* TODO: Add upper time limit for the delay */
+   do {
+   rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+   FADUMP_INVALIDATE, fdm,
+   sizeof(struct fadump_mem_struct));
+
+   wait_time = rtas_busy_delay_time(rc);
+   if (wait_time)
+   mdelay(wait_time);
+   } while (wait_time);
+
+   if (rc) {
+   printk(KERN_ERR Failed to invalidate firmware-assisted dump 
+   rgistration. unexpected error(%d).\n, rc);
+   return rc;
+   }
+   fw_dump.dump_active = 0;
+   fdm_active = NULL;
+   return 0;
+}
+
+void fadump_cleanup(void)
+{
+   /* Invalidate the registration only if dump is active. */
+   if (fw_dump.dump_active) {
+   init_fadump_mem_struct(fdm,
+   fdm_active-cpu_state_data.destination_address);
+   fadump_invalidate_dump(fdm);
+   }
+}
+
+/*
+ * Release the memory that was reserved in early boot to preserve the memory
+ * contents. The released memory will be available for general use.
+ */
+static void fadump_release_memory(unsigned long begin, unsigned long end)
+{
+   unsigned long addr;
+   unsigned long ra_start, ra_end;
+
+   ra_start = fw_dump.reserve_dump_area_start;
+   ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+   for (addr = begin; addr  end; addr += PAGE_SIZE) {
+   /*
+* exclude the dump reserve area. Will reuse it for next
+* fadump registration.
+*/
+   if (addr = ra_end  ((addr + PAGE_SIZE)  ra_start))
+   continue;
+
+   ClearPageReserved(pfn_to_page(addr  PAGE_SHIFT));
+   init_page_count(pfn_to_page(addr  PAGE_SHIFT));
+   free_page((unsigned long)__va(addr));
+   totalram_pages++;
+   }
+}
+
+static void fadump_invalidate_release_mem(void)
+{
+   unsigned long reserved_area_start, reserved_area_end;
+   unsigned long destination_address;
+
+   mutex_lock(fadump_mutex);
+   if (!fw_dump.dump_active) {
+   mutex_unlock(fadump_mutex);
+   return;
+   }
+
+   destination_address = fdm_active-cpu_state_data.destination_address;
+   fadump_cleanup();
+   mutex_unlock(fadump_mutex);
+
+   /*
+* Save the current reserved memory bounds we will require them
+* later for releasing the memory for general use.
+*/
+   reserved_area_start = fw_dump.reserve_dump_area_start;
+   reserved_area_end = reserved_area_start +
+   fw_dump.reserve_dump_area_size;
+ 

[RFC PATCH v5 9/9] fadump: Invalidate the fadump registration during machine shutdown.

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

If dump is active during system reboot, shutdown or halt then invalidate
the fadump registration as it does not get invalidated automatically.

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 arch/powerpc/kernel/setup-common.c |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 87d2465..847d638 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -110,6 +110,14 @@ EXPORT_SYMBOL(ppc_do_canonicalize_irqs);
 /* also used by kexec */
 void machine_shutdown(void)
 {
+#ifdef CONFIG_FA_DUMP
+   /*
+* if fadump is active, cleanup the fadump registration before we
+* shutdown.
+*/
+   fadump_cleanup();
+#endif
+
if (ppc_md.machine_shutdown)
ppc_md.machine_shutdown();
 }

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH v5 6/9] fadump: Add PT_NOTE program header for vmcoreinfo

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

Introduce a PT_NOTE program header that points to physical address of
vmcoreinfo_note buffer declared in kernel/kexec.c. The vmcoreinfo
note buffer is populated during crash_fadump() at the time of system
crash.

Change in v5:
- Added 'fadump_' prefix to static function relocate().

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 arch/powerpc/kernel/fadump.c |   29 +
 1 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 1879ddf..20ea849 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -818,6 +818,19 @@ static void fadump_setup_crash_memory_ranges(void)
}
 }
 
+/*
+ * If the given physical address falls within the boot memory region then
+ * return the relocated address that points to the dump region reserved
+ * for saving initial boot memory contents.
+ */
+static inline unsigned long fadump_relocate(unsigned long paddr)
+{
+   if (paddr  RMR_START  paddr  fw_dump.boot_memory_size)
+   return fdm.rmr_region.destination_address + paddr;
+   else
+   return paddr;
+}
+
 static int fadump_create_elfcore_headers(char *bufp)
 {
struct elfhdr *elf;
@@ -849,6 +862,22 @@ static int fadump_create_elfcore_headers(char *bufp)
 
(elf-e_phnum)++;
 
+   /* setup ELF PT_NOTE for vmcoreinfo */
+   phdr = (struct elf_phdr *)bufp;
+   bufp += sizeof(struct elf_phdr);
+   phdr-p_type= PT_NOTE;
+   phdr-p_flags   = 0;
+   phdr-p_vaddr   = 0;
+   phdr-p_align   = 0;
+
+   phdr-p_paddr   = fadump_relocate(paddr_vmcoreinfo_note());
+   phdr-p_offset  = phdr-p_paddr;
+   phdr-p_memsz   = vmcoreinfo_max_size;
+   phdr-p_filesz  = vmcoreinfo_max_size;
+
+   /* Increment number of program headers. */
+   (elf-e_phnum)++;
+
/* setup PT_LOAD sections. */
 
for (i = 0; i  crash_mem_ranges; i++) {

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH v5 3/9] fadump: Register for firmware assisted dump.

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

This patch registers for firmware-assisted dump using rtas token
ibm,configure-kernel-dump. During registration firmware is informed about
the reserved area where it saves the CPU state data, HPTE table and contents
of RMR region at the time of kernel crash. Apart from this, firmware also
preserves the contents of entire partition memory even if it is not specified
during registration.

This patch also populates sysfs files under /sys/kernel to display
fadump status and reserved memory regions.

Change in v3:
- Re-factored the implementation to work with kdump service start/stop.
  Introduce fadump_registered sysfs control file which will be used by
  kdump init scripts to start/stop firmware assisted dump. echo 1 to
  /sys/kernel/fadump_registered file for fadump registration and
  echo 0 to /sys/kernel/fadump_registered file for fadump un-registration.
- Introduced the locking mechanism to handle simultaneous writes to
  /sys/kernel/fadump_registered file.

Change in v2:
- Removed few debug print statements.
- Moved the setup_fadump() call from setup_system() and now calling it
  subsys_initcall.
- Moved fadump_region attribute under debugfs.
- Clear the TCE entries if firmware assisted dump is active.

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/fadump.h |   57 ++
 arch/powerpc/kernel/fadump.c  |  352 +
 arch/powerpc/kernel/iommu.c   |8 +
 arch/powerpc/mm/hash_utils_64.c   |   11 +
 4 files changed, 424 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 86b17e8..c2951b2 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -46,6 +46,58 @@
 #define FADUMP_HPTE_REGION 0x0002
 #define FADUMP_REAL_MODE_REGION0x0011
 
+/* Dump request flag */
+#define FADUMP_REQUEST_FLAG0x0001
+
+/* FAD commands */
+#define FADUMP_REGISTER1
+#define FADUMP_UNREGISTER  2
+#define FADUMP_INVALIDATE  3
+
+/* Kernel Dump section info */
+struct fadump_section {
+   u32 request_flag;
+   u16 source_data_type;
+   u16 error_flags;
+   u64 source_address;
+   u64 source_len;
+   u64 bytes_dumped;
+   u64 destination_address;
+};
+
+/* ibm,configure-kernel-dump header. */
+struct fadump_section_header {
+   u32 dump_format_version;
+   u16 dump_num_sections;
+   u16 dump_status_flag;
+   u32 offset_first_dump_section;
+
+   /* Fields for disk dump option. */
+   u32 dd_block_size;
+   u64 dd_block_offset;
+   u64 dd_num_blocks;
+   u32 dd_offset_disk_path;
+
+   /* Maximum time allowed to prevent an automatic dump-reboot. */
+   u32 max_time_auto;
+};
+
+/*
+ * Firmware Assisted dump memory structure. This structure is required for
+ * registering future kernel dump with power firmware through rtas call.
+ *
+ * No disk dump option. Hence disk dump path string section is not included.
+ */
+struct fadump_mem_struct {
+   struct fadump_section_headerheader;
+
+   /* Kernel dump sections */
+   struct fadump_section   cpu_state_data;
+   struct fadump_section   hpte_region;
+   struct fadump_section   rmr_region;
+};
+
+/* Firmware-assisted dump configuration details. */
 struct fw_dump {
unsigned long   cpu_state_data_size;
unsigned long   hpte_region_size;
@@ -60,10 +112,15 @@ struct fw_dump {
unsigned long   fadump_enabled:1;
unsigned long   fadump_supported:1;
unsigned long   dump_active:1;
+   unsigned long   dump_registered:1;
 };
 
 extern int early_init_dt_scan_fw_dump(unsigned long node,
const char *uname, int depth, void *data);
 extern int fadump_reserve_mem(void);
+extern int setup_fadump(void);
+extern int is_fadump_active(void);
+#else  /* CONFIG_FA_DUMP */
+static inline int is_fadump_active(void) { return 0; }
 #endif
 #endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d94fc0e..15f4751 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -29,6 +29,9 @@
 
 #include linux/string.h
 #include linux/memblock.h
+#include linux/delay.h
+#include linux/debugfs.h
+#include linux/seq_file.h
 
 #include asm/page.h
 #include asm/prom.h
@@ -46,6 +49,10 @@ struct dump_section {
 } __packed;
 
 static struct fw_dump fw_dump;
+static struct fadump_mem_struct fdm;
+static const struct fadump_mem_struct *fdm_active;
+
+static DEFINE_MUTEX(fadump_mutex);
 
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node,
@@ -74,7 +81,8 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
 * The 'ibm,kernel-dump' rtas node is present only if there is
 * dump data 

[RFC PATCH v5 4/9] fadump: Initialize elfcore header and add PT_LOAD program headers.

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

Build the crash memory range list by traversing through system memory during
the first kernel before we register for firmware-assisted dump. After the
successful dump registration, initialize the elfcore header and populate
PT_LOAD program headers with crash memory ranges. The elfcore header is
saved in the scratch area within the reserved memory. The scratch area starts
at the end of the memory reserved for saving RMR region contents. The
scratch area contains fadump crash info structure that contains magic number
for fadump validation and physical address where the elfcore header can be
found. This structure will also be used to pass some important crash info
data to the second kernel which will help second kernel to populate ELF core
header with correct data before it gets exported through /proc/vmcore. Since
the firmware preserves the entire partition memory at the time of crash the
contents of the scratch area will be preserved till second kernel boot.

NOTE: The current design implementation does not address a possibility of
introducing additional fields (in future) to this structure without affecting
compatibility. It's on TODO list to come up with better approach to
address this.

Reserved dump area start = +-+
|  CPU state dump data|
+-+
|  HPTE region data   |
+-+
|  RMR region data|
Scratch area start   = +-+
|  fadump crash info structure {  |
| magic number    |
 +--| elfcorehdr_addr |
 |  |  }  |
 + +-+
|  ELF core header|
Reserved dump area end   = +-+

Change in v5:
- Added 'fadump_' prefix to all static functions defined.

Change in v4:
- Move the init_elfcore_header() function and 'memblock_num_regions' macro
  from generic code to power specific code as these are used only by
  firmware assisted dump implementation which is power specific feature.

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/fadump.h |   43 +++
 arch/powerpc/kernel/fadump.c  |  233 +
 2 files changed, 275 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index c2951b2..c022d5c 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -41,6 +41,12 @@
 #define MIN_BOOT_MEM   (((RMR_END  (0x1UL  28)) ? (0x1UL  28) : RMR_END) \
+ (0x1UL  26))
 
+#define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt)
+
+#ifndef ELF_CORE_EFLAGS
+#define ELF_CORE_EFLAGS 0
+#endif
+
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA  0x0001
 #define FADUMP_HPTE_REGION 0x0002
@@ -54,6 +60,9 @@
 #define FADUMP_UNREGISTER  2
 #define FADUMP_INVALIDATE  3
 
+/* Dump status flag */
+#define FADUMP_ERROR_FLAG  0x2000
+
 /* Kernel Dump section info */
 struct fadump_section {
u32 request_flag;
@@ -107,6 +116,7 @@ struct fw_dump {
/* cmd line option during boot */
unsigned long   reserve_bootvar;
 
+   unsigned long   fadumphdr_addr;
int ibm_configure_kernel_dump;
 
unsigned long   fadump_enabled:1;
@@ -115,6 +125,39 @@ struct fw_dump {
unsigned long   dump_registered:1;
 };
 
+/*
+ * Copy the ascii values for first 8 characters from a string into u64
+ * variable at their respective indexes.
+ * e.g.
+ *  The string FADMPINF will be converted into 0x4641444d50494e46
+ */
+static inline u64 str_to_u64(const char *str)
+{
+   u64 val = 0;
+   int i;
+
+   for (i = 0; i  sizeof(val); i++)
+   val = (*str) ? (val  8) | *str++ : val  8;
+   return val;
+}
+#define STR_TO_HEX(x)  str_to_u64(x)
+
+#define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX(FADMPINF)
+
+/* fadump crash info structure */
+struct fadump_crash_info_header {
+   u64 magic_number;
+   u64 elfcorehdr_addr;
+};
+
+/* Crash memory ranges */
+#define INIT_CRASHMEM_RANGES   (INIT_MEMBLOCK_REGIONS + 2)
+
+struct fad_crash_memory_ranges {
+   unsigned long long  base;
+   unsigned long long  size;
+};
+
 extern int early_init_dt_scan_fw_dump(unsigned long node,
const char *uname, int depth, void *data);
 extern int fadump_reserve_mem(void);
diff --git 

[RFC PATCH v5 5/9] fadump: Convert firmware-assisted cpu state dump data into elf notes.

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

When registered for firmware assisted dump on powerpc, firmware preserves
the registers for the active CPUs during a system crash. This patch reads
the cpu register data stored in Firmware-assisted dump format (except for
crashing cpu) and converts it into elf notes and updates the PT_NOTE program
header accordingly. The exact register state for crashing cpu is saved to
fadump crash info structure in scratch area during crash_fadump() and read
during second kernel boot.

Change in v5:
- Added 'fadump_' prefix to all static function defined.

Change in v4:
- Fixes an issue where memblock_free() is invoked from build_cpu_notes()
  function during error_out path. Invoke cpu_notes_buf_free() in error_out
  path instead of memblock_free().

Change in v2:
- Moved the crash_fadump() invocation from generic code to panic notifier.
- Introduced cpu_notes_buf_alloc() function to allocate cpu notes buffer
  using get_free_pages(). The reason is, with the use of subsys_initcall
  the setup_fadump() is now called after mem_init(). Hence use of
  get_free_pages() to allocate memory is more appropriate than using
  memblock_alloc().

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/fadump.h  |   44 +
 arch/powerpc/kernel/fadump.c   |  314 
 arch/powerpc/kernel/setup-common.c |6 +
 arch/powerpc/kernel/traps.c|3 
 4 files changed, 365 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index c022d5c..72908e3 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -63,6 +63,18 @@
 /* Dump status flag */
 #define FADUMP_ERROR_FLAG  0x2000
 
+#define FADUMP_CPU_ID_MASK ((1UL  32) - 1)
+
+#define CPU_UNKNOWN(~((u32)0))
+
+/* Utility macros */
+#define SKIP_TO_NEXT_CPU(reg_entry)\
+({ \
+   while (reg_entry-reg_id != REG_ID(CPUEND))   \
+   reg_entry++;\
+   reg_entry++;\
+})
+
 /* Kernel Dump section info */
 struct fadump_section {
u32 request_flag;
@@ -117,6 +129,9 @@ struct fw_dump {
unsigned long   reserve_bootvar;
 
unsigned long   fadumphdr_addr;
+   unsigned long   cpu_notes_buf;
+   unsigned long   cpu_notes_buf_size;
+
int ibm_configure_kernel_dump;
 
unsigned long   fadump_enabled:1;
@@ -141,13 +156,40 @@ static inline u64 str_to_u64(const char *str)
return val;
 }
 #define STR_TO_HEX(x)  str_to_u64(x)
+#define REG_ID(x)  str_to_u64(x)
 
 #define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX(FADMPINF)
+#define REGSAVE_AREA_MAGIC STR_TO_HEX(REGSAVE)
+
+/* The firmware-assisted dump format.
+ *
+ * The register save area is an area in the partition's memory used to preserve
+ * the register contents (CPU state data) for the active CPUs during a firmware
+ * assisted dump. The dump format contains register save area header followed
+ * by register entries. Each list of registers for a CPU starts with
+ * CPUSTRT and ends with CPUEND.
+ */
+
+/* Register save area header. */
+struct fadump_reg_save_area_header {
+   u64 magic_number;
+   u32 version;
+   u32 num_cpu_offset;
+};
+
+/* Register entry. */
+struct fadump_reg_entry {
+   u64 reg_id;
+   u64 reg_value;
+};
 
 /* fadump crash info structure */
 struct fadump_crash_info_header {
u64 magic_number;
u64 elfcorehdr_addr;
+   u32 crashing_cpu;
+   struct pt_regs  regs;
+   struct cpumask  cpu_online_mask;
 };
 
 /* Crash memory ranges */
@@ -163,7 +205,9 @@ extern int early_init_dt_scan_fw_dump(unsigned long node,
 extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
+extern void crash_fadump(struct pt_regs *, const char *);
 #else  /* CONFIG_FA_DUMP */
 static inline int is_fadump_active(void) { return 0; }
+static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 #endif
 #endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 082f85a..1879ddf 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -244,6 +244,7 @@ static unsigned long get_fadump_area_size(void)
size += fw_dump.boot_memory_size;
size += sizeof(struct fadump_crash_info_header);
size += sizeof(struct elfhdr); /* ELF core header.*/
+   size += sizeof(struct elf_phdr); /* place holder for cpu notes */
/* Program headers for crash memory regions. */
size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
 
@@ -397,6 +398,285 @@ static void register_fw_dump(struct 

[RFC PATCH 2/2] WIP: PowerPC cache cleanup

2011-11-15 Thread Kyle Moffett
[My apologies for the resend, it does not seem to have hit the MLs.
I think my git send-email cc-cmd may have broken somehow, oops.]

This badly needs breaking up, and a better changelog... oh well...

The big changes:

* The ppc64_caches structure is now powerpc_caches and is used on
  both PPC32 and PPC64.  I hated staring at the pages and pages of
  assembly code, so nearly all of the functions are now C with tiny
  snippets of inline ASM in the loops.

* Lots of ugly assembly functions in arch/powerpc/kernel/misc_*.S were
  rewritten as cleaner inline ASM in arch/powerpc/mm/cache.c

* I'm not sure that the physical address functions from those files
  actually came out cleaner, but they are now more correct.

* I'm not 100% sure I like the new FOR_EACH_CACHE_LINE() macro, but it
  sure does make a lot of the other code much cleaner.

* I have a bit of a temptation to try to merge the 32/64-bit variants
  of copy_page() into a single C function.  A quick test seems to show
  that I can get nearly identical output to the 64-bit ASM with very
  little work.


---
 arch/powerpc/include/asm/cache.h |  155 ---
 arch/powerpc/include/asm/cacheflush.h|3 -
 arch/powerpc/include/asm/page.h  |6 +
 arch/powerpc/include/asm/page_32.h   |4 +-
 arch/powerpc/include/asm/page_64.h   |   17 --
 arch/powerpc/kernel/align.c  |7 +-
 arch/powerpc/kernel/asm-offsets.c|   13 +-
 arch/powerpc/kernel/head_32.S|9 +-
 arch/powerpc/kernel/head_64.S|2 +-
 arch/powerpc/kernel/misc_32.S|  193 --
 arch/powerpc/kernel/misc_64.S|  182 -
 arch/powerpc/kernel/ppc_ksyms.c  |3 -
 arch/powerpc/kernel/setup-common.c   |  103 ++
 arch/powerpc/kernel/setup.h  |1 +
 arch/powerpc/kernel/setup_32.c   |   11 +-
 arch/powerpc/kernel/setup_64.c   |  118 +--
 arch/powerpc/kernel/vdso.c   |   27 +--
 arch/powerpc/lib/copypage_64.S   |   10 +-
 arch/powerpc/mm/Makefile |2 +-
 arch/powerpc/mm/cache.c  |  279 ++
 arch/powerpc/mm/dma-noncoherent.c|2 +-
 arch/powerpc/platforms/52xx/lite5200_sleep.S |9 +-
 arch/powerpc/platforms/powermac/pci.c|2 +-
 arch/powerpc/xmon/xmon.c |   53 +++---
 drivers/macintosh/smu.c  |8 +-
 25 files changed, 599 insertions(+), 620 deletions(-)
 create mode 100644 arch/powerpc/mm/cache.c

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 4b50941..b1dc08f 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -3,47 +3,142 @@
 
 #ifdef __KERNEL__
 
-
-/* bytes per L1 cache line */
-#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
-#define L1_CACHE_SHIFT 4
-#define MAX_COPY_PREFETCH  1
+/*
+ * Various PowerPC CPUs which are otherwise compatible have different L1
+ * cache line sizes.
+ *
+ * Unfortunately, lots of kernel code assumes that L1_CACHE_BYTES and
+ * L1_CACHE_SHIFT are compile-time constants that can be used to align
+ * data-structures to avoid false cacheline sharing, so we can't just
+ * compute them at runtime from the cputable values.
+ *
+ * So for alignment purposes, we will compute these values as safe maximums
+ * of all the CPU support compiled into the kernel.
+ */
+#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_47x)
+# define L1_CACHE_SHIFT_MAX 7 /* 128-byte cache blocks */
 #elif defined(CONFIG_PPC_E500MC)
-#define L1_CACHE_SHIFT 6
-#define MAX_COPY_PREFETCH  4
-#elif defined(CONFIG_PPC32)
-#define MAX_COPY_PREFETCH  4
-#if defined(CONFIG_PPC_47x)
-#define L1_CACHE_SHIFT 7
+# define L1_CACHE_SHIFT_MAX 6 /* 64-byte cache blocks */
 #else
-#define L1_CACHE_SHIFT 5
+# define L1_CACHE_SHIFT_MAX 5 /* 32-byte cache blocks */
 #endif
+#define L1_CACHE_BYTES_MAX (1  L1_CACHE_SHIFT_MAX)
+
+#define L1_CACHE_SHIFT  L1_CACHE_SHIFT_MAX
+#define L1_CACHE_BYTES  L1_CACHE_BYTES_MAX
+#define SMP_CACHE_BYTES L1_CACHE_BYTES_MAX
+
+/*
+ * Unfortunately, for other purposes, we can't just use a safe maximum value
+ * because it gets used in loops when invalidating or clearing cachelines and
+ * it would be very bad to only flush/invalidate/zero/etc every 4th one.
+ *
+ * During early initialization we load these values from the device-tree and
+ * the cputable into the powerpc_caches structure, but we need to be able to
+ * clear pages before that occurs, so these need sane default values.
+ *
+ * As explained in the powerpc_caches structure definition, the defaults
+ * should be safe minimums, so that's what we compute here.
+ */
+#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
+# define L1_CACHE_SHIFT_MIN 4 /* 16-byte cache blocks */
+#elif 

[RFC PATCH v5 2/9] fadump: Reserve the memory for firmware assisted dump.

2011-11-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar mah...@linux.vnet.ibm.com

Reserve the memory during early boot to preserve CPU state data, HPTE region
and RMR region data in case of kernel crash. At the time of crash, powerpc
firmware will store CPU state data, HPTE region data and move RMR region
data to the reserved memory area.

If the firmware-assisted dump fails to reserve the memory, then fallback
to existing kexec-based kdump.

Most of the code to reserve memory has been adapted from the phyp
assisted dump implementation written by Linas Vepstas and Manish Ahuja.

Change in v5:
- Merged patch 10/10 which introduces a config option CONFIG_FA_DUMP
  for firmware assisted dump feature on Powerpc (ppc64) architecture.
- Increased MIN_BOOT_MEM by 64M to avoid OOM issue during network
  dump capture. When kdump infrastructure is configured to save vmcore
  over network, we run into OOM issue while loading modules related to
  network setup.

Change in v2:
- Modified to use standard pr_debug() macro.
- Modified early_init_dt_scan_fw_dump() to get the size of
  ibm,configure-kernel-dump-sizes property and use it to iterate through
  an array of dump sections.
- Introduced boot option 'fadump_reserve_mem=' to let user specify the
  fadump boot memory to be reserved.

Signed-off-by: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
---
 arch/powerpc/Kconfig  |   13 ++
 arch/powerpc/include/asm/fadump.h |   69 ++
 arch/powerpc/kernel/Makefile  |1 
 arch/powerpc/kernel/fadump.c  |  250 +
 arch/powerpc/kernel/prom.c|   15 ++
 5 files changed, 347 insertions(+), 1 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fadump.h
 create mode 100644 arch/powerpc/kernel/fadump.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6926b61..7ce773c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -379,6 +379,19 @@ config PHYP_DUMP
 
  If unsure, say N
 
+config FA_DUMP
+   bool Firmware-assisted dump
+   depends on PPC64  PPC_RTAS  CRASH_DUMP
+   help
+ A robust mechanism to get reliable kernel crash dump with
+ assistance from firmware. This approach does not use kexec,
+ instead firmware assists in booting the kdump kernel
+ while preserving memory contents. Firmware-assisted dump
+ is meant to be a kdump replacement offering robustness and
+ speed not possible without system firmware assistance.
+
+ If unsure, say N
+
 config PPCBUG_NVRAM
bool Enable reading PPCBUG NVRAM during boot if PPLUS || LOPEC
default y if PPC_PREP
diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
new file mode 100644
index 000..86b17e8
--- /dev/null
+++ b/arch/powerpc/include/asm/fadump.h
@@ -0,0 +1,69 @@
+/*
+ * Firmware Assisted dump header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
+ */
+
+#ifndef __PPC64_FA_DUMP_H__
+#define __PPC64_FA_DUMP_H__
+
+#ifdef CONFIG_FA_DUMP
+
+/*
+ * The RMR region will be saved for later dumping when kernel crashes.
+ * Set this to RMO size.
+ */
+#define RMR_START  0x0
+#define RMR_END(ppc64_rma_size)
+
+/*
+ * On some Power systems where RMO is 128MB, it still requires minimum of
+ * 256MB for kernel to boot successfully. When kdump infrastructure is
+ * configured to save vmcore over network, we run into OOM issue while
+ * loading modules related to network setup. Hence we need additional 64M
+ * of memory to avoid OOM issue.
+ */
+#define MIN_BOOT_MEM   (((RMR_END  (0x1UL  28)) ? (0x1UL  28) : RMR_END) \
+   + (0x1UL  26))
+
+/* Firmware provided dump sections */
+#define FADUMP_CPU_STATE_DATA  0x0001
+#define FADUMP_HPTE_REGION 0x0002
+#define FADUMP_REAL_MODE_REGION0x0011
+
+struct fw_dump {
+   unsigned long   cpu_state_data_size;
+   unsigned long   hpte_region_size;
+   unsigned long   boot_memory_size;
+   unsigned long   reserve_dump_area_start;
+   unsigned long   reserve_dump_area_size;
+   /* cmd line option during boot */
+   unsigned long   reserve_bootvar;
+
+   int ibm_configure_kernel_dump;
+
+   

Re: [PATCH] net: fsl_pq_mdio: fix non tbi phy access

2011-11-15 Thread Baruch Siach
Hi Andy,

On Tue, Nov 15, 2011 at 09:06:03AM -0600, Andy Fleming wrote:
 On Nov 14, 2011, at 11:17 PM, Baruch Siach wrote:
  On Mon, Nov 14, 2011 at 09:04:47PM +, Fleming Andy-AFLEMING wrote:

[snip]

  And looking at the p1010si.dtsi, I see that it's automatically there for 
  you.
  
  How were you breaking?
  
  Adding linuxppc to Cc.
  
  My board is P1011 based, the single core version of P1020, not P1010. In 
  p1020si.dtsi I see no tbi node. In p1020rdb.dts I see a tbi node but only 
  for 
  mdio@25000, not mdio@24000, which is what I'm using.
  
  Am I missing something?
 
 Well, that's a bug. In truth, the silicon dtsi trees should not have tbi 
 nodes, as that's highly machine-specific. The p1020rdb is apparently relying 
 on the old behavior, which is broken, and due to the fact that the first 
 ethernet interface doesn't *use* the TBI PHY.
 
 You should add this to your board tree:
 
 mdio@24000 {
 
 tbi0: tbi-phy@11 {
 reg = 0x11;
 device_type = tbi-phy;
 };
 };
 
 And add the PHYs you use, as well as set reg (and the value after the @) 
 to something that makes sense for your board.

Thanks for your detailed explanation and prompt response. I've added a tbi 
node, dropped my patch, and now my board works as expected.

 I am going to go right now, and add tbi nodes for all of the Freescale 
 platforms. I will also modify the fsl_pq_mdio code to be more explicit about 
 its reason for failure.

Please Cc me on these.

Thanks,
baruch

-- 
 ~. .~   Tk Open Systems
=}ooO--U--Ooo{=
   - bar...@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level sensitive for PCIe

2011-11-15 Thread Kumar Gala

On Nov 7, 2011, at 2:32 AM, Roy Zang wrote:

 P1023 external IRQ[4:6, 11] do not pin out, but the interrupts are
 shared with PCIe controller.
 The silicon internally ties the interrupts to L, so change the
 IRQ[4:6,11] to high level sensitive for PCIe.
 
 Signed-off-by: Roy Zang tie-fei.z...@freescale.com
 ---
 arch/powerpc/boot/dts/p1023rds.dts |8 
 1 files changed, 4 insertions(+), 4 deletions(-)
 
 diff --git a/arch/powerpc/boot/dts/p1023rds.dts 
 b/arch/powerpc/boot/dts/p1023rds.dts
 index d9b7767..66bf804 100644
 --- a/arch/powerpc/boot/dts/p1023rds.dts
 +++ b/arch/powerpc/boot/dts/p1023rds.dts
 @@ -490,9 +490,9 @@
   interrupt-map-mask = 0xf800 0 0 7;
   interrupt-map = 
   /* IDSEL 0x0 */
 -  0 0 1 mpic 4 1
 -  0 0 2 mpic 5 1
 -  0 0 3 mpic 6 1
 +  0 0 1 mpic 4 2
 +  0 0 2 mpic 5 2
 +  0 0 3 mpic 6 2
    0 0 4 mpic 7 1
   ;
   ranges = 0x200 0x0 0xa000
 @@ -532,7 +532,7 @@
    0 0 1 mpic 8 1
    0 0 2 mpic 9 1
    0 0 3 mpic 10 1
 -  0 0 4 mpic 11 1
 +  0 0 4 mpic 11 2
   ;
   ranges = 0x200 0x0 0x8000
 0x200 0x0 0x8000
 -- 
 1.6.0.6
 

Should we be setting ALL PCIe interrupts to '2'?  As I think in general we say 
these PCIe are 'active high'.  The only reason I would think we would NOT do 
this is if they are shared with some external device that is 'active low'.  If 
so we should comment that somewhere (maybe in the .dts, maybe just in the 
commit message).

- k

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level sensitive for PCIe

2011-11-15 Thread Scott Wood
On 11/15/2011 03:51 PM, Kumar Gala wrote:
 
 On Nov 7, 2011, at 2:32 AM, Roy Zang wrote:
 
 Should we be setting ALL PCIe interrupts to '2'?  As I think in general
 we say these PCIe are 'active high'.  The only reason I would think
 we would NOT do this is if they are shared with some external device
 that is 'active low'.  If so we should comment that somewhere (maybe
 in the .dts, maybe just in the commit message).

I'd assume the ones that are pinned out are pulled high on the board.
Active-low is normal, it's these non-pinned-out external interrupts
that are pulled low inside the SoC that are weird.

-Scott

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup

2011-11-15 Thread Benjamin Herrenschmidt
On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
 Unfortunately, I've been staring at PPC asm for long enough that I
 have a migraine headache and I'm going to have to stop here for now.
 If somebody else wants to tackle fixing up the 32-bit copy_page() and
 __copy_tofrom_user() routines it would be highly appreciated. 

Yeah that's the one everybody's avoiding :-)

What about my idea of instead compiling it multiple times with a
different size and fixing up the branch to call the right one ?

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 1/2] powerpc: Remove duplicate cacheable_memcpy/memzero functions

2011-11-15 Thread Benjamin Herrenschmidt
On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
 These functions are only used from one place each.  If the cacheable_*
 versions really are more efficient, then those changes should be
 migrated into the common code instead.
 
 NOTE: The old routines are just flat buggy on kernels that support
   hardware with different cacheline sizes.
 
 Signed-off-by: Kyle Moffett kyle.d.moff...@boeing.com
 ---

Right, considering where those are used, I think we can safely remove
them. Thanks.

Ben.

  arch/powerpc/include/asm/system.h|2 -
  arch/powerpc/kernel/ppc_ksyms.c  |2 -
  arch/powerpc/lib/copy_32.S   |  127 
 --
  arch/powerpc/mm/ppc_mmu_32.c |2 +-
  drivers/net/ethernet/ibm/emac/core.c |   12 +---
  5 files changed, 3 insertions(+), 142 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/system.h 
 b/arch/powerpc/include/asm/system.h
 index e30a13d..25389d1 100644
 --- a/arch/powerpc/include/asm/system.h
 +++ b/arch/powerpc/include/asm/system.h
 @@ -189,8 +189,6 @@ static inline void flush_spe_to_thread(struct task_struct 
 *t)
  #endif
  
  extern int call_rtas(const char *, int, int, unsigned long *, ...);
 -extern void cacheable_memzero(void *p, unsigned int nb);
 -extern void *cacheable_memcpy(void *, const void *, unsigned int);
  extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
  extern void bad_page_fault(struct pt_regs *, unsigned long, int);
  extern int die(const char *, struct pt_regs *, long);
 diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
 index d3114a7..acba8ce 100644
 --- a/arch/powerpc/kernel/ppc_ksyms.c
 +++ b/arch/powerpc/kernel/ppc_ksyms.c
 @@ -159,8 +159,6 @@ EXPORT_SYMBOL(screen_info);
  #ifdef CONFIG_PPC32
  EXPORT_SYMBOL(timer_interrupt);
  EXPORT_SYMBOL(tb_ticks_per_jiffy);
 -EXPORT_SYMBOL(cacheable_memcpy);
 -EXPORT_SYMBOL(cacheable_memzero);
  #endif
  
  #ifdef CONFIG_PPC32
 diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
 index 55f19f9..6813f80 100644
 --- a/arch/powerpc/lib/copy_32.S
 +++ b/arch/powerpc/lib/copy_32.S
 @@ -69,54 +69,6 @@ CACHELINE_BYTES = L1_CACHE_BYTES
  LG_CACHELINE_BYTES = L1_CACHE_SHIFT
  CACHELINE_MASK = (L1_CACHE_BYTES-1)
  
 -/*
 - * Use dcbz on the complete cache lines in the destination
 - * to set them to zero.  This requires that the destination
 - * area is cacheable.  -- paulus
 - */
 -_GLOBAL(cacheable_memzero)
 - mr  r5,r4
 - li  r4,0
 - addir6,r3,-4
 - cmplwi  0,r5,4
 - blt 7f
 - stwur4,4(r6)
 - beqlr
 - andi.   r0,r6,3
 - add r5,r0,r5
 - subfr6,r0,r6
 - clrlwi  r7,r6,32-LG_CACHELINE_BYTES
 - add r8,r7,r5
 - srwir9,r8,LG_CACHELINE_BYTES
 - addic.  r9,r9,-1/* total number of complete cachelines */
 - ble 2f
 - xorir0,r7,CACHELINE_MASK  ~3
 - srwi.   r0,r0,2
 - beq 3f
 - mtctr   r0
 -4:   stwur4,4(r6)
 - bdnz4b
 -3:   mtctr   r9
 - li  r7,4
 -10:  dcbzr7,r6
 - addir6,r6,CACHELINE_BYTES
 - bdnz10b
 - clrlwi  r5,r8,32-LG_CACHELINE_BYTES
 - addir5,r5,4
 -2:   srwir0,r5,2
 - mtctr   r0
 - bdz 6f
 -1:   stwur4,4(r6)
 - bdnz1b
 -6:   andi.   r5,r5,3
 -7:   cmpwi   0,r5,0
 - beqlr
 - mtctr   r5
 - addir6,r6,3
 -8:   stbur4,1(r6)
 - bdnz8b
 - blr
 -
  _GLOBAL(memset)
   rlwimi  r4,r4,8,16,23
   rlwimi  r4,r4,16,0,15
 @@ -142,85 +94,6 @@ _GLOBAL(memset)
   bdnz8b
   blr
  
 -/*
 - * This version uses dcbz on the complete cache lines in the
 - * destination area to reduce memory traffic.  This requires that
 - * the destination area is cacheable.
 - * We only use this version if the source and dest don't overlap.
 - * -- paulus.
 - */
 -_GLOBAL(cacheable_memcpy)
 - add r7,r3,r5/* test if the src  dst overlap */
 - add r8,r4,r5
 - cmplw   0,r4,r7
 - cmplw   1,r3,r8
 - crand   0,0,4   /* cr0.lt = cr1.lt */
 - blt memcpy  /* if regions overlap */
 -
 - addir4,r4,-4
 - addir6,r3,-4
 - neg r0,r3
 - andi.   r0,r0,CACHELINE_MASK/* # bytes to start of cache line */
 - beq 58f
 -
 - cmplw   0,r5,r0 /* is this more than total to do? */
 - blt 63f /* if not much to do */
 - andi.   r8,r0,3 /* get it word-aligned first */
 - subfr5,r0,r5
 - mtctr   r8
 - beq+61f
 -70:  lbz r9,4(r4)/* do some bytes */
 - stb r9,4(r6)
 - addir4,r4,1
 - addir6,r6,1
 - bdnz70b
 -61:  srwi.   r0,r0,2
 - mtctr   r0
 - beq 58f
 -72:  lwzur9,4(r4)/* do some words */
 - stwur9,4(r6)
 - bdnz72b
 -
 -58:  srwi.   r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
 - 

Re: [RFC PATCH 00/17] powerpc/e500: separate e500 from e500mc

2011-11-15 Thread Benjamin Herrenschmidt
On Mon, 2011-11-14 at 20:36 -0600, Moffett, Kyle D wrote:
 So when you are clearing a whole page, there are only 2 things you can do
 wrong with dcbz:
 
   (1) Call dcbz with an address outside of the page you want to zero.
 
   (2) Omit calls to dcbz for some physical cachelines in the page.
 
 Now, that's a totally different story from the userspace memset() calls
 that caused the problem originally, because they were frequently given
 memory much smaller than a page to clear, and if you didn't know exactly
 how many bytes a dcbz was going to clear you couldn't use it at all.

Right. That's why we pass the cache line sizes to userspace via the elf
AUX table so they don't do stupid things like that :-)

 But the kernel doesn't do that anywhere, it just uses it for page clears. 

Right, so we could easily precalc the count & increment and use a soft
loop.

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 2/2] WIP: PowerPC cache cleanup

2011-11-15 Thread Benjamin Herrenschmidt
On Tue, 2011-11-15 at 10:22 -0500, Kyle Moffett wrote:
 [My apologies for the resend, it does not seem to have hit the MLs.
 I think my git send-email cc-cmd may have broken somehow, oops.]

Or the ML took a while because it's big :-) I got both.

I'll try to review this week. Probably won't get to it today tho.

Thanks for looking at this !

Cheers,
Ben.

 This badly needs breaking up, and a better changelog... oh well...
 
 The big changes:
 
 * The ppc64_caches structure is now powerpc_caches and is used on
   both PPC32 and PPC64.  I hated staring at the pages and pages of
   assembly code, so nearly all of the functions are now C with tiny
   snippets of inline ASM in the loops.
 
 * Lots of ugly assembly functions in arch/powerpc/kernel/misc_*.S were
   rewritten as cleaner inline ASM in arch/powerpc/mm/cache.c
 
 * I'm not sure that the physical address functions from those files
   actually came out cleaner, but they are now more correct.
 
 * I'm not 100% sure I like the new FOR_EACH_CACHE_LINE() macro, but it
   sure does make a lot of the other code much cleaner.
 
 * I have a bit of a temptation to try to merge the 32/64-bit variants
   of copy_page() into a single C function.  A quick test seems to show
   that I can get nearly identical output to the 64-bit ASM with very
   little work.
 
 
 ---
  arch/powerpc/include/asm/cache.h |  155 ---
  arch/powerpc/include/asm/cacheflush.h|3 -
  arch/powerpc/include/asm/page.h  |6 +
  arch/powerpc/include/asm/page_32.h   |4 +-
  arch/powerpc/include/asm/page_64.h   |   17 --
  arch/powerpc/kernel/align.c  |7 +-
  arch/powerpc/kernel/asm-offsets.c|   13 +-
  arch/powerpc/kernel/head_32.S|9 +-
  arch/powerpc/kernel/head_64.S|2 +-
  arch/powerpc/kernel/misc_32.S|  193 --
  arch/powerpc/kernel/misc_64.S|  182 -
  arch/powerpc/kernel/ppc_ksyms.c  |3 -
  arch/powerpc/kernel/setup-common.c   |  103 ++
  arch/powerpc/kernel/setup.h  |1 +
  arch/powerpc/kernel/setup_32.c   |   11 +-
  arch/powerpc/kernel/setup_64.c   |  118 +--
  arch/powerpc/kernel/vdso.c   |   27 +--
  arch/powerpc/lib/copypage_64.S   |   10 +-
  arch/powerpc/mm/Makefile |2 +-
  arch/powerpc/mm/cache.c  |  279 
 ++
  arch/powerpc/mm/dma-noncoherent.c|2 +-
  arch/powerpc/platforms/52xx/lite5200_sleep.S |9 +-
  arch/powerpc/platforms/powermac/pci.c|2 +-
  arch/powerpc/xmon/xmon.c |   53 +++---
  drivers/macintosh/smu.c  |8 +-
  25 files changed, 599 insertions(+), 620 deletions(-)
  create mode 100644 arch/powerpc/mm/cache.c
 
 diff --git a/arch/powerpc/include/asm/cache.h 
 b/arch/powerpc/include/asm/cache.h
 index 4b50941..b1dc08f 100644
 --- a/arch/powerpc/include/asm/cache.h
 +++ b/arch/powerpc/include/asm/cache.h
 @@ -3,47 +3,142 @@
  
  #ifdef __KERNEL__
  
 -
 -/* bytes per L1 cache line */
 -#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
 -#define L1_CACHE_SHIFT   4
 -#define MAX_COPY_PREFETCH1
 +/*
 + * Various PowerPC CPUs which are otherwise compatible have different L1
 + * cache line sizes.
 + *
 + * Unfortunately, lots of kernel code assumes that L1_CACHE_BYTES and
 + * L1_CACHE_SHIFT are compile-time constants that can be used to align
 + * data-structures to avoid false cacheline sharing, so we can't just
 + * compute them at runtime from the cputable values.
 + *
 + * So for alignment purposes, we will compute these values as safe maximums
 + * of all the CPU support compiled into the kernel.
 + */
 +#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_47x)
 +# define L1_CACHE_SHIFT_MAX 7 /* 128-byte cache blocks */
  #elif defined(CONFIG_PPC_E500MC)
 -#define L1_CACHE_SHIFT   6
 -#define MAX_COPY_PREFETCH4
 -#elif defined(CONFIG_PPC32)
 -#define MAX_COPY_PREFETCH4
 -#if defined(CONFIG_PPC_47x)
 -#define L1_CACHE_SHIFT   7
 +# define L1_CACHE_SHIFT_MAX 6 /* 64-byte cache blocks */
  #else
 -#define L1_CACHE_SHIFT   5
 +# define L1_CACHE_SHIFT_MAX 5 /* 32-byte cache blocks */
  #endif
 +#define L1_CACHE_BYTES_MAX (1  L1_CACHE_SHIFT_MAX)
 +
 +#define L1_CACHE_SHIFT  L1_CACHE_SHIFT_MAX
 +#define L1_CACHE_BYTES  L1_CACHE_BYTES_MAX
 +#define SMP_CACHE_BYTES L1_CACHE_BYTES_MAX
 +
 +/*
 + * Unfortunately, for other purposes, we can't just use a safe maximum value
 + * because it gets used in loops when invalidating or clearing cachelines and
 + * it would be very bad to only flush/invalidate/zero/etc every 4th one.
 + *
 + * During early initialization we load these values from the device-tree and
 + * the cputable into the powerpc_caches structure, but we need to be able 

Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup

2011-11-15 Thread Moffett, Kyle D
On Nov 15, 2011, at 17:29, Benjamin Herrenschmidt wrote:
 On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
 Unfortunately, I've been staring at PPC asm for long enough that I
 have a migraine headache and I'm going to have to stop here for now.
 If somebody else wants to tackle fixing up the 32-bit copy_page() and
 __copy_tofrom_user() routines it would be highly appreciated. 
 
 Yeah that's the one everybody's avoiding :-)
 
 What about my idea of instead compiling it multiple times with a
 different size and fixing up the branch to call the right one ?

I guess that's doable, although I have to admit that idea almost gives
me more of a headache than trying to fix up the 32-bit ASM.

One thing that bothers me in particular is that both 32/64 versions of
__copy_tofrom_user() are dramatically overcomplicated for what they
ought to be doing.

It would seem that if we get a page fault during an unaligned copy, we
ought to just give up and fall back to a simple byte-by-byte copy loop
from wherever we left off.  That would eliminate 90% of the ugly
special cases without actually hurting performance, right?

For a page-fault during a cacheline-aligned copy, we should be able to
handle the exception and retry from the last cacheline without much
logic, again with good performance.

With that said, I'm curious about the origin of the PPC32 ASM.  In
particular, it looks like it was generated by GCC at some point in the
distant past, and I'm wondering if there's a good way to rewrite that
file in C and trick GCC into generating the relevant exception tables
for it?

Cheers,
Kyle Moffett

--
Curious about my work on the Debian powerpcspe port?
I'm keeping a blog here: http://pureperl.blogspot.com/
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[patch 1/1] drivers/edac/mpc85xx_edac.c: fix memory controller compatible for edac

2011-11-15 Thread akpm
From: Shaohui Xie shaohui@freescale.com
Subject: drivers/edac/mpc85xx_edac.c: fix memory controller compatible for edac

compatible in dts has been changed, so the driver needs to be updated
accordingly.

Signed-off-by: Shaohui Xie shaohui@freescale.com
Cc: Kumar Gala ga...@kernel.crashing.org
Cc: Grant Likely grant.lik...@secretlab.ca
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Signed-off-by: Andrew Morton a...@linux-foundation.org
---

 drivers/edac/mpc85xx_edac.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff -puN 
drivers/edac/mpc85xx_edac.c~drivers-edac-mpc85xx_edacc-fix-memory-controller-compatible-for-edac
 drivers/edac/mpc85xx_edac.c
--- 
a/drivers/edac/mpc85xx_edac.c~drivers-edac-mpc85xx_edacc-fix-memory-controller-compatible-for-edac
+++ a/drivers/edac/mpc85xx_edac.c
@@ -1128,7 +1128,7 @@ static struct of_device_id mpc85xx_mc_er
{ .compatible = fsl,p1020-memory-controller, },
{ .compatible = fsl,p1021-memory-controller, },
{ .compatible = fsl,p2020-memory-controller, },
-   { .compatible = fsl,p4080-memory-controller, },
+   { .compatible = fsl,qoriq-memory-controller, },
{},
 };
 MODULE_DEVICE_TABLE(of, mpc85xx_mc_err_of_match);
_
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup

2011-11-15 Thread Benjamin Herrenschmidt
On Tue, 2011-11-15 at 16:45 -0600, Moffett, Kyle D wrote:

 I guess that's doable, although I have to admit that idea almost gives
 me more of a headache than trying to fix up the 32-bit ASM.
 
 One thing that bothers me in particular is that both 32/64 versions of
 __copy_tofrom_user() are dramatically overcomplicated for what they
 ought to be doing.
 
 It would seem that if we get a page fault during an unaligned copy, we
 ought to just give up and fall back to a simple byte-by-byte copy loop
 from wherever we left off.  That would eliminate 90% of the ugly
 special cases without actually hurting performance, right?
 
 For a page-fault during a cacheline-aligned copy, we should be able to
 handle the exception and retry from the last cacheline without much
 logic, again with good performance.
 
 With that said, I'm curious about the origin of the PPC32 ASM.  In
 particular, it looks like it was generated by GCC at some point in the
 distant past, and I'm wondering if there's a good way to rewrite that
 file in C and trick GCC into generating the relevant exception tables
 for it?

There is some serious history in there :-)

I would check with Anton, he's been doing some performance work on those
lately (the 64-bit ones).

It's probably worth throwing a proof-of-concept simpler variant for
32-bit at least on the table and have people compare the perfs
(typically network perfs). I can test on a range of ppc32 here (6xx,
7xxx, 4xx).

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC][PATCH 15/30] powerpc/85xx: Rework P1022DS device tree

2011-11-15 Thread Tabi Timur-B04825
On Thu, Nov 10, 2011 at 10:13 AM, Kumar Gala ga...@kernel.crashing.org wrote:

 +       lbc: localbus@fffe05000 {
                reg = 0 0xffe05000 0 0x1000;
 -               interrupts = 19 2 0 0;

I just noticed this bug in the original p1022ds.dts, and I see you're
carrying it over here.  The reg property should look like this:

reg = 0xf 0xffe05000 0 0x1000;
   ^^^

Do you want to fix this here, or do you want me to submit a patch that
fixes the original p1022ds.dts?

-- 
Timur Tabi
Linux kernel developer at Freescale
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup

2011-11-15 Thread Moffett, Kyle D
On Nov 15, 2011, at 18:46, Benjamin Herrenschmidt wrote:
 On Tue, 2011-11-15 at 16:45 -0600, Moffett, Kyle D wrote:
 
 With that said, I'm curious about the origin of the PPC32 ASM.  In
 particular, it looks like it was generated by GCC at some point in the
 distant past, and I'm wondering if there's a good way to rewrite that
 file in C and trick GCC into generating the relevant exception tables
 for it?
 
 There is some serious history in there :-)
 
 I would check with Anton, he's been doing some performance work on those
 lately (the 64-bit ones).
 
 It's probably worth throwing a proof-of-concept simpler variant for
 32-bit at least on the table and have people compare the perfs
 (typically network perfs). I can test on a range of ppc32 here (6xx,
 7xxx, 4xx).

Ok, so there's not really a good way to make GCC generate the exception
tables itself.  I've come up with several overly-clever ways to do most
of what we would want using asm goto except that (1) asm goto cannot
have register outputs, and (2) asm goto is only available in GCC 4.5+

I could easily work around the former by putting the code into its own
file and creating a global register variable just for that file, but
the GCC 4.5+ dependency is a total nonstarter.

I'm trying to see if I can make it look better than it does now with
some judicious use of inline ASM.  At the very least, it should be
possible to have a wrapper function written in C which calls the ASM
guts with the correct cache params.

More importantly, the ASM code needs to use something other than
totally arbitrary numbers for labels.  :-D

Cheers,
Kyle Moffett

--
Curious about my work on the Debian powerpcspe port?
I'm keeping a blog here: http://pureperl.blogspot.com/

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] P1021: set IReady in QE Microcode Upload

2011-11-15 Thread Tabi Timur-B04825
On Mon, Nov 14, 2011 at 2:55 AM, Kokoris, Ioannis
ioannis.koko...@siemens-enterprise.com wrote:
 Ready register is needed for ROM-less devices such as P1021, MPC859, MPC8306 
 etc.
 For ROM-based devices such as MPC8323 the Ready register does not exist.
 Is there a global definition for conditionally including this code?

I'll have to check.  But this patch can't be applied as-is unless it's
proven safe for all QE-enabled chips.

-- 
Timur Tabi
Linux kernel developer at Freescale
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH] powerpc: Fix atomic_xxx_return barrier semantics

2011-11-15 Thread Benjamin Herrenschmidt
The Documentation/memory-barriers.txt document requires that atomic
operations that return a value act as a memory barrier both before
and after the actual atomic operation.

Our current implementation doesn't guarantee this. More specifically,
while a load following the isync can not be issued before stwcx. has
completed, that completion doesn't architecturally mean that the
result of stwcx. is visible to other processors (or any previous stores
for that matter) (typically, the other processors L1 caches can still
hold the old value).

This has caused an actual crash in RCU torture testing on Power 7.

This fixes it by changing those atomic ops to use new macros instead
of RELEASE/ACQUIRE barriers, called ATOMIC_ENTRY and ATOMIC_EXIT barriers,
which are then defined respectively to lwsync and sync.

I haven't had a chance to measure the performance impact (or rather
what I measured with kernel compiles is in the noise; I have yet to
find a more precise benchmark).

Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
---

diff --git a/arch/powerpc/include/asm/atomic.h 
b/arch/powerpc/include/asm/atomic.h
index e2a4c26..02e41b5 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -49,13 +49,13 @@ static __inline__ int atomic_add_return(int a, atomic_t *v)
int t;
 
__asm__ __volatile__(
-   PPC_RELEASE_BARRIER
+   PPC_ATOMIC_ENTRY_BARRIER
 1:lwarx   %0,0,%2 # atomic_add_return\n\
add %0,%1,%0\n
PPC405_ERR77(0,%2)
   stwcx.  %0,0,%2 \n\
bne-1b
-   PPC_ACQUIRE_BARRIER
+   PPC_ATOMIC_EXIT_BARRIER
: =r (t)
: r (a), r (v-counter)
: cc, memory);
@@ -85,13 +85,13 @@ static __inline__ int atomic_sub_return(int a, atomic_t *v)
int t;
 
__asm__ __volatile__(
-   PPC_RELEASE_BARRIER
+   PPC_ATOMIC_ENTRY_BARRIER
 1:lwarx   %0,0,%2 # atomic_sub_return\n\
subf%0,%1,%0\n
PPC405_ERR77(0,%2)
   stwcx.  %0,0,%2 \n\
bne-1b
-   PPC_ACQUIRE_BARRIER
+   PPC_ATOMIC_EXIT_BARRIER
: =r (t)
: r (a), r (v-counter)
: cc, memory);
@@ -119,13 +119,13 @@ static __inline__ int atomic_inc_return(atomic_t *v)
int t;
 
__asm__ __volatile__(
-   PPC_RELEASE_BARRIER
+   PPC_ATOMIC_ENTRY_BARRIER
 1:lwarx   %0,0,%1 # atomic_inc_return\n\
addic   %0,%0,1\n
PPC405_ERR77(0,%1)
   stwcx.  %0,0,%1 \n\
bne-1b
-   PPC_ACQUIRE_BARRIER
+   PPC_ATOMIC_EXIT_BARRIER
: =r (t)
: r (v-counter)
: cc, xer, memory);
@@ -163,13 +163,13 @@ static __inline__ int atomic_dec_return(atomic_t *v)
int t;
 
__asm__ __volatile__(
-   PPC_RELEASE_BARRIER
+   PPC_ATOMIC_ENTRY_BARRIER
 1:lwarx   %0,0,%1 # atomic_dec_return\n\
addic   %0,%0,-1\n
PPC405_ERR77(0,%1)
   stwcx.  %0,0,%1\n\
bne-1b
-   PPC_ACQUIRE_BARRIER
+   PPC_ATOMIC_EXIT_BARRIER
: =r (t)
: r (v-counter)
: cc, xer, memory);
@@ -194,7 +194,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int 
a, int u)
int t;
 
__asm__ __volatile__ (
-   PPC_RELEASE_BARRIER
+   PPC_ATOMIC_ENTRY_BARRIER
 1:lwarx   %0,0,%1 # __atomic_add_unless\n\
cmpw0,%0,%3 \n\
beq-2f \n\
@@ -202,7 +202,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int 
a, int u)
PPC405_ERR77(0,%2)
   stwcx.  %0,0,%1 \n\
bne-1b \n
-   PPC_ACQUIRE_BARRIER
+   PPC_ATOMIC_EXIT_BARRIER
   subf%0,%2,%0 \n\
 2:
: =r (t)
@@ -226,7 +226,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v)
int t;
 
__asm__ __volatile__(
-   PPC_RELEASE_BARRIER
+   PPC_ATOMIC_ENTRY_BARRIER
 1:lwarx   %0,0,%1 # atomic_dec_if_positive\n\
cmpwi   %0,1\n\
addi%0,%0,-1\n\
@@ -234,7 +234,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v)
PPC405_ERR77(0,%1)
   stwcx.  %0,0,%1\n\
bne-1b
-   PPC_ACQUIRE_BARRIER
+   PPC_ATOMIC_EXIT_BARRIER
\n\
 2:: =b (t)
: r (v-counter)
@@ -285,12 +285,12 @@ static __inline__ long atomic64_add_return(long a, 
atomic64_t *v)
long t;
 
__asm__ __volatile__(
-   PPC_RELEASE_BARRIER
+   PPC_ATOMIC_ENTRY_BARRIER
 1:ldarx   %0,0,%2 # atomic64_add_return\n\
add %0,%1,%0\n\
stdcx.  %0,0,%2 \n\
bne-1b
-   PPC_ACQUIRE_BARRIER
+   PPC_ATOMIC_EXIT_BARRIER
: =r (t)
: r (a), r (v-counter)
: cc, memory);
@@ -319,12 +319,12 @@ static __inline__ long atomic64_sub_return(long a, 
atomic64_t *v)
long t;
 
__asm__ __volatile__(
-   PPC_RELEASE_BARRIER
+   PPC_ATOMIC_ENTRY_BARRIER
 1:ldarx   %0,0,%2 # atomic64_sub_return\n\
subf

[PATCH v2] powerpc/powernv: PCI support for p7IOC under OPAL v2

2011-11-15 Thread Benjamin Herrenschmidt
This adds support for p7IOC (and possibly other IODA v1 IO Hubs)
using OPAL v2 interfaces.

We completely take over resource assignment and assign them using an
algorithm that hands out device BARs in a way that makes them fit in
individual segments of the M32 window of the bridge, which enables us
to assign individual PEs to devices and functions.

The current implementation gives out a PE per function on PCIe, and a
PE for the entire bridge for PCIe to PCI-X bridges.

This can be adjusted / fine tuned later.

We also setup DMA resources (32-bit only for now) and MSIs (both 32-bit
and 64-bit MSI are supported).

The DMA allocation tries to divide the available 256M segments of the
32-bit DMA address space fairly among PEs. This is done using a
weight heuristic which assigns less value to things like OHCI USB
controllers than, for example SCSI RAID controllers. This algorithm
will probably want some fine tuning for specific devices or device
types.

Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
---
v2. Small fixes from Gavin Shan in the resource assignment code

 arch/powerpc/include/asm/pci-bridge.h |6 +-
 arch/powerpc/kernel/pci_dn.c  |3 +
 arch/powerpc/platforms/powernv/Makefile   |2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 1320 +
 arch/powerpc/platforms/powernv/pci.c  |   20 +-
 arch/powerpc/platforms/powernv/pci.h  |   84 ++
 6 files changed, 1429 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/pci-ioda.c

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 56b879a..882b6aa 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -153,8 +153,8 @@ struct pci_dn {
 
int pci_ext_config_space;   /* for pci devices */
 
-#ifdef CONFIG_EEH
struct  pci_dev *pcidev;/* back-pointer to the pci device */
+#ifdef CONFIG_EEH
int class_code; /* pci device class */
int eeh_mode;   /* See eeh.h for possible EEH_MODEs */
int eeh_config_addr;
@@ -164,6 +164,10 @@ struct pci_dn {
int eeh_false_positives;/* # times this device reported #ff's */
u32 config_space[16];   /* saved PCI config space */
 #endif
+#define IODA_INVALID_PE(-1)
+#ifdef CONFIG_PPC_POWERNV
+   int pe_number;
+#endif
 };
 
 /* Get the pointer to a device_node's pci_dn */
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 4e69deb..dd9e4a0 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -50,6 +50,9 @@ void * __devinit update_dn_pci_info(struct device_node *dn, 
void *data)
dn-data = pdn;
pdn-node = dn;
pdn-phb = phb;
+#ifdef CONFIG_PPC_POWERNV
+   pdn-pe_number = IODA_INVALID_PE;
+#endif
regs = of_get_property(dn, reg, NULL);
if (regs) {
/* First register entry is addr (00BBSS00)  */
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 3185300..bcc3cb48 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -2,4 +2,4 @@ obj-y   += setup.o opal-takeover.o 
opal-wrappers.o opal.o
 obj-y  += opal-rtc.o opal-nvram.o
 
 obj-$(CONFIG_SMP)  += smp.o
-obj-$(CONFIG_PCI)  += pci.o pci-p5ioc2.o
+obj-$(CONFIG_PCI)  += pci.o pci-p5ioc2.o pci-ioda.o
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
new file mode 100644
index 000..8857d9b
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -0,0 +1,1320 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include linux/kernel.h
+#include linux/pci.h
+#include linux/delay.h
+#include linux/string.h
+#include linux/init.h
+#include linux/bootmem.h
+#include linux/irq.h
+#include linux/io.h
+#include linux/msi.h
+
+#include asm/sections.h
+#include asm/io.h
+#include asm/prom.h
+#include asm/pci-bridge.h
+#include asm/machdep.h
+#include asm/ppc-pci.h
+#include asm/opal.h
+#include asm/iommu.h
+#include asm/tce.h
+#include asm/abs_addr.h
+
+#include powernv.h
+#include pci.h
+
+struct resource_wrap {
+   struct list_headlink;
+   resource_size_t size;
+   resource_size_t align;
+   struct pci_dev  *dev;   /* Set if it's a device */
+   struct pci_bus  *bus;   /* Set if it's a bridge */
+};
+
+static int __pe_printk(const char *level, const struct pnv_ioda_pe *pe,

RE: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level sensitive for PCIe

2011-11-15 Thread Zang Roy-R61911


 -Original Message-
 From: Wood Scott-B07421
 Sent: Wednesday, November 16, 2011 6:14 AM
 To: Kumar Gala
 Cc: Zang Roy-R61911; linuxppc-dev@lists.ozlabs.org
 Subject: Re: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level
 sensitive for PCIe
 
 On 11/15/2011 03:51 PM, Kumar Gala wrote:
 
  On Nov 7, 2011, at 2:32 AM, Roy Zang wrote:
 
  Should we be setting ALL PCIe interrupts to '2'?  I think in general
  we say these PCIe are 'active high'.  The only reason I would think
  we would NOT do this is if they are shared with some external device
  that is 'active low'.  If so we should comment that somewhere (maybe
  in the .dts, maybe just in the commit message).
 
 I'd assume the ones that are pinned out are pulled high on the board.
Yes. The boards pull up the shared IRQs. The PCIe specification does not specify
'active low' or 'active high', but for PCI, INTx is 'active low'.

 Active-low is normal, it's these non-pinned-out external interrupts
 that are pulled low inside the SoC that are weird.
I agree here. Do you want me to add something to point out the weirdness in the 
commit message?
Thanks.
Roy


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 0/2] powerpc: CPU cache op cleanup

2011-11-15 Thread Paul Mackerras
On Tue, Nov 15, 2011 at 04:45:18PM -0600, Moffett, Kyle D wrote:
 On Nov 15, 2011, at 17:29, Benjamin Herrenschmidt wrote:
  On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
  Unfortunately, I've been staring at PPC asm for long enough that I
  have a migraine headache and I'm going to have to stop here for now.
  If somebody else wants to tackle fixing up the 32-bit copy_page() and
  __copy_tofrom_user() routines it would be highly appreciated. 
  
  Yeah that's the one everybody's avoiding :-)
  
  What about my idea of instead compiling it multiple times with a
  different size and fixing up the branch to call the right one ?
 
 I guess that's doable, although I have to admit that idea almost gives
 me more of a headache than trying to fix up the 32-bit ASM.
 
 One thing that bothers me in particular is that both 32/64 versions of
 __copy_tofrom_user() are dramatically overcomplicated for what they
 ought to be doing.
 
 It would seem that if we get a page fault during an unaligned copy, we
 ought to just give up and fall back to a simple byte-by-byte copy loop
 from wherever we left off.  That would eliminate 90% of the ugly
 special cases without actually hurting performance, right?

That's basically what we do, IIRC, and most of the complexity comes
from working out where we were up to.  We could probably use a simpler
approximation that means we might copy some bytes twice.  In fact the
greatest simplification would probably be to implement range entries
in the exception table so we can just have one entry for all the loads
and stores instead of an entry for each individual load and store.

 For a page-fault during a cacheline-aligned copy, we should be able to
 handle the exception and retry from the last cacheline without much
 logic, again with good performance.
 
 With that said, I'm curious about the origin of the PPC32 ASM.  In
 particular, it looks like it was generated by GCC at some point in the
 distant past, and I'm wondering if there's a good way to rewrite that
 file in C and trick GCC into generating the relevant exception tables
 for it?

Why do you think it was generated by gcc?  I wrote the original
version, but I think it got extended and macro-ized by others.

Paul.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev