Re: How to mount rootfs through harddisk when booting?

2009-10-26 Thread swdebug

VFS: Cannot open root device "sda3" or unknown-block(0,0)

# CONFIG_EXT3_FS is not set

Please add ext2 & ext3 filesystem support.
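
For reference, the relevant options look roughly like this (a minimal
sketch, assuming the rootfs is ext3; set them via make menuconfig or edit
.config directly, then rebuild):

	CONFIG_EXT2_FS=y
	CONFIG_EXT3_FS=y

The filesystem driver for the root device has to be built in (=y) rather
than built as a module, since nothing can load modules before the rootfs
is mounted.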


On Sun, 25 Oct 2009 11:43:20 +0800
wilbur.chan wilbur...@gmail.com wrote:

 Sorry, the config I just posted is somewhat confusing, so here is my
 config again. Thanks.
 
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.21.7-EMBSYS-CGEL-3.06.10.P2.F0.B4
 # Sun Oct 25 11:24:25 2009
 #
 # CONFIG_PPC64 is not set
 CONFIG_PPC32=y
 CONFIG_PPC_MERGE=y
 CONFIG_MMU=y
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_IRQ_PER_CPU=y
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_ARCH_HAS_ILOG2_U32=y
 CONFIG_GENERIC_HWEIGHT=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
 CONFIG_PPC=y
 CONFIG_EARLY_PRINTK=y
 CONFIG_GENERIC_NVRAM=y
 CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
 CONFIG_ARCH_MAY_HAVE_PC_FDC=y
 CONFIG_PPC_OF=y
 CONFIG_PPC_UDBG_16550=y
 CONFIG_GENERIC_TBSYNC=y
 CONFIG_AUDIT_ARCH=y
 CONFIG_GENERIC_BUG=y
 CONFIG_DEFAULT_UIMAGE=y
 
 #
 # Processor support
 #
 # CONFIG_CLASSIC32 is not set
 # CONFIG_PPC_512x is not set
 # CONFIG_PPC_82xx is not set
 # CONFIG_PPC_83xx is not set
 CONFIG_PPC_85xx=y
 # CONFIG_PPC_86xx is not set
 # CONFIG_PPC_8xx is not set
 # CONFIG_40x is not set
 # CONFIG_44x is not set
 # CONFIG_E200 is not set
 CONFIG_85xx=y
 CONFIG_E500=y
 # CONFIG_PPC_DCR_NATIVE is not set
 # CONFIG_PPC_DCR_MMIO is not set
 CONFIG_BOOKE=y
 CONFIG_FSL_BOOKE=y
 # CONFIG_PHYS_64BIT is not set
 CONFIG_SPE=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=2
 CONFIG_DEFCONFIG_LIST=/lib/modules/$UNAME_RELEASE/.config
 
 #
 # Code maturity level options
 #
 CONFIG_EXPERIMENTAL=y
 CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 
 #
 # General setup
 #
 CONFIG_LOCALVERSION=
 CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 # CONFIG_IPC_NS is not set
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_UTS_NS is not set
 CONFIG_AUDIT=y
 # CONFIG_AUDITSYSCALL is not set
 # CONFIG_WRS_FCHECK is not set
 # CONFIG_EVLOG is not set
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 # CONFIG_CONTAINER_DEBUG is not set
 # CONFIG_CPUSETS is not set
 # CONFIG_RSS_CONTAINER is not set
 CONFIG_FAIR_GROUP_SCHED=y
 CONFIG_FAIR_USER_SCHED=y
 # CONFIG_FAIR_CGROUP_SCHED is not set
 CONFIG_SYSFS_DEPRECATED=y
 # CONFIG_CONTAINER_CPUACCT is not set
 # CONFIG_RELAY is not set
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=
 CONFIG_HWTIMER_HOOKS=y
 # CONFIG_HWTIMER_TEST is not set
 # CONFIG_HIGH_PRIO_OOMKILL is not set
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_EMBEDDED=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
 CONFIG_EPOLL=y
 CONFIG_SHMEM=y
 CONFIG_SLAB=y
 CONFIG_VM_EVENT_COUNTERS=y
 # CONFIG_ALWAYS_RESTART is not set
 CONFIG_SIGEXIT=y
 CONFIG_MEMMON=y
 # CONFIG_MEMMON_SWAP_SUPPORT is not set
 CONFIG_NOTIFY_SETTIME=y
 CONFIG_RT_MUTEXES=y
 # CONFIG_TINY_SHMEM is not set
 CONFIG_BASE_SMALL=0
 # CONFIG_SLOB is not set
 
 #
 # Loadable module support
 #
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODVERSIONS is not set
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_KMOD=y
 CONFIG_STOP_MACHINE=y
 
 #
 # Block layer
 #
 CONFIG_BLOCK=y
 CONFIG_LBD=y
 # CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_LSF is not set
 
 #
 # IO Schedulers
 #
 CONFIG_IOSCHED_NOOP=y
 CONFIG_IOSCHED_AS=y
 CONFIG_IOSCHED_DEADLINE=y
 CONFIG_IOSCHED_CFQ=y
 # CONFIG_DEFAULT_AS is not set
 # CONFIG_DEFAULT_DEADLINE is not set
 CONFIG_DEFAULT_CFQ=y
 # CONFIG_DEFAULT_NOOP is not set
 CONFIG_DEFAULT_IOSCHED=cfq
 # CONFIG_WANT_EARLY_SERIAL is not set
 
 #
 # Platform support
 #
 # CONFIG_MPC8540_ADS is not set
 # CONFIG_MPC8560_ADS is not set
 # CONFIG_MPC85xx_CDS is not set
 # CONFIG_MPC85xx_MDS is not set
 # CONFIG_MPC8572_PC is not set
 CONFIG_MPC85xx_DS=y
 CONFIG_P2020=y
 CONFIG_MPC85xx=y
 CONFIG_PPC_INDIRECT_PCI_BE=y
 CONFIG_MPIC=y
 
 #
 # Kernel options
 #
 CONFIG_HIGHMEM=y
 # CONFIG_HZ_100 is not set
 CONFIG_HZ_250=y
 # CONFIG_HZ_300 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=250
 # CONFIG_GENERIC_CLOCKEVENTS is not set
 # CONFIG_TICK_ONESHOT is not set
 # CONFIG_PREEMPT_NONE is not set
 # CONFIG_PREEMPT_VOLUNTARY is not set
 CONFIG_PREEMPT=y
 # CONFIG_REAL_PREEMPT is not set
 # CONFIG_PREEMPT_SOFTIRQS is not set
 # CONFIG_PREEMPT_HARDIRQS is not set
 CONFIG_PREEMPT_BKL=y
 CONFIG_BINFMT_ELF=y
 # CONFIG_BINFMT_MISC is not set
 CONFIG_MATH_EMULATION=y
 CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
 # CONFIG_KEXEC is not set
 # CONFIG_IRQ_ALL_CPUS is not set
 CONFIG_ARCH_FLATMEM_ENABLE=y
 CONFIG_ARCH_POPULATES_NODE_MAP=y
 CONFIG_SELECT_MEMORY_MODEL=y
 CONFIG_FLATMEM_MANUAL=y
 # CONFIG_DISCONTIGMEM_MANUAL is not set
 # CONFIG_SPARSEMEM_MANUAL is not set
 CONFIG_FLATMEM=y
 CONFIG_FLAT_NODE_MEM_MAP=y
 # 

Re: [v9 PATCH 4/9]: x86: refactor x86 idle power management code and remove all instances of pm_idle.

2009-10-26 Thread Arun R Bharadwaj
* Pavel Machek pa...@ucw.cz [2009-10-23 18:07:11]:

 On Fri 2009-10-16 15:13:08, Arun R Bharadwaj wrote:
  * Arun R Bharadwaj a...@linux.vnet.ibm.com [2009-10-16 15:08:50]:
  
  This patch cleans up x86 of all instances of pm_idle.
  
  pm_idle, which was earlier called from the cpu_idle() idle loop,
  is replaced by cpuidle_idle_call().
  
  x86 also registers to cpuidle when the idle routine is selected,
  by populating the cpuidle_device data structure for each cpu.
  
  This is replicated for the apm module and for xen, which also used pm_idle.
  
  
  Signed-off-by: Arun R Bharadwaj a...@linux.vnet.ibm.com
  ---
   arch/x86/kernel/apm_32.c  |   55 -
   arch/x86/kernel/process.c |   90 
  --
   arch/x86/kernel/process_32.c  |3 -
   arch/x86/kernel/process_64.c  |3 -
   arch/x86/xen/setup.c  |   40 ++
   drivers/acpi/processor_core.c |9 ++--
   drivers/cpuidle/cpuidle.c |   16 +--
   7 files changed, 182 insertions(+), 34 deletions(-)
 ...
  +static int local_idle_loop(struct cpuidle_device *dev, struct 
  cpuidle_state *st)
  +{
  +   ktime_t t1, t2;
  +   s64 diff;
  +   int ret;
  +
  +   t1 = ktime_get();
  +   local_idle();
  +   t2 = ktime_get();
  +
  +   diff = ktime_to_us(ktime_sub(t2, t1));
  +	if (diff > INT_MAX)
  +   diff = INT_MAX;
  +   ret = (int) diff;
  +
  +   return ret;
  +}
 
 So we get this routine essentially 3 times. Is there no way to share
 the code?
 

We can move this code to a common place, but that would mean exporting
the idle function pointer to be called from within this routine, which
is exactly what we wanted to avoid.

Any suggestions are welcome.

arun

 -- 
 (english) http://www.livejournal.com/~pavelmachek
 (cesky, pictures) 
 http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [v9 PATCH 4/9]: x86: refactor x86 idle power management code and remove all instances of pm_idle.

2009-10-26 Thread Pavel Machek

   +static int local_idle_loop(struct cpuidle_device *dev, struct 
   cpuidle_state *st)
   +{
   + ktime_t t1, t2;
   + s64 diff;
   + int ret;
   +
   + t1 = ktime_get();
   + local_idle();
   + t2 = ktime_get();
   +
   + diff = ktime_to_us(ktime_sub(t2, t1));
    +	if (diff > INT_MAX)
   + diff = INT_MAX;
   + ret = (int) diff;
   +
   + return ret;
   +}
  
  So we get this routine essentially 3 times. Is there no way to share
  the code?
  
 
 We can move this code to a common place, but that would mean exporting
 the idle function pointer to be called from within this routine, which
 is exactly what we wanted to avoid.
 
 Any suggestions are welcome.

You can just pass idle routine as a parameter...?

int common_idle_loop(struct cpuidle_device *dev, struct cpuidle_state
*st, void *idle(void))

...?
Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [v9 PATCH 4/9]: x86: refactor x86 idle power management code and remove all instances of pm_idle.

2009-10-26 Thread Arun R Bharadwaj
* Pavel Machek pa...@ucw.cz [2009-10-26 08:58:31]:

 
+static int local_idle_loop(struct cpuidle_device *dev, struct 
cpuidle_state *st)
+{
+   ktime_t t1, t2;
+   s64 diff;
+   int ret;
+
+   t1 = ktime_get();
+   local_idle();
+   t2 = ktime_get();
+
+   diff = ktime_to_us(ktime_sub(t2, t1));
 +	if (diff > INT_MAX)
+   diff = INT_MAX;
+   ret = (int) diff;
+
+   return ret;
+}
   
   So we get this routine essentially 3 times. Is there no way to share
   the code?
   
  
  We can move this code to a common place, but that would mean exporting
  the idle function pointer to be called from within this routine, which
  is exactly what we wanted to avoid.
  
  Any suggestions are welcome.
 
 You can just pass idle routine as a parameter...?
 
 int common_idle_loop(struct cpuidle_device *dev, struct cpuidle_state
 *st, void *idle(void))
 
 ...?
   Pavel

Yes, this should be fine. I was trying to avoid passing the void
function pointer around, but I guess this reduces the code size
considerably.
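
For the archive, here is roughly what that shared routine could look like
once the idle routine is passed in as a proper function pointer (a sketch
based on the snippet quoted above; the name common_idle_loop is Pavel's,
the exact signature is an assumption):

	int common_idle_loop(struct cpuidle_device *dev,
			     struct cpuidle_state *st,
			     void (*idle)(void))
	{
		ktime_t t1, t2;
		s64 diff;

		t1 = ktime_get();
		idle();			/* arch-specific idle routine passed in */
		t2 = ktime_get();

		diff = ktime_to_us(ktime_sub(t2, t1));
		if (diff > INT_MAX)
			diff = INT_MAX;

		return (int)diff;
	}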

thanks!
arun
 -- 
 (english) http://www.livejournal.com/~pavelmachek
 (cesky, pictures) 
 http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Jumbo Frame bug in ibm_newemac driver (was Jumbo Frames, sil24 SATA driver, and kswapd0 page allocation failures)

2009-10-26 Thread Jonathan Haws
Okay, I need to revisit this issue.  I have had my time taken away for other 
things the past couple of months, but I am now back at this network issue.

Here is what I have done:

1. I modified the ibm_newemac driver to follow scatter-gather chains on the RX 
path.  The idea was to set up the driver to only ever deal with single pages.  
The MAL in the PPC only supports data transfers of up to 4080 bytes (less than 
a single page), so it appears that the hardware should support single page 
chains.  I set this up just like the e1000 driver.  For whatever reason, this 
did not work.  It is probably because I do not fully understand the Linux 
network stack yet (as is apparent in the next iteration).

2. I reverted to the original driver and found that, contrary to what I had 
thought earlier, the driver does allocate a ring of skbs for use in the driver. 
 However, when a jumbo packet is received (larger than 4080 bytes) it uses the 
skb that was pre-allocated for the jumbo packet and allocates a new skb to 
replace the one in the ring.  This is where the problem is - in that new 
allocation to replace the one in the stack.  So, to remedy this, I 
pre-allocated the same number of jumbo skbs for the sole purpose of being used 
as new skbs for the rx ring.  Here is some code that shows the idea:

static int emac_open(struct net_device *ndev)
{
...

	/* Allocate RX ring */
	for (i = 0; i < NUM_RX_BUFF; ++i)
	{
		if (emac_alloc_rx_skb(dev, i, GFP_KERNEL)) {
			printk(KERN_ERR "%s: failed to allocate RX ring\n",
			       ndev->name);
			goto oom;
		}

	}

...
}

static inline int emac_alloc_rx_skb2(struct emac_instance *dev, int slot,
				     gfp_t flags)
{
	struct sk_buff *skb = dev->rx_skb_pool[slot];
	if (unlikely(!skb))
		return -ENOMEM;

	if (skb_recycle_check(skb, emac_rx_skb_size(dev->rx_skb_size)))
	{
		dev->rx_skb[slot] = skb;
		dev->rx_desc[slot].data_len = 0;

		skb_reserve(skb, EMAC_RX_SKB_HEADROOM + 2);
		dev->rx_desc[slot].data_ptr =
		    dma_map_single(&dev->ofdev->dev, skb->data - 2,
				   dev->rx_sync_size, DMA_FROM_DEVICE) + 2;
		wmb();
		dev->rx_desc[slot].ctrl = MAL_RX_CTRL_EMPTY |
		    (slot == (NUM_RX_BUFF - 1) ? MAL_RX_CTRL_WRAP : 0);

		return 0;
	}
	else
	{
		printk(KERN_NOTICE "EMAC: SKB not recycleable\n");
		return -ENOMEM;
	}
}

static int emac_poll_rx(void *param, int budget)
{
...
  sg:
	if (ctrl & MAL_RX_CTRL_FIRST) {
		BUG_ON(dev->rx_sg_skb);
		if (unlikely(emac_alloc_rx_skb2(dev, slot,
						GFP_ATOMIC))) {
			DBG(dev, "rx OOM %d" NL, slot);
			++dev->estats.rx_dropped_oom;
			emac_recycle_rx_skb(dev, slot, 0);
		} else {
			dev->rx_sg_skb = skb;
			emac_recycle_rx_skb(dev, slot, len);
			skb_put(skb, len);
		}
	} else if (!emac_rx_sg_append(dev, slot) &&
		   (ctrl & MAL_RX_CTRL_LAST)) {

		skb = dev->rx_sg_skb;
		dev->rx_sg_skb = NULL;

		ctrl &= EMAC_BAD_RX_MASK;
		if (unlikely(ctrl && ctrl != EMAC_RX_TAH_BAD_CSUM)) {
			emac_parse_rx_error(dev, ctrl);
			++dev->estats.rx_dropped_error;
			dev_kfree_skb(skb);
			len = 0;
		} else {
			/* printk(KERN_NOTICE "EMAC: pushing sg packet\n"); */
			goto push_packet;
		}
	}
	goto skip;
...
}

The changes are the allocation of the rx_skb_pool in emac_open(), the function 
call emac_alloc_rx_skb2() in emac_poll_rx(), and the modifications to 
emac_alloc_rx_skb() to create emac_alloc_rx_skb2().  Also, corresponding 
allocations for rx_skb_pool are found in emac_resize_rx_ring() for when we need 
to resize the pool.

Now the problem that I am having is this: the first time through the ring, 
things work just fine.  But the second time through, the buffers are not 
cleaned out - they still think they contain data.  I have tried calling 
skb_recycle_check() to restore the skb to a new state, but that call fails 
because apparently the skb cannot be reused for receive.  Why is that the 
case?  What am I missing?  It seems like I am missing something that would 
allow the skb to be reused.
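
(For context, a rough sketch of the conditions skb_recycle_check() enforces
in kernels of this vintage - paraphrased from memory, so verify against your
tree - which suggests why the check fails once the stack still holds a
reference to the skb:)

	/* illustrative only, not the actual kernel implementation */
	static int can_recycle(struct sk_buff *skb, int size)
	{
		/* fails on any skb somebody else may still reference */
		if (skb_is_nonlinear(skb) || skb_cloned(skb) || skb_shared(skb))
			return 0;
		/* data area must be big enough to be reused as-is */
		if (skb_end_pointer(skb) - skb->head <
		    SKB_DATA_ALIGN(size + NET_SKB_PAD))
			return 0;
		return 1;
	}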

I will admit, I am not a Linux network driver expert, though I am learning.  If 

INIT: PANIC: segmentation violation! sleeping for 30 seconds.

2009-10-26 Thread Breno Leitao
Hi, 

I just put an upstream kernel (rc5) on a specific machine I have (Power5), and
I got the following error:

INIT: PANIC: segmentation violation! sleeping for 30 seconds.
init has generated signal 11 but has no handler for it
init used greatest stack depth: 6240 bytes left
Kernel panic - not syncing: Attempted to kill init!
Call Trace:
[c001c6e7f920] [c0012588] .show_stack+0x6c/0x194 (unreliable)
[c001c6e7f9d0] [c0088bd4] .panic+0x74/0x1c0
[c001c6e7fa60] [c008cbdc] .do_exit+0x43c/0x82c
[c001c6e7fb20] [c00286f4] ._exception+0x1d4/0x204
[c001c6e7fcf0] [c04e7dc8] .do_page_fault+0x4fc/0x634
[c001c6e7fe30] [c000560c] handle_page_fault+0x20/0x74


Downgrading to rc2 shows the same result. Interestingly enough, this is the only
machine that fails with the upstream kernel.

Has anyone seen anything similar?

Thanks
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Network Stack SKB Reallocation

2009-10-26 Thread Jonathan Haws
Quick question about the network stack in general:

Does the stack itself release an SKB allocated by the device driver back to the 
heap upstream, or does it require that the device driver handle that?

Thanks!

Jonathan


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: Network Stack SKB Reallocation

2009-10-26 Thread Jonathan Haws
So, in my case, I allocate a bunch of skbs that I want to be able to reuse 
during network operation (256, in fact).  When I pass one up the stack, the 
stack will free that skb back to the system, making any further use of it 
invalid until I call alloc_skb() again?

Thanks.

 On Monday 26 October 2009 19:43:00 Jonathan Haws wrote:
  Quick question about the network stack in general:
 
  Does the stack itself release an SKB allocated by the device
 driver back to the heap upstream, or does it require that the device
 driver handle that?
 
 There's the concept of passing responsibilities for the frames
 between
 the networking layers. So the driver passes the frame and all
 responsibilities
 to the networking stack. So if the networking stack accepts the
 packet in the first place,
 it needs to free it (or pass it to somebody else to take care of).
 
 --
 Greetings, Michael.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: Network Stack SKB Reallocation

2009-10-26 Thread Michael Buesch
On Monday 26 October 2009 19:43:00 Jonathan Haws wrote:
 Quick question about the network stack in general:
 
 Does the stack itself release an SKB allocated by the device driver back to 
 the heap upstream, or does it require that the device driver handle that?

There's the concept of passing responsibilities for the frames between
the networking layers. So the driver passes the frame and all responsibilities
to the networking stack. So if the networking stack accepts the packet in the 
first place,
it needs to free it (or pass it to somebody else to take care of).
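
To make the ownership rule concrete, a minimal sketch of the usual RX
pattern (the ring structure and names here are hypothetical, purely for
illustration):

	#include <linux/etherdevice.h>
	#include <linux/netdevice.h>
	#include <linux/skbuff.h>

	struct my_ring {			/* hypothetical driver state */
		struct net_device *ndev;
		struct sk_buff *skb[256];
		int buf_size;
	};

	static void example_rx(struct my_ring *ring, int slot, int len)
	{
		struct sk_buff *skb = ring->skb[slot];

		skb_put(skb, len);
		skb->protocol = eth_type_trans(skb, ring->ndev);
		netif_receive_skb(skb);	/* ownership passes to the stack here */

		/* the driver must not touch skb again; refill with a fresh one */
		ring->skb[slot] = netdev_alloc_skb(ring->ndev, ring->buf_size);
	}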

-- 
Greetings, Michael.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 0/8] Fix 8xx MMU/TLB

2009-10-26 Thread Benjamin Herrenschmidt

 Probably better to walk the kernel page table too. Does this
 make a difference(needs the tophys() patch I posted earlier):

This whole thing would be a -lot- easier to do from C code. Why? Simply
because you could just use get_user() to load the instruction rather
than doing this page table walking in asm, which is simpler, faster, and
more foolproof (ok, you do pay the price of a kernel entry/exit
instead, but I still believe that code simplicity and maintainability
win here).
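
Something along these lines (a sketch only; the error handling and the
decode step are hand-waved):

	unsigned int insn;

	/* SRR0/regs->nip holds the address of the faulting dcbx insn;
	 * get_user() does the translation and faults gracefully. */
	if (get_user(insn, (unsigned int __user *)regs->nip))
		goto bad;	/* could not fetch the instruction */
	/* ... decode the RA/RB fields of insn and compute the EA ... */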

Ben.

 From 862dda30c3d3d3bedcc605e8520626408a26891c Mon Sep 17 00:00:00 2001
 From: Joakim Tjernlund joakim.tjernl...@transmode.se
 Date: Sat, 17 Oct 2009 13:54:03 +0200
 Subject: [PATCH] 8xx: Walk the page table for kernel addresses too.
 
 ---
  arch/powerpc/kernel/head_8xx.S |   25 -
  1 files changed, 12 insertions(+), 13 deletions(-)
 
 diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
 index 0e91da4..edc9e9b 100644
 --- a/arch/powerpc/kernel/head_8xx.S
 +++ b/arch/powerpc/kernel/head_8xx.S
 @@ -532,28 +532,27 @@ DARFixed:/* Return from dcbx instruction bug 
 workaround, r10 holds value of DAR
   * by decoding the registers used by the dcbx instruction and adding them.
   * DAR is set to the calculated address and r10 also holds the EA on exit.
   */
 -#define NO_SELF_MODIFYING_CODE /* define if you don't want to use self 
 modifying code */
 - nop /* A few nops to make the modified_instr: space below cache 
 line aligned */
 - nop
 -139: /* fetch instruction from userspace memory */
 + /* define if you don't want to use self modifying code */
 +#define NO_SELF_MODIFYING_CODE
 +FixupDAR:/* Entry point for dcbx workaround. */
 + /* fetch instruction from memory. */
 + mfspr r10, SPRN_SRR0
   DO_8xx_CPU6(0x3780, r3)
   mtspr SPRN_MD_EPN, r10
   mfspr r11, SPRN_M_TWB   /* Get level 1 table entry address */
 - lwz   r11, 0(r11)   /* Get the level 1 entry */
 + cmplwi  cr0, r11, 0x0800
 +	blt-  3f		/* Branch if user space */
 +	lis   r11, swapper_pg_dir@h
 +	ori   r11, r11, swapper_pg_dir@l
 + rlwimi  r11, r11, 0, 2, 19
 +3:   lwz   r11, 0(r11)   /* Get the level 1 entry */
   DO_8xx_CPU6(0x3b80, r3)
   mtspr SPRN_MD_TWC, r11  /* Load pte table base address */
   mfspr r11, SPRN_MD_TWC  /* and get the pte address */
   lwz   r11, 0(r11)   /* Get the pte */
   /* concat physical page address(r11) and page offset(r10) */
   rlwimi  r11, r10, 0, 20, 31
 - b 140f
 -FixupDAR:/* Entry point for dcbx workaround. */
 - /* fetch instruction from memory. */
 - mfspr r10, SPRN_SRR0
 - andis.  r11, r10, 0x8000
 - tophys  (r11, r10)
 - beq-  139b  /* Branch if user space address */
 -140: lwz   r11,0(r11)
 + lwz   r11,0(r11)
  /* Check if it really is a dcbx instruction. */
  /* dcbt and dcbtst does not generate DTLB Misses/Errors,
   * no need to include them here */
 --
 1.6.4.4


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 0/8] Fix 8xx MMU/TLB

2009-10-26 Thread Dan Malek


On Oct 26, 2009, at 3:47 PM, Benjamin Herrenschmidt wrote:

This whole thing would be a -lot- easier to do from C code. Why ?  
Simply

because you could just use get_user() to load the instruction rather
than doing this page table walking in asm,


Just be careful the get_user() doesn't regenerate the same
translation error you are trying to fix by being here..
It is nice doing things in C code, but you have to be aware
of the environment and the side effects when in this kind
of exception state.

Thanks.

-- Dan

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] [RFC] PowerPC64: Use preempt_schedule_irq instead of preempt_schedule when returning from exceptions

2009-10-26 Thread Benjamin Herrenschmidt
On Mon, 2009-10-19 at 22:28 +0400, Valentine Barshak wrote:
 Use preempt_schedule_irq to prevent infinite irq-entry and
 eventual stack overflow problems with fast-paced IRQ sources.
 This kind of problems has been observed on the PASemi Electra IDE
 controller. We have to make sure we are soft-disabled before calling
 preempt_schedule_irq and hard disable interrupts after that
 to avoid unrecoverable exceptions.
 
 This patch also moves the clrrdi r9,r1,THREAD_SHIFT out of
 the #ifdef CONFIG_PPC_BOOK3E scope, since r9 is clobbered
 and has to be restored in both cases.

So I _think_ that the irqs on/off accounting for lockdep isn't quite
right. What do you think of this slightly modified version ? I've only
done a quick boot test on a G5 with lockdep enabled and a played a bit,
nothing shows up so far but it's definitely not conclusive.

The main difference is that I call trace_hardirqs_off to advertise
the fact that we are soft-disabling (it could be a dup, which is no big
deal at this stage, but it isn't always one: on syscall return, for
example, the kernel thinks we have interrupts enabled and could thus
get out of sync without it).

I also mark the PACA hard disable to reflect the MSR:EE state before
calling into preempt_schedule_irq().
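
In pseudo-C, the sequence the patch implements is roughly this (simplified;
the hard_enable/hard_disable helpers named here are illustrative, not the
real asm or kernel APIs):

	do {
		local_paca->soft_enabled = 0;	/* soft-disable (PACASOFTIRQEN) */
		trace_hardirqs_off();		/* keep lockdep in sync */
		local_paca->hard_enabled = 1;	/* PACAHARDIRQEN mirrors MSR:EE */
		hard_enable_interrupts();	/* set MSR:EE - illustrative helper */
		preempt_schedule_irq();		/* may switch tasks */
		hard_disable_interrupts();	/* clear MSR:EE - illustrative helper */
		local_paca->hard_enabled = 0;
	} while (test_thread_flag(TIF_NEED_RESCHED));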

---

[PATCH v2] powerpc: Use preempt_schedule_irq instead of preempt_schedule when 
returning from exceptions

Use preempt_schedule_irq to prevent infinite irq-entry and
eventual stack overflow problems with fast-paced IRQ sources.
This kind of problems has been observed on the PASemi Electra IDE
controller. We have to make sure we are soft-disabled before calling
preempt_schedule_irq and hard disable interrupts after that
to avoid unrecoverable exceptions.

This patch also moves the clrrdi r9,r1,THREAD_SHIFT out of
the #ifdef CONFIG_PPC_BOOK3E scope, since r9 is clobbered
and has to be restored in both cases.

Signed-off-by: Valentine Barshak vbars...@ru.mvista.com
Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
---
 arch/powerpc/kernel/entry_64.S |   38 +-
 1 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index f9fd54b..b64ae3d 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -659,34 +659,38 @@ do_work:
crandc  eq,cr1*4+eq,eq
bne restore
/* here we are preempting the current task */
-1:
+	/* ensure we are soft-disabled */
+   li  r0,0
+   stb r0,PACASOFTIRQEN(r13)
+   /* Trace the IRQ state change */
 #ifdef CONFIG_TRACE_IRQFLAGS
-   bl  .trace_hardirqs_on
-   /* Note: we just clobbered r10 which used to contain the previous
-* MSR before the hard-disabling done by the caller of do_work.
-* We don't have that value anymore, but it doesn't matter as
-* we will hard-enable unconditionally, we can just reload the
-* current MSR into r10
-*/
+   bl  .trace_hardirqs_off
+#endif
+1: /* And make sure we are hard-enabled */
+#ifdef CONFIG_PPC_BOOK3E
+   wrteei  1
+#else
mfmsr   r10
-#endif /* CONFIG_TRACE_IRQFLAGS */
+   ori r10,r10,MSR_EE
+   mtmsrd  r10,1
+#endif
li  r0,1
-   stb r0,PACASOFTIRQEN(r13)
stb r0,PACAHARDIRQEN(r13)
+   /* Call the scheduler with soft IRQs off */
+   bl  .preempt_schedule_irq
+   /* hard-disable interrupts again */
 #ifdef CONFIG_PPC_BOOK3E
-   wrteei  1
-   bl  .preempt_schedule
wrteei  0
 #else
-   ori r10,r10,MSR_EE
-   mtmsrd  r10,1   /* reenable interrupts */
-   bl  .preempt_schedule
mfmsr   r10
-   clrrdi  r9,r1,THREAD_SHIFT
-   rldicl  r10,r10,48,1/* disable interrupts again */
+   rldicl  r10,r10,48,1
rotldi  r10,r10,16
mtmsrd  r10,1
 #endif /* CONFIG_PPC_BOOK3E */
+   li  r0,0
+   stb r0,PACAHARDIRQEN(r13)
+   clrrdi  r9,r1,THREAD_SHIFT
ld  r4,TI_FLAGS(r9)
andi.   r0,r4,_TIF_NEED_RESCHED
bne 1b
-- 
1.6.1.2.14.gf26b5




___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 0/8] Fix 8xx MMU/TLB

2009-10-26 Thread Benjamin Herrenschmidt
On Mon, 2009-10-26 at 16:26 -0700, Dan Malek wrote:
 Just be careful the get_user() doesn't regenerate the same
 translation error you are trying to fix by being here..

It shouldn't, since it will always come up with a proper DAR, but
you may want to double-check beforehand that the instruction
address you are loading from is -not- your marker value for bad DAR.

 It is nice doing things in C code, but you have to be aware
 of the environment and the side effects when in this kind 

Yup.

Cheers,
Ben.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [2/6] Cleanup management of kmem_caches for pagetables

2009-10-26 Thread Benjamin Herrenschmidt
On Fri, 2009-10-16 at 16:22 +1100, David Gibson wrote:

Minor nits... if you can respin today I should push it out to -next

 +void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 +{
 + char *name;
 +	unsigned long table_size = sizeof(void *) << shift;
 + unsigned long align = table_size;

This is a bit thick.. could use some air. Just separate the definitions
from the assignments so you can make the code breath a bit :-)

Also the above warrants a comment explaining that this won't work for
PTE pages since sizeof(PTE) >= sizeof(void *) and the day we finally
move out of pte page == struct page, the code here will have to be
adapted.

 + /* When batching pgtable pointers for RCU freeing, we store
 +  * the index size in the low bits.  Table alignment must be
 +  * big enough to fit it */
 + unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
 + struct kmem_cache *new;
 +
 + /* It would be nice if this was a BUILD_BUG_ON(), but at the
 +  * moment, gcc doesn't seem to recognize is_power_of_2 as a
 +  * constant expression, so so much for that. */
 + BUG_ON(!is_power_of_2(minalign));
 +	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
 +
 + if (PGT_CACHE(shift))
 + return; /* Already have a cache of this size */

Blank line here too

 + align = max_t(unsigned long, align, minalign);
 +	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
 + new = kmem_cache_create(name, table_size, align, 0, ctor);
 + PGT_CACHE(shift) = new;

And here

 + pr_debug(Allocated pgtable cache for order %d\n, shift);
 +}
 +
  
  void pgtable_cache_init(void)
  {
 - pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], 
 PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
 - pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], 
 PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
 + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
 + pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
 + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
 +	panic("Couldn't allocate pgtable caches");
 +	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
  }

panic vs. BUG_ON() ... could be a bit more consistent.
 
  #ifdef CONFIG_SPARSEMEM_VMEMMAP
 Index: working-2.6/arch/powerpc/include/asm/pgalloc-64.h
 ===
 --- working-2.6.orig/arch/powerpc/include/asm/pgalloc-64.h2009-10-16 
 12:53:45.0 +1100
 +++ working-2.6/arch/powerpc/include/asm/pgalloc-64.h 2009-10-16 
 12:53:51.0 +1100
 @@ -11,27 +11,30 @@
  #include linux/cpumask.h
  #include linux/percpu.h
  
 +/*
 + * This needs to be big enough to allow any pagetable sizes we need,
 + * but small enough to fit in the low bits of any page table pointer.
 + * In other words all pagetables, even tiny ones, must be aligned to
 + * allow at least enough low 0 bits to contain this value.
 + */
 +#define MAX_PGTABLE_INDEX_SIZE   0xf

This also has the constraint of being a (power of 2) - 1... worth
mentioning somewhere ?

Also if you could comment somewhere that index size == 0 means a PTE
page ? Not totally obvious at first.

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [3/6] Allow more flexible layouts for hugepage pagetables

2009-10-26 Thread Benjamin Herrenschmidt
On Fri, 2009-10-16 at 16:22 +1100, David Gibson wrote:

So far haven't seen anything blatantly wrong, in fact, this patch
results in some nice cleanups.

One thing tho...

 -#ifdef CONFIG_HUGETLB_PAGE
 -   /* Handle hugepage regions */
  -	if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
 -   DBG_LOW( - huge page !\n);
 -   return hash_huge_page(mm, access, ea, vsid, local, trap);
 -   }
 -#endif /* CONFIG_HUGETLB_PAGE */
 -
  #ifndef CONFIG_PPC_64K_PAGES
 /* If we use 4K pages and our psize is not 4K, then we are hitting
  * a special driver mapping, we need to align the address before
 @@ -961,12 +954,18 @@ int hash_page(unsigned long ea, unsigned
  #endif /* CONFIG_PPC_64K_PAGES */

You basically made the above code run with huge pages. This may not
be what you want ... It will result in cropping the low EA bits probably
at a stage where you don't want that (it might also be a non-issue, I
just want you to double check :-)

I suppose one option would be to remove that alignment and duplicate
the PTEs when creating those special mappings (afaik the only user
is spufs using 64K pages to map the local store)

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 10/16] percpu: make percpu symbols in powerpc unique

2009-10-26 Thread Benjamin Herrenschmidt
On Wed, 2009-10-14 at 15:01 +0900, Tejun Heo wrote:
 This patch updates percpu related symbols in powerpc such that percpu
 symbols are unique and don't clash with local symbols.  This serves
 two purposes of decreasing the possibility of global percpu symbol
 collision and allowing dropping per_cpu__ prefix from percpu symbols.
 
 * arch/powerpc/kernel/perf_callchain.c: s/callchain/cpu_perf_callchain/
 
 * arch/powerpc/kernel/setup-common.c: s/pvr/cpu_pvr/
 
 * arch/powerpc/platforms/pseries/dtl.c: s/dtl/cpu_dtl/
 
 * arch/powerpc/platforms/cell/interrupt.c: s/iic/cpu_iic/
 
 Partly based on Rusty Russell's alloc_percpu: rename percpu vars
 which cause name clashes patch.
 
 Signed-off-by: Tejun Heo t...@kernel.org
 Acked-by: Arnd Bergmann a...@arndb.de
 Cc: Rusty Russell ru...@rustcorp.com.au

Acked-by: Benjamin Herrenschmidt b...@kernel.crashing.org

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v4 4/4] pseries: Serialize cpu hotplug operations during deactivate Vs deallocate

2009-10-26 Thread Benjamin Herrenschmidt
On Fri, 2009-10-09 at 14:01 +0530, Gautham R Shenoy wrote:
 Currently the cpu-allocation/deallocation process comprises of two steps:
 - Set the indicators and to update the device tree with DLPAR node
   information.
 
 - Online/offline the allocated/deallocated CPU.
 
 This is achieved by writing to the sysfs tunables probe during allocation
 and release during deallocation.
 
 At the sametime, the userspace can independently online/offline the CPUs of
 the system using the sysfs tunable online.
 
 It is quite possible that when a userspace tool offlines a CPU
 for the purpose of deallocation and is in the process of updating the device
 tree, some other userspace tool could bring the CPU back online by writing to
 the online sysfs tunable thereby causing the deallocate process to fail.
 
 The solution to this is to serialize writes to the probe/release sysfs
 tunable with the writes to the online sysfs tunable.
 
 This patch employs a mutex to provide this serialization, which is a no-op on
 all architectures except PPC_PSERIES
 
 Signed-off-by: Gautham R Shenoy e...@in.ibm.com

Peter, did you get a chance to review this one ?

Cheers,
Ben.

 ---
  arch/powerpc/platforms/pseries/dlpar.c |   26 ++
  drivers/base/cpu.c |2 ++
  include/linux/cpu.h|   13 +
  3 files changed, 37 insertions(+), 4 deletions(-)
 
 diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
 b/arch/powerpc/platforms/pseries/dlpar.c
 index 9752386..fc261e6 100644
 --- a/arch/powerpc/platforms/pseries/dlpar.c
 +++ b/arch/powerpc/platforms/pseries/dlpar.c
 @@ -644,6 +644,18 @@ static ssize_t memory_release_store(struct class *class, 
 const char *buf,
   return rc ? -1 : count;
  }
  
 +static DEFINE_MUTEX(pseries_cpu_hotplug_mutex);
 +
 +void cpu_hotplug_driver_lock()
 +{
  +	mutex_lock(&pseries_cpu_hotplug_mutex);
 +}
 +
 +void cpu_hotplug_driver_unlock()
 +{
  +	mutex_unlock(&pseries_cpu_hotplug_mutex);
 +}
 +
  static ssize_t cpu_probe_store(struct class *class, const char *buf,
  size_t count)
  {
 @@ -656,14 +668,15 @@ static ssize_t cpu_probe_store(struct class *class, 
 const char *buf,
   if (rc)
   return -EINVAL;
  
 + cpu_hotplug_driver_lock();
   rc = acquire_drc(drc_index);
   if (rc)
 - return rc;
 + goto out;
  
   dn = configure_connector(drc_index);
   if (!dn) {
   release_drc(drc_index);
 - return rc;
 + goto out;
   }
  
   /* fixup dn name */
 @@ -672,7 +685,8 @@ static ssize_t cpu_probe_store(struct class *class, const 
 char *buf,
   if (!cpu_name) {
   free_cc_nodes(dn);
   release_drc(drc_index);
 - return -ENOMEM;
 + rc = -ENOMEM;
 + goto out;
   }
  
   sprintf(cpu_name, "/cpus/%s", dn->full_name);
 @@ -684,6 +698,8 @@ static ssize_t cpu_probe_store(struct class *class, const 
 char *buf,
   release_drc(drc_index);
  
   rc = online_node_cpus(dn);
 +out:
 + cpu_hotplug_driver_unlock();
  
   return rc ? rc : count;
  }
 @@ -705,6 +721,7 @@ static ssize_t cpu_release_store(struct class *class, 
 const char *buf,
   return -EINVAL;
   }
  
 + cpu_hotplug_driver_lock();
   rc = offline_node_cpus(dn);
  
   if (rc)
 @@ -713,7 +730,7 @@ static ssize_t cpu_release_store(struct class *class, 
 const char *buf,
   rc = release_drc(*drc_index);
   if (rc) {
   of_node_put(dn);
 - return rc;
 + goto out;
   }
  
   rc = remove_device_tree_nodes(dn);
 @@ -723,6 +740,7 @@ static ssize_t cpu_release_store(struct class *class, 
 const char *buf,
   of_node_put(dn);
  
  out:
 + cpu_hotplug_driver_unlock();
   return rc ? rc : count;
  }
  
 diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
 index e62a4cc..07c3f05 100644
 --- a/drivers/base/cpu.c
 +++ b/drivers/base/cpu.c
 @@ -35,6 +35,7 @@ static ssize_t __ref store_online(struct sys_device *dev, 
 struct sysdev_attribut
   struct cpu *cpu = container_of(dev, struct cpu, sysdev);
   ssize_t ret;
  
 + cpu_hotplug_driver_lock();
   switch (buf[0]) {
   case '0':
   ret = cpu_down(cpu->sysdev.id);
 @@ -49,6 +50,7 @@ static ssize_t __ref store_online(struct sys_device *dev, 
 struct sysdev_attribut
   default:
   ret = -EINVAL;
   }
 + cpu_hotplug_driver_unlock();
  
   if (ret >= 0)
   ret = count;
 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
 index 4753619..b0ad4e1 100644
 --- a/include/linux/cpu.h
 +++ b/include/linux/cpu.h
 @@ -115,6 +115,19 @@ extern void put_online_cpus(void);
  #define unregister_hotcpu_notifier(nb)   unregister_cpu_notifier(nb)
  int cpu_down(unsigned int cpu);
  
 +#ifdef CONFIG_PPC_PSERIES
 +extern void cpu_hotplug_driver_lock(void);
 +extern void 

Is there a patch for MPC8548 XOR?

2009-10-26 Thread hank peng
I want to use its XOR engine to compute RAID5 parity, but I can't
find this functionality in 2.6.30 as downloaded from kernel.org. Does
someone know if there is a patch?

-- 
The simplest is not all best but the best is surely the simplest!
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [2/6] Cleanup management of kmem_caches for pagetables

2009-10-26 Thread David Gibson
On Tue, Oct 27, 2009 at 01:28:19PM +1100, Benjamin Herrenschmidt wrote:
 On Fri, 2009-10-16 at 16:22 +1100, David Gibson wrote:
 
 Minor nits... if you can respin today I should push it out to -next
 
  +void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
  +{
  +   char *name;
  +	unsigned long table_size = sizeof(void *) << shift;
  +   unsigned long align = table_size;
 
 This is a bit thick.. could use some air. Just separate the definitions
 from the assignments so you can make the code breath a bit :-)

Ok.

 Also the above warrants a comment explaining that this won't work for
 PTE pages since sizeof(PTE) >= sizeof(void *) and the day we finally
 move out of pte page == struct page, the code here will have to be
 adapted.

Ok.

[snip]
   void pgtable_cache_init(void)
   {
  -   pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], 
  PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
  -   pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], 
  PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
  +   pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
  +   pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
  +   if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
  +	panic("Couldn't allocate pgtable caches");
  +	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
   }
 
 panic vs. BUG_ON() ... could be a bit more consistent.

Uh.. there is actually a rationale for the difference here.  The
panic() is due to a runtime error - couldn't allocate the caches -
which isn't necessarily a kernel bug (could be a hardware error, or
being ludicrously short on memory).

The trick is that allocating the PGD and PMD caches is supposed to
also create the PUD cache, because the PUD index size is always the
same as either the PGD or PMD one.  If that's not true, we've broken
the assumptions the code is based on, hence BUG().

   #ifdef CONFIG_SPARSEMEM_VMEMMAP
  Index: working-2.6/arch/powerpc/include/asm/pgalloc-64.h
  ===
  --- working-2.6.orig/arch/powerpc/include/asm/pgalloc-64.h  2009-10-16 
  12:53:45.0 +1100
  +++ working-2.6/arch/powerpc/include/asm/pgalloc-64.h   2009-10-16 
  12:53:51.0 +1100
  @@ -11,27 +11,30 @@
   #include linux/cpumask.h
   #include linux/percpu.h
   
  +/*
  + * This needs to be big enough to allow any pagetable sizes we need,
  + * but small enough to fit in the low bits of any page table pointer.
  + * In other words all pagetables, even tiny ones, must be aligned to
  + * allow at least enough low 0 bits to contain this value.
  + */
  +#define MAX_PGTABLE_INDEX_SIZE 0xf
 
 This also has the constraint of being a (power of 2) - 1... worth
 mentioning somewhere ?
 
 Also if you could comment somewhere that index size == 0 means a PTE
 page ? Not totally obvious at first.

Ok, I've expanded on this comment.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [2/6] Cleanup management of kmem_caches for pagetables

2009-10-26 Thread Benjamin Herrenschmidt
On Tue, 2009-10-27 at 14:46 +1100, David Gibson wrote:
 
 The trick is that allocating the PGD and PMD caches is supposed to
 also create the PUD cache, because the PUD index size is always the
  same as either the PGD or PMD one.  If that's not true, we've broken
 the assumptions the code is based on, hence BUG(). 

Ok, so maybe a little comment with the above explanation concerning
the PUD index size being the same as the PGD or PMD one would be
useful :-)

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [3/6] Allow more flexible layouts for hugepage pagetables

2009-10-26 Thread David Gibson
On Tue, Oct 27, 2009 at 02:10:59PM +1100, Benjamin Herrenschmidt wrote:
 On Fri, 2009-10-16 at 16:22 +1100, David Gibson wrote:
 
 So far haven't seen anything blatantly wrong, in fact, this patch
 results in some nice cleanups.
 
 One thing tho...
 
  -#ifdef CONFIG_HUGETLB_PAGE
  -   /* Handle hugepage regions */
  -	if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
  -   DBG_LOW( - huge page !\n);
  -   return hash_huge_page(mm, access, ea, vsid, local, trap);
  -   }
  -#endif /* CONFIG_HUGETLB_PAGE */
  -
   #ifndef CONFIG_PPC_64K_PAGES
  /* If we use 4K pages and our psize is not 4K, then we are hitting
   * a special driver mapping, we need to align the address before
  @@ -961,12 +954,18 @@ int hash_page(unsigned long ea, unsigned
   #endif /* CONFIG_PPC_64K_PAGES */
 
 You basically made the above code run with huge pages. This may not
 be what you want ... It will result in cropping the low EA bits probably
 at a stage where you don't want that (it might also be a non-issue, I
 just want you to double check :-)

Ok, I've done that, and adjusted the comment accordingly.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 6/6] powerpc: Export powerpc_debugfs_root

2009-10-26 Thread Anton Blanchard

Kernel modules should be able to place their debug output inside our powerpc
debugfs directory.
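
For example, a module can then do something like this (sketch; "mydriver"
is a placeholder name):

	#include <linux/module.h>
	#include <linux/debugfs.h>

	extern struct dentry *powerpc_debugfs_root;	/* exported below */

	static struct dentry *dir;

	static int __init mydriver_init(void)
	{
		dir = debugfs_create_dir("mydriver", powerpc_debugfs_root);
		return dir ? 0 : -ENODEV;
	}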

Signed-off-by: Anton Blanchard an...@samba.org
---

Index: linux.trees.git/arch/powerpc/kernel/setup-common.c
===
--- linux.trees.git.orig/arch/powerpc/kernel/setup-common.c 2009-10-27 
12:59:00.0 +1100
+++ linux.trees.git/arch/powerpc/kernel/setup-common.c  2009-10-27 
12:59:15.0 +1100
@@ -660,6 +660,7 @@ late_initcall(check_cache_coherency);
 
 #ifdef CONFIG_DEBUG_FS
 struct dentry *powerpc_debugfs_root;
+EXPORT_SYMBOL(powerpc_debugfs_root);
 
 static int powerpc_debugfs_init(void)
 {
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 3/6] powerpc: tracing: Add hypervisor call tracepoints

2009-10-26 Thread Anton Blanchard

Add hcall_entry and hcall_exit tracepoints. This replaces the inline
assembly HCALL_STATS code and converts it to use the new tracepoints.

To keep the disabled case as quick as possible, we embed a status word
in the TOC so we can get at it with a single load. By doing so we
keep the overhead at a minimum. Time taken for a null hcall:

No tracepoint code: 135.79 cycles
Disabled tracepoints:   137.95 cycles

For reference, before this patch enabling HCALL_STATS resulted in a null
hcall of 201.44 cycles!
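
Conceptually, the guard amounts to this C (a rendering of the assembly
below, not literal kernel code; the single TOC load is the entire cost of
the disabled case):

	extern long hcall_tracepoint_refcount;	/* lives in the TOC */

	if (unlikely(hcall_tracepoint_refcount))
		__trace_hcall_entry(opcode);	/* slow path, tracing active */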

Signed-off-by: Anton Blanchard an...@samba.org
---

Index: linux.trees.git/arch/powerpc/platforms/pseries/hvCall.S
===
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/hvCall.S
2009-10-27 13:36:05.0 +1100
+++ linux.trees.git/arch/powerpc/platforms/pseries/hvCall.S 2009-10-27 
14:53:21.0 +1100
@@ -14,20 +14,54 @@

 #define STK_PARM(i) (48 + ((i)-3)*8)
 
-#ifdef CONFIG_HCALL_STATS
+#ifdef CONFIG_TRACEPOINTS
+
+   .section.toc,aw
+
+   .globl hcall_tracepoint_refcount
+hcall_tracepoint_refcount:
+   .llong  0
+
+   .section.text
+
 /*
  * precall must preserve all registers.  use unused STK_PARM()
- * areas to save snapshots and opcode.
+ * areas to save snapshots and opcode. We branch around this
+ * in early init (eg when populating the MMU hashtable) by using an
+ * unconditional cpu feature.
  */
 #define HCALL_INST_PRECALL \
-   std r3,STK_PARM(r3)(r1);/* save opcode */   \
 -	mftb	r0; /* get timebase and */  \
-   std r0,STK_PARM(r5)(r1);/* save for later */\
 BEGIN_FTR_SECTION; \
-   mfspr   r0,SPRN_PURR;   /* get PURR and */  \
-   std r0,STK_PARM(r6)(r1);/* save for later */\
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);
-   
+   b   1f; \
+END_FTR_SECTION(0, 1); \
 +	ld	r12,hcall_tracepoint_refcount@toc(r2);	\
 +	cmpdi	r12,0;					\
 +	beq+	1f;					\
 +	mflr	r0;					\
 +	std	r3,STK_PARM(r3)(r1);			\
 +	std	r4,STK_PARM(r4)(r1);			\
 +	std	r5,STK_PARM(r5)(r1);			\
 +	std	r6,STK_PARM(r6)(r1);			\
 +	std	r7,STK_PARM(r7)(r1);			\
 +	std	r8,STK_PARM(r8)(r1);			\
 +	std	r9,STK_PARM(r9)(r1);			\
 +	std	r10,STK_PARM(r10)(r1);			\
 +	std	r0,16(r1);				\
 +	stdu	r1,-STACK_FRAME_OVERHEAD(r1);		\
 +	bl	.__trace_hcall_entry;			\
 +	addi	r1,r1,STACK_FRAME_OVERHEAD;		\
 +	ld	r0,16(r1);				\
 +	ld	r3,STK_PARM(r3)(r1);			\
 +	ld	r4,STK_PARM(r4)(r1);			\
 +	ld	r5,STK_PARM(r5)(r1);			\
 +	ld	r6,STK_PARM(r6)(r1);			\
 +	ld	r7,STK_PARM(r7)(r1);			\
 +	ld	r8,STK_PARM(r8)(r1);			\
 +	ld	r9,STK_PARM(r9)(r1);			\
 +	ld	r10,STK_PARM(r10)(r1);			\
 +	mtlr	r0;					\
 +1:
+
 /*
  * postcall is performed immediately before function return which
  * allows liberal use of volatile registers.  We branch around this
@@ -38,40 +72,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_PURR);
 BEGIN_FTR_SECTION; \
b   1f; \
 END_FTR_SECTION(0, 1); \
-   ld  r4,STK_PARM(r3)(r1);/* validate opcode */   \
-   cmpldi  cr7,r4,MAX_HCALL_OPCODE;\
-   bgt-cr7,1f; \
-   \
-   /* get time and PURR snapshots after hcall */   \
 -	mftb	r7; /* timebase after */\
-BEGIN_FTR_SECTION; \
-   mfspr   r8,SPRN_PURR;   /* PURR after */\
-   ld  r6,STK_PARM(r6)(r1);/* PURR before */   \
 -	subf	r6,r6,r8;   /* delta */ \
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);   \
-   ld  r5,STK_PARM(r5)(r1);/* 

[PATCH 1/6] powerpc: tracing: Add powerpc tracepoints for interrupt entry and exit

2009-10-26 Thread Anton Blanchard

This patch adds powerpc specific tracepoints for interrupt entry and exit.

While we already have generic irq_handler_entry and irq_handler_exit
tracepoints there are cases on our virtualised powerpc machines where an
interrupt is presented to the OS, but subsequently handled by the hypervisor.
This means no OS interrupt handler is invoked.

Here is an example on a POWER6 machine with the patch below applied:
 
idle-0 [006]  3243.949840744: irq_entry: pt_regs=c000ce31fb10
idle-0 [006]  3243.949850520: irq_exit: pt_regs=c000ce31fb10

idle-0 [007]  3243.950218208: irq_entry: pt_regs=c000ce323b10
idle-0 [007]  3243.950224080: irq_exit: pt_regs=c000ce323b10

idle-0 [000]  3244.021879320: irq_entry: pt_regs=c0a63aa0
idle-0 [000]  3244.021883616: irq_handler_entry: irq=87 handler=eth0
idle-0 [000]  3244.021887328: irq_handler_exit: irq=87 return=handled
idle-0 [000]  3244.021897408: irq_exit: pt_regs=c0a63aa0

Here we see two phantom interrupts (no handler was invoked), followed
by a real interrupt for eth0. Without the tracepoints in this patch we
would have missed the phantom interrupts.
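
Once applied, the events appear under the powerpc system and can be enabled
the usual way (assuming debugfs is mounted at /sys/kernel/debug):

	echo 1 > /sys/kernel/debug/tracing/events/powerpc/irq_entry/enable
	echo 1 > /sys/kernel/debug/tracing/events/powerpc/irq_exit/enable
	cat /sys/kernel/debug/tracing/trace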

Signed-off-by: Anton Blanchard an...@samba.org
Acked-by: Steven Rostedt rost...@goodmis.org
--

No change to this patch.

Index: linux.trees.git/arch/powerpc/include/asm/trace.h
===
--- /dev/null   1970-01-01 00:00:00.0 +
+++ linux.trees.git/arch/powerpc/include/asm/trace.h2009-10-17 
08:45:08.0 +1100
@@ -0,0 +1,53 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM powerpc
+
+#if !defined(_TRACE_POWERPC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_POWERPC_H
+
+#include linux/tracepoint.h
+
+struct pt_regs;
+
+TRACE_EVENT(irq_entry,
+
+   TP_PROTO(struct pt_regs *regs),
+
+   TP_ARGS(regs),
+
+   TP_STRUCT__entry(
+   __field(struct pt_regs *, regs)
+   ),
+
+   TP_fast_assign(
+	__entry->regs = regs;
+   ),
+
+	TP_printk("pt_regs=%p", __entry->regs)
+);
+
+TRACE_EVENT(irq_exit,
+
+   TP_PROTO(struct pt_regs *regs),
+
+   TP_ARGS(regs),
+
+   TP_STRUCT__entry(
+   __field(struct pt_regs *, regs)
+   ),
+
+   TP_fast_assign(
+	__entry->regs = regs;
+   ),
+
+	TP_printk("pt_regs=%p", __entry->regs)
+);
+
+#endif /* _TRACE_POWERPC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH asm
+#define TRACE_INCLUDE_FILE trace
+
+#include trace/define_trace.h
Index: linux.trees.git/arch/powerpc/kernel/irq.c
===
--- linux.trees.git.orig/arch/powerpc/kernel/irq.c  2009-10-17 
08:44:32.0 +1100
+++ linux.trees.git/arch/powerpc/kernel/irq.c   2009-10-17 08:45:44.0 
+1100
@@ -70,6 +70,8 @@
 #include asm/firmware.h
 #include asm/lv1call.h
 #endif
+#define CREATE_TRACE_POINTS
+#include asm/trace.h
 
 int __irq_offset_value;
 static int ppc_spurious_interrupts;
@@ -325,6 +327,8 @@ void do_IRQ(struct pt_regs *regs)
struct pt_regs *old_regs = set_irq_regs(regs);
unsigned int irq;
 
+   trace_irq_entry(regs);
+
irq_enter();
 
check_stack_overflow();
@@ -348,6 +352,8 @@ void do_IRQ(struct pt_regs *regs)
timer_interrupt(regs);
}
 #endif
+
+   trace_irq_exit(regs);
 }
 
 void __init init_IRQ(void)
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 5/6] powerpc: Disable HCALL_STATS by default

2009-10-26 Thread Anton Blanchard

The overhead of HCALL_STATS is quite high and the functionality is very
rarely used. Key statistics are also missing (eg min/max).

With the new hcall tracepoints much more powerful tracing can be done in
a kernel module. Let's disable this by default.

Signed-off-by: Anton Blanchard an...@samba.org
---

Index: linux.trees.git/arch/powerpc/configs/pseries_defconfig
===
--- linux.trees.git.orig/arch/powerpc/configs/pseries_defconfig 2009-10-27 
14:56:58.0 +1100
+++ linux.trees.git/arch/powerpc/configs/pseries_defconfig  2009-10-27 
14:57:11.0 +1100
@@ -1683,7 +1683,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 # CONFIG_DEBUG_STACK_USAGE is not set
 # CONFIG_DEBUG_PAGEALLOC is not set
-CONFIG_HCALL_STATS=y
+# CONFIG_HCALL_STATS is not set
 # CONFIG_CODE_PATCHING_SELFTEST is not set
 # CONFIG_FTR_FIXUP_SELFTEST is not set
 # CONFIG_MSI_BITMAP_SELFTEST is not set
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/6] powerpc: tracing: Add powerpc tracepoints for timer entry and exit

2009-10-26 Thread Anton Blanchard

We can monitor the effectiveness of our power management of both the
kernel and hypervisor by probing the timer interrupt. For example, on
this box we see 10.37s timer interrupts on an idle core:

idle-0 [010]  3900.671297: timer_interrupt_entry: pt_regs=c000ce1e7b10
idle-0 [010]  3900.671302: timer_interrupt_exit: pt_regs=c000ce1e7b10

idle-0 [010]  3911.042963: timer_interrupt_entry: pt_regs=c000ce1e7b10
idle-0 [010]  3911.042968: timer_interrupt_exit: pt_regs=c000ce1e7b10

idle-0 [010]  3921.414630: timer_interrupt_entry: pt_regs=c000ce1e7b10
idle-0 [010]  3921.414635: timer_interrupt_exit: pt_regs=c000ce1e7b10

Since we have a 207MHz decrementer it will go negative and fire every 10.37s
even if Linux is completely idle.
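
(That interval is just the decrementer sign bit wrapping: 2^31 cycles /
207 MHz = 2147483648 / 207000000 ~= 10.37 s.)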

Signed-off-by: Anton Blanchard an...@samba.org
---

Index: linux.trees.git/arch/powerpc/kernel/time.c
===
--- linux.trees.git.orig/arch/powerpc/kernel/time.c 2009-10-07 
17:21:21.0 +1100
+++ linux.trees.git/arch/powerpc/kernel/time.c  2009-10-07 17:21:52.0 
+1100
@@ -54,6 +54,7 @@
 #include linux/irq.h
 #include linux/delay.h
 #include linux/perf_event.h
+#include asm/trace.h
 
 #include asm/io.h
 #include asm/processor.h
@@ -571,6 +572,8 @@ void timer_interrupt(struct pt_regs * re
	struct clock_event_device *evt = &decrementer->event;
u64 now;
 
+   trace_timer_interrupt_entry(regs);
+
/* Ensure a positive value is written to the decrementer, or else
	 * some CPUs will continue to take decrementer exceptions */
set_dec(DECREMENTER_MAX);
@@ -590,6 +593,7 @@ void timer_interrupt(struct pt_regs * re
	now = decrementer->next_tb - now;
if (now = DECREMENTER_MAX)
set_dec((int)now);
+   trace_timer_interrupt_exit(regs);
return;
}
old_regs = set_irq_regs(regs);
@@ -620,6 +624,8 @@ void timer_interrupt(struct pt_regs * re
 
irq_exit();
set_irq_regs(old_regs);
+
+   trace_timer_interrupt_exit(regs);
 }
 
 void wakeup_decrementer(void)
Index: linux.trees.git/arch/powerpc/include/asm/trace.h
===
--- linux.trees.git.orig/arch/powerpc/include/asm/trace.h   2009-10-07 
17:22:25.0 +1100
+++ linux.trees.git/arch/powerpc/include/asm/trace.h2009-10-07 
17:23:20.0 +1100
@@ -42,6 +42,40 @@ TRACE_EVENT(irq_exit,
	TP_printk("pt_regs=%p", __entry->regs)
 );
 
+TRACE_EVENT(timer_interrupt_entry,
+
+   TP_PROTO(struct pt_regs *regs),
+
+   TP_ARGS(regs),
+
+   TP_STRUCT__entry(
+   __field(struct pt_regs *, regs)
+   ),
+
+   TP_fast_assign(
+	__entry->regs = regs;
+   ),
+
+	TP_printk("pt_regs=%p", __entry->regs)
+);
+
+TRACE_EVENT(timer_interrupt_exit,
+
+   TP_PROTO(struct pt_regs *regs),
+
+   TP_ARGS(regs),
+
+   TP_STRUCT__entry(
+   __field(struct pt_regs *, regs)
+   ),
+
+   TP_fast_assign(
+	__entry->regs = regs;
+   ),
+
+	TP_printk("pt_regs=%p", __entry->regs)
+);
+
 #endif /* _TRACE_POWERPC_H */
 
 #undef TRACE_INCLUDE_PATH
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 4/6] powerpc: tracing: Give hypervisor call tracepoints access to arguments

2009-10-26 Thread Anton Blanchard

While most users of the hcall tracepoints will only want the opcode and return
code, some will want all the arguments. To avoid the complexity of using
varargs we pass a pointer to the register save area which contain all
arguments.

Signed-off-by: Anton Blanchard an...@samba.org
---

Index: linux.trees.git/arch/powerpc/platforms/pseries/hvCall.S
===
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/hvCall.S
2009-10-27 14:29:09.0 +1100
+++ linux.trees.git/arch/powerpc/platforms/pseries/hvCall.S 2009-10-27 
14:29:16.0 +1100
@@ -30,7 +30,7 @@ hcall_tracepoint_refcount:
  * in early init (eg when populating the MMU hashtable) by using an
  * unconditional cpu feature.
  */
-#define HCALL_INST_PRECALL \
+#define HCALL_INST_PRECALL(FIRST_REG)  \
 BEGIN_FTR_SECTION; \
b   1f; \
 END_FTR_SECTION(0, 1); \
@@ -47,6 +47,7 @@ END_FTR_SECTION(0, 1);
\
std r9,STK_PARM(r9)(r1);\
std r10,STK_PARM(r10)(r1);  \
std r0,16(r1);  \
+	addi	r4,r1,STK_PARM(FIRST_REG);	\
 	stdu	r1,-STACK_FRAME_OVERHEAD(r1);	\
bl  .__trace_hcall_entry;   \
 	addi	r1,r1,STACK_FRAME_OVERHEAD;	\
@@ -68,7 +69,7 @@ END_FTR_SECTION(0, 1);
\
  * in early init (eg when populating the MMU hashtable) by using an
  * unconditional cpu feature.
  */
-#define HCALL_INST_POSTCALL\
+#define __HCALL_INST_POSTCALL  \
 BEGIN_FTR_SECTION; \
b   1f; \
 END_FTR_SECTION(0, 1); \
@@ -88,9 +89,19 @@ END_FTR_SECTION(0, 1);   
\
ld  r3,STK_PARM(r3)(r1);\
 	mtlr	r0;	\
 1:
+
+#define HCALL_INST_POSTCALL_NORETS \
+   li  r5,0;   \
+   __HCALL_INST_POSTCALL
+
+#define HCALL_INST_POSTCALL(BUFREG)\
+   mr  r5,BUFREG;  \
+   __HCALL_INST_POSTCALL
+
 #else
-#define HCALL_INST_PRECALL
-#define HCALL_INST_POSTCALL
+#define HCALL_INST_PRECALL(FIRST_ARG)
+#define HCALL_INST_POSTCALL_NORETS
+#define HCALL_INST_POSTCALL(BUFREG)
 #endif
 
.text
@@ -101,11 +112,11 @@ _GLOBAL(plpar_hcall_norets)
 	mfcr	r0
stw r0,8(r1)
 
-   HCALL_INST_PRECALL
+   HCALL_INST_PRECALL(r4)
 
 	HVSC		/* invoke the hypervisor */
 
-   HCALL_INST_POSTCALL
+   HCALL_INST_POSTCALL_NORETS
 
lwz r0,8(r1)
mtcrf   0xff,r0
@@ -117,7 +128,7 @@ _GLOBAL(plpar_hcall)
 	mfcr	r0
stw r0,8(r1)
 
-   HCALL_INST_PRECALL
+   HCALL_INST_PRECALL(r5)
 
std r4,STK_PARM(r4)(r1) /* Save ret buffer */
 
@@ -136,7 +147,7 @@ _GLOBAL(plpar_hcall)
std r6, 16(r12)
std r7, 24(r12)
 
-   HCALL_INST_POSTCALL
+   HCALL_INST_POSTCALL(r12)
 
lwz r0,8(r1)
mtcrf   0xff,r0
@@ -183,7 +194,7 @@ _GLOBAL(plpar_hcall9)
 	mfcr	r0
stw r0,8(r1)
 
-   HCALL_INST_PRECALL
+   HCALL_INST_PRECALL(r5)
 
std r4,STK_PARM(r4)(r1) /* Save ret buffer */
 
@@ -211,7 +222,7 @@ _GLOBAL(plpar_hcall9)
std r11,56(r12)
std r0, 64(r12)
 
-   HCALL_INST_POSTCALL
+   HCALL_INST_POSTCALL(r12)
 
lwz r0,8(r1)
mtcrf   0xff,r0
Index: linux.trees.git/arch/powerpc/include/asm/trace.h
===
--- linux.trees.git.orig/arch/powerpc/include/asm/trace.h   2009-10-27 
14:28:15.0 +1100
+++ linux.trees.git/arch/powerpc/include/asm/trace.h2009-10-27 
14:29:16.0 +1100
@@ -81,9 +81,9 @@ extern void hcall_tracepoint_unregfunc(v
 
 TRACE_EVENT_FN(hcall_entry,
 
-   TP_PROTO(unsigned long opcode),
+   TP_PROTO(unsigned long opcode, unsigned long *args),
 
-   TP_ARGS(opcode),
+   TP_ARGS(opcode, args),
 
TP_STRUCT__entry(
__field(unsigned long, opcode)
@@ -100,9 +100,10 @@ TRACE_EVENT_FN(hcall_entry,
 
 TRACE_EVENT_FN(hcall_exit,
 
-   TP_PROTO(unsigned long opcode, unsigned long retval),
+   TP_PROTO(unsigned long opcode, unsigned long retval,
+   

hypervisor call trace module

2009-10-26 Thread Anton Blanchard

Here is an example of using the hcall tracepoints. This kernel
module provides strace like functionality for hypervisor hcalls:

-> 0x64(ff02, 1, 2, d34d7a71, f, c0a6f388, 1, c0989008, c0a3f480)
  <- 0x64()

Which was an EOI (opcode 0x64) of 0xff02

There are a number of drivers that carry a lot of hcall related debug
code just in case we have to chase down a bug. I'm hoping hcall tracepoints
could replace it all and allow for much more powerful debugging.

Anton
obj-m := hcall_trace.o
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)
default:
$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules

clean:
	rm -rf *.mod.c *.ko *.o .*.cmd .tmp_versions Module.markers modules.order Module.symvers
/*
 * Hypervisor hcall trace
 *
 * Look for output in /sys/kernel/debug/powerpc/hcall_trace/
 * 
 * Copyright (C) 2009 Anton Blanchard an...@au.ibm.com, IBM
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */ 

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/relay.h>
#include <asm/hvcall.h>	/* assumed needed here for H_CEDE */
#include <asm/trace.h>

#define SUBBUF_SIZE	131072
#define N_SUBBUFS	8

#define BUFLEN		512

static struct rchan *log_chan;

static void probe_hcall_entry(unsigned long opcode, unsigned long *args)
{
	char buf[BUFLEN];

	/* Don't log H_CEDE */
	if (opcode == H_CEDE)
		return;

	snprintf(buf, BUFLEN,
		"-> 0x%lx(%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx)\n",
		opcode, *args, *(args+1), *(args+2), *(args+3), *(args+4),
		*(args+5), *(args+6), *(args+7), *(args+8));

	relay_write(log_chan, buf, strlen(buf));
}

static void probe_hcall_exit(unsigned long opcode, unsigned long retval,
			 unsigned long *retbuf)
{
	char buf[BUFLEN];

	/* Don't log H_CEDE */
	if (opcode == H_CEDE)
		return;

	if (retbuf)
		snprintf(buf, BUFLEN, 
			"  <- 0x%lx(%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx)\n",
			opcode, *retbuf, *(retbuf+1),
			*(retbuf+2), *(retbuf+3), *(retbuf+4), *(retbuf+5),
			*(retbuf+6), *(retbuf+7), *(retbuf+8));
	else
		sprintf(buf, "  <- 0x%lx()\n", opcode);

	relay_write(log_chan, buf, strlen(buf));
}

static struct dentry *create_buf_file_handler(const char *filename,
	  struct dentry *parent, int mode,
	  struct rchan_buf *buf,
	  int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static int remove_buf_file_handler(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static int subbuf_start(struct rchan_buf *buf, void *subbuf, void *prev_subbuf,
			size_t prev_padding)
{
	return 1;
}

static struct rchan_callbacks relay_callbacks =
{
	.create_buf_file = create_buf_file_handler,
	.remove_buf_file = remove_buf_file_handler,
	.subbuf_start = subbuf_start,
};

static struct dentry *debugfs_root;

static int __init hcall_trace_init(void)
{
	debugfs_root = debugfs_create_dir("hcall_trace", powerpc_debugfs_root);

	if (debugfs_root == ERR_PTR(-ENODEV)) {
		printk(KERN_ERR "Debugfs not configured\n");
		goto err_out;
	}

	if (!debugfs_root) {
		printk(KERN_ERR "Could not create debugfs directory\n");
		goto err_out;
	}

	log_chan = relay_open("cpu", debugfs_root, SUBBUF_SIZE,
			      N_SUBBUFS, &relay_callbacks, NULL);
	if (!log_chan) {
		printk(KERN_ERR "relay_open failed\n");
		goto err_relay_open;
	}

	if (register_trace_hcall_entry(probe_hcall_entry)) {
		printk(KERN_ERR "probe_hcall_entry probe point failed\n");
		goto err_probe_hcall_entry;
	}

	if (register_trace_hcall_exit(probe_hcall_exit)) {
		printk(KERN_ERR "probe_hcall_exit probe point failed\n");
		goto err_probe_hcall_exit;
	}

	return 0;

err_probe_hcall_exit:
	unregister_trace_hcall_entry(probe_hcall_entry);
err_probe_hcall_entry:
	relay_close(log_chan);
err_relay_open:
	debugfs_remove(debugfs_root);
err_out:
	return -ENODEV;
}

static void __exit hcall_trace_exit(void)
{
	unregister_trace_hcall_exit(probe_hcall_exit);
	unregister_trace_hcall_entry(probe_hcall_entry);

	relay_close(log_chan);
	debugfs_remove(debugfs_root);
}

module_init(hcall_trace_init)
module_exit(hcall_trace_exit)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Anton Blanchard");
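
Once built and loaded (assuming debugfs is mounted at
/sys/kernel/debug), the per-cpu relay buffers appear as
/sys/kernel/debug/powerpc/hcall_trace/cpu0, cpu1, and so on, and can be
read with cat; each traced hcall shows up in the -> / <- format of the
example above.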

[0/6] Assorted hugepage cleanups (v4)

2009-10-26 Thread David Gibson
Currently, ordinary pages use one pagetable layout, and each different
hugepage size uses a slightly different variant layout.  A number of
places which need to walk the pagetables must first check the slice map
to see what the pagetable layout is, then handle the various different
forms.  New hardware, like Book3E, is liable to introduce more possible
variants.

This patch series, therefore, is designed to simplify the matter by
limiting knowledge of the pagetable layout to only the allocation
path.  With this patch, ordinary pages are handled as ever, with a
fixed 4 (or 3) level tree.  All other variants branch off from some
layer of that with a specially marked PGD/PUD/PMD pointer which also
contains enough information to interpret the directories below that
point.  This means that things walking the pagetables (without
allocating) don't need to look up the slice map, they can just step
down the tree in the usual way, branching off to the non-standard
layout path for hugepages, which uses the embedded information to
interpret the tree from that point on.
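
To make the self-describing entry idea concrete, here is a minimal,
runnable user-space sketch of the encoding (names and constants here
are illustrative only, not the kernel's): the MSB distinguishes a
normal linear-map pointer (MSB set on ppc64) from a hugepage directory
pointer (MSB clear), and the low bits of a hugepage entry carry the
shift needed to interpret the table below it.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SHIFT_MASK 0x3fUL	/* low bits reserved for the shift */

static uint64_t make_hugepd(uint64_t table, unsigned int shift)
{
	assert((table & SHIFT_MASK) == 0);	/* table suitably aligned */
	return (table & ~(UINT64_C(1) << 63)) | shift; /* clear MSB, stash shift */
}

static int is_hugepd(uint64_t pd)
{
	/* normal linear-map pointers have the MSB set */
	return !(pd >> 63);
}

int main(void)
{
	uint64_t table = UINT64_C(0xc000000012340000); /* pretend address */
	uint64_t pd = make_hugepd(table, 24);	/* 16M pages: shift of 24 */

	if (is_hugepd(pd))
		printf("hugepage table, page shift %lu\n",
		       (unsigned long)(pd & SHIFT_MASK));
	return 0;
}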

This reduces the source size in a number of places, and means that
newer variants on the pagetable layout to handle new hardware and new
features will need to alter the existing code in fewer places.

In addition we split out the hash / classic MMU specific code into a
separate hugetlbpage-hash64.c file.  This will make adding support for
other MMUs (like 440 and/or Book3E) easier.

I've used the libhugetlbfs testsuite to test these patches on a
Power5+ machine, but they could certainly do with more testing. In
particular, I don't have any suitable hardware to test 16G pages.

V2: Made the tweaks that BenH suggested to patch 2 of the original
series.  Some corresponding tweaks in patch 3 to match.

V3: Fix a bug in the creation of the pgtable caches.  Slightly extend
the initialization cleanup.  Add a new patch cleaning up the hugepage
pte accessor functions.

V4: Revisions based on BenH's comments, fix compile breakage for
!CONFIG_HUGETLB_PAGE.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


[2/6] Cleanup management of kmem_caches for pagetables

2009-10-26 Thread David Gibson
Currently we have a fair bit of rather fiddly code to manage the
various kmem_caches used to store page tables of various levels.  We
generally have two caches holding some combination of PGD, PUD and PMD
tables, plus several more for the special hugepage pagetables.

This patch cleans this all up by taking a different approach.  Rather
than the caches being designated as "for PUDs" or "for hugeptes for 16M
pages", the caches are simply allocated to be a specific size.  Thus
sharing of caches between different types/levels of pagetables happens
naturally.  The pagetable size, where needed, is passed around encoded
in the same way as {PGD,PUD,PMD}_INDEX_SIZE; that is, n where the
pagetable contains 2^n pointers.
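
To illustrate the size encoding with a standalone sketch (not kernel
code; the index values below are examples only): an index size of n
means the table holds 2^n pointers, so its byte size is
sizeof(void *) << n.

#include <stdio.h>

int main(void)
{
	/* Example index sizes; real values depend on the configured
	 * page size. */
	unsigned int shifts[] = { 4, 9, 12 };
	unsigned int i;

	for (i = 0; i < sizeof(shifts) / sizeof(shifts[0]); i++)
		printf("index size %2u: %4lu pointers, %6zu bytes\n",
		       shifts[i], 1UL << shifts[i],
		       sizeof(void *) << shifts[i]);
	return 0;
}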

Signed-off-by: David Gibson d...@au1.ibm.com

---
 arch/powerpc/include/asm/pgalloc-64.h|   60 +++---
 arch/powerpc/include/asm/pgalloc.h   |   30 +
 arch/powerpc/include/asm/pgtable-ppc64.h |1 
 arch/powerpc/mm/hugetlbpage.c|   45 +--
 arch/powerpc/mm/init_64.c|   70 +--
 arch/powerpc/mm/pgtable.c|   25 +++
 6 files changed, 117 insertions(+), 114 deletions(-)

Index: working-2.6/arch/powerpc/mm/init_64.c
===
--- working-2.6.orig/arch/powerpc/mm/init_64.c  2009-10-27 15:30:17.0 
+1100
+++ working-2.6/arch/powerpc/mm/init_64.c   2009-10-27 15:37:04.0 
+1100
@@ -119,30 +119,58 @@ static void pmd_ctor(void *addr)
memset(addr, 0, PMD_TABLE_SIZE);
 }
 
-static const unsigned int pgtable_cache_size[2] = {
-   PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
-	"pgd_cache", "pmd_cache",
-#else
-	"pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need an extra cache per hugepagesize, initialized in
- * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
- * is not compile time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache() for pagetables.  This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else.  Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+   char *name;
+	unsigned long table_size = sizeof(void *) << shift;
+   unsigned long align = table_size;
+
+   /* When batching pgtable pointers for RCU freeing, we store
+* the index size in the low bits.  Table alignment must be
+* big enough to fit it */
+   unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1;
+   struct kmem_cache *new;
+
+	/* It would be nice if this was a BUILD_BUG_ON(), but at the
+	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
+	 * constant expression, so much for that. */
+	BUG_ON(!is_power_of_2(minalign));
+	BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+   if (PGT_CACHE(shift))
+   return; /* Already have a cache of this size */
+
+   align = max_t(unsigned long, align, minalign);
+	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+   new = kmem_cache_create(name, table_size, align, 0, ctor);
+   PGT_CACHE(shift) = new;
+
+	pr_debug("Allocated pgtable cache for order %d\n", shift);
+}
+
 
 void pgtable_cache_init(void)
 {
-	pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
-	pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
+	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+   pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+   pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+   if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+		panic("Couldn't allocate pgtable caches");
+
+   /* In all current configs, when the PUD index exists it's the
+* same size as either the pgd or pmd index.  Verify that the
+* initialization above has also created a PUD cache.  This
+	 * will need re-examination if we add new possibilities for
+	 * the pagetable layout. */
+	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
Index: working-2.6/arch/powerpc/include/asm/pgalloc-64.h
===
--- working-2.6.orig/arch/powerpc/include/asm/pgalloc-64.h  2009-10-27 
15:30:16.0 +1100
+++ 

[1/6] Make hpte_need_flush() correctly mask for multiple page sizes

2009-10-26 Thread David Gibson
Currently, hpte_need_flush() only correctly flushes the given address
for normal pages.  Callers for hugepages are required to mask the
address themselves.

But hpte_need_flush() already looks up the page sizes for its own
reasons, so this is a rather silly imposition on the callers.  This
patch alters it to mask based on the pagesize it has looked up itself,
and removes the awkward masking code in the hugepage caller.
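
As a worked example of the new masking: for a 16M hugepage the
looked-up shift is 24, so the mask is ~((1UL << 24) - 1) =
0xffffffffff000000 and an address like 0x1234567 masks down to
0x1000000; for a normal 4K page (shift 12) the same expression reduces
to the usual PAGE_MASK.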

Signed-off-by: David Gibson d...@au1.ibm.com

---
 arch/powerpc/mm/hugetlbpage.c |6 +-
 arch/powerpc/mm/tlb_hash64.c  |8 +++-
 2 files changed, 4 insertions(+), 10 deletions(-)

Index: working-2.6/arch/powerpc/mm/tlb_hash64.c
===
--- working-2.6.orig/arch/powerpc/mm/tlb_hash64.c   2009-09-04 
14:35:30.0 +1000
+++ working-2.6/arch/powerpc/mm/tlb_hash64.c2009-09-04 14:36:12.0 
+1000
@@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *m
 
i = batch-index;
 
-   /* We mask the address for the base page size. Huge pages will
-* have applied their own masking already
-*/
-	addr &= PAGE_MASK;
-
/* Get page size (maybe move back to caller).
 *
 * NOTE: when using special 64K mappings in 4K environment like
@@ -75,6 +70,9 @@ void hpte_need_flush(struct mm_struct *m
} else
psize = pte_pagesize_index(mm, addr, pte);
 
+   /* Mask the address for the correct page size */
+	addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+
/* Build full vaddr */
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
Index: working-2.6/arch/powerpc/mm/hugetlbpage.c
===
--- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c  2009-09-04 
14:35:30.0 +1000
+++ working-2.6/arch/powerpc/mm/hugetlbpage.c   2009-09-04 14:36:12.0 
+1000
@@ -445,11 +445,7 @@ void set_huge_pte_at(struct mm_struct *m
 * necessary anymore if we make hpte_need_flush() get the
 * page size from the slices
 */
-   unsigned int psize = get_slice_psize(mm, addr);
-   unsigned int shift = mmu_psize_to_shift(psize);
-	unsigned long sz = ((1UL) << shift);
-	struct hstate *hstate = size_to_hstate(sz);
-	pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
+   pte_update(mm, addr, ptep, ~0UL, 1);
}
*ptep = __pte(pte_val(pte)  ~_PAGE_HPTEFLAGS);
 }


[3/6] Allow more flexible layouts for hugepage pagetables

2009-10-26 Thread David Gibson
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottom level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level.  Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly.  Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.

This patch, therefore, reworks the handling of hugepage pagetables to
reduce this complexity.  In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (essentially the hugepage
shift) necessary to then interpret that table without recourse to the
slice mask.  This scheme can be extended neatly to handle multiple
levels of self-describing special hugepage pagetables, although for
now we assume only one level exists.

This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out.  All other (hugepage)
pagetable walking paths can just interpret the structure as they go.

There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug.  We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping).  This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.

While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.

Signed-off-by: David Gibson d...@au1.ibm.com

---
 arch/powerpc/include/asm/hugetlb.h   |1 
 arch/powerpc/include/asm/mmu-hash64.h|   14 
 arch/powerpc/include/asm/page.h  |   14 
 arch/powerpc/include/asm/pgtable-ppc64.h |   13 
 arch/powerpc/include/asm/pgtable.h   |3 
 arch/powerpc/kernel/perf_callchain.c |   20 -
 arch/powerpc/mm/gup.c|  149 +
 arch/powerpc/mm/hash_utils_64.c  |   26 -
 arch/powerpc/mm/hugetlbpage.c|  473 ++-
 arch/powerpc/mm/init_64.c|   10 
 10 files changed, 313 insertions(+), 410 deletions(-)

Index: working-2.6/arch/powerpc/mm/hugetlbpage.c
===
--- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c  2009-10-27 
15:35:27.0 +1100
+++ working-2.6/arch/powerpc/mm/hugetlbpage.c   2009-10-27 15:37:08.0 
+1100
@@ -40,25 +40,11 @@ static unsigned nr_gpages;
 /* Array of valid huge page sizes - non-zero value(hugepte_shift) is
  * stored for the huge page sizes that are valid.
  */
-unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
-
-#define hugepte_shift  mmu_huge_psizes
-#define HUGEPTE_INDEX_SIZE(psize)  (mmu_huge_psizes[(psize)])
-#define PTRS_PER_HUGEPTE(psize)	(1 << mmu_huge_psizes[psize])
-
-#define HUGEPD_SHIFT(psize)	(mmu_psize_to_shift(psize) \
-				 + HUGEPTE_INDEX_SIZE(psize))
-#define HUGEPD_SIZE(psize)	(1UL << HUGEPD_SHIFT(psize))
-#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1))
+static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
 
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
-#define HUGEPD_OK  0x1
-
-typedef struct { unsigned long pd; } hugepd_t;
-
-#define hugepd_none(hpd)   ((hpd).pd == 0)
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
@@ -82,71 +68,126 @@ static inline unsigned int mmu_psize_to_
BUG();
 }
 
+#define hugepd_none(hpd)   ((hpd).pd == 0)
+
 static inline pte_t *hugepd_page(hugepd_t hpd)
 {
-	BUG_ON(!(hpd.pd & HUGEPD_OK));
-	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+	BUG_ON(!hugepd_ok(hpd));
+	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
-   struct hstate *hstate)
+static inline unsigned int hugepd_shift(hugepd_t hpd)
 {
-   unsigned int shift = huge_page_shift(hstate);
-   int psize = shift_to_mmu_psize(shift);
-	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
+	return hpd.pd & HUGEPD_SHIFT_MASK;
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, 
unsigned pdshift)
+{

[4/6] Cleanup initialization of hugepages on powerpc

2009-10-26 Thread David Gibson
This patch simplifies the logic used to initialize hugepages on
powerpc.  The somewhat oddly named set_huge_psize() is renamed to
add_huge_page_size() and now does all necessary verification of
whether it's given a valid hugepage size (instead of just some) and
instantiates the generic hstate structure (but no more).  

hugetlbpage_init() now steps through the available pagesizes, checks
if they're valid for hugepages by calling add_huge_page_size() and
initializes the kmem_caches for the hugepage pagetables.  This means
we can now eliminate the mmu_huge_psizes array, since we no longer
need to pass the sizing information for the pagetable caches from
set_huge_psize() into hugetlbpage_init().

Determination of the default huge page size is also moved from the
hash code into the general hugepage code.
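
In isolation, the verification added here looks like this (a user-space
sketch with GCC's __builtin_ctzll standing in for the kernel's __ffs();
the sizes are examples only):

#include <stdio.h>

static int is_power_of_2(unsigned long long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	/* 16M and 16G are powers of two and pass; 3M is rejected */
	unsigned long long sizes[] = { 1ULL << 24, 1ULL << 34, 3ULL << 20 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		if (!is_power_of_2(sizes[i])) {
			printf("%#llx: rejected, not a power of two\n",
			       sizes[i]);
			continue;
		}
		printf("%#llx: page shift %d\n", sizes[i],
		       __builtin_ctzll(sizes[i]));
	}
	return 0;
}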

Signed-off-by: David Gibson d...@au1.ibm.com

---
 arch/powerpc/include/asm/page_64.h |2 
 arch/powerpc/mm/hash_utils_64.c|   10 --
 arch/powerpc/mm/hugetlbpage.c  |  130 +
 3 files changed, 64 insertions(+), 78 deletions(-)

Index: linux-a2/arch/powerpc/mm/hugetlbpage.c
===
--- linux-a2.orig/arch/powerpc/mm/hugetlbpage.c 2009-10-15 16:40:49.0 
+1100
+++ linux-a2/arch/powerpc/mm/hugetlbpage.c  2009-10-15 16:41:33.0 
+1100
@@ -37,27 +37,17 @@
 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
 
-/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
- * stored for the huge page sizes that are valid.
- */
-static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all 
to 0 */
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
-   switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
-   case PAGE_SHIFT_64K:
-   return MMU_PAGE_64K;
-#endif
-   case PAGE_SHIFT_16M:
-   return MMU_PAGE_16M;
-   case PAGE_SHIFT_16G:
-   return MMU_PAGE_16G;
-   }
+   int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
+   if (mmu_psize_defs[psize].shift == shift)
+   return psize;
return -1;
 }
 
@@ -502,8 +492,6 @@ unsigned long hugetlb_get_unmapped_area(
struct hstate *hstate = hstate_file(file);
int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
-   if (!mmu_huge_psizes[mmu_psize])
-   return -EINVAL;
return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 }
 
@@ -666,47 +654,46 @@ repeat:
return err;
 }
 
-static void __init set_huge_psize(int psize)
+static int __init add_huge_page_size(unsigned long long size)
 {
-   unsigned pdshift;
+   int shift = __ffs(size);
+   int mmu_psize;
 
/* Check that it is a page size supported by the hardware and
-* that it fits within pagetable limits. */
-	if (mmu_psize_defs[psize].shift &&
-	    mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
-	    (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
-	     mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
-	     mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
-   /* Return if huge page size has already been setup or is the
-* same as the base page size. */
-   if (mmu_huge_psizes[psize] ||
-  mmu_psize_defs[psize].shift == PAGE_SHIFT)
-   return;
-   hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
+* that it fits within pagetable and slice limits. */
+   if (!is_power_of_2(size)
+	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
+   return -EINVAL;
 
-	if (mmu_psize_defs[psize].shift < PMD_SHIFT)
-		pdshift = PMD_SHIFT;
-	else if (mmu_psize_defs[psize].shift < PUD_SHIFT)
-   pdshift = PUD_SHIFT;
-   else
-   pdshift = PGDIR_SHIFT;
-   mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift;
-   }
+	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
+   return -EINVAL;
+
+#ifdef CONFIG_SPU_FS_64K_LS
+   /* Disable support for 64K huge pages when 64K SPU local store
+* support is enabled as the current implementation conflicts.
+*/
+   if (shift == PAGE_SHIFT_64K)
+   return -EINVAL;
+#endif /* CONFIG_SPU_FS_64K_LS */
+
+   BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
+
+   /* Return if huge page size has already been setup */
+   if (size_to_hstate(size))
+   return 0;
+
+   hugetlb_add_hstate(shift - PAGE_SHIFT);
+
+   return 0;
 }
 
 static int __init hugepage_setup_sz(char *str)
 {

[5/6] Split hash MMU specific hugepage code into a new file

2009-10-26 Thread David Gibson
This patch separates the parts of hugetlbpage.c which are inherently
specific to the hash MMU into a new hugetlbpage-hash64.c file.

Signed-off-by: David Gibson d...@au1.ibm.com

---
 arch/powerpc/include/asm/hugetlb.h   |3 
 arch/powerpc/mm/Makefile |5 -
 arch/powerpc/mm/hugetlbpage-hash64.c |  167 ++
 arch/powerpc/mm/hugetlbpage.c|  168 ---
 4 files changed, 176 insertions(+), 167 deletions(-)

Index: working-2.6/arch/powerpc/mm/Makefile
===
--- working-2.6.orig/arch/powerpc/mm/Makefile   2009-10-27 15:07:38.0 
+1100
+++ working-2.6/arch/powerpc/mm/Makefile2009-10-27 15:08:09.0 
+1100
@@ -28,7 +28,10 @@ obj-$(CONFIG_44x)+= 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)+= fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)+= slice.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
+obj-y  += hugetlbpage.o
+obj-$(CONFIG_PPC_STD_MMU_64)   += hugetlbpage-hash64.o
+endif
 obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)  += highmem.o
Index: working-2.6/arch/powerpc/mm/hugetlbpage-hash64.c
===
--- /dev/null   1970-01-01 00:00:00.0 +
+++ working-2.6/arch/powerpc/mm/hugetlbpage-hash64.c2009-10-27 
15:08:09.0 +1100
@@ -0,0 +1,167 @@
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth rohit.s...@intel.com
+ */
+
+#include linux/mm.h
+#include linux/hugetlb.h
+#include asm/pgtable.h
+#include asm/pgalloc.h
+#include asm/cacheflush.h
+#include asm/machdep.h
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+   pte_t pte, int trap, unsigned long sz)
+{
+   struct page *page;
+   int i;
+
+   if (!pfn_valid(pte_pfn(pte)))
+   return rflags;
+
+   page = pte_page(pte);
+
+   /* page is dirty */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {
+			for (i = 0; i < (sz / PAGE_SIZE); i++)
+				__flush_dcache_icache(page_address(page+i));
+			set_bit(PG_arch_1, &page->flags);
+   } else {
+   rflags |= HPTE_R_N;
+   }
+   }
+   return rflags;
+}
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+pte_t *ptep, unsigned long trap, int local, int ssize,
+unsigned int shift, unsigned int mmu_psize)
+{
+   unsigned long old_pte, new_pte;
+   unsigned long va, rflags, pa, sz;
+   long slot;
+   int err = 1;
+
+   BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+   /* Search the Linux page table for a match with va */
+   va = hpt_va(ea, vsid, ssize);
+
+   /*
+* Check the user's access rights to the page.  If access should be
+* prevented then send the problem up to do_page_fault.
+*/
+	if (unlikely(access & ~pte_val(*ptep)))
+   goto out;
+   /*
+* At this point, we have a pte (old_pte) which can be used to build
+* or update an HPTE. There are 2 cases:
+*
+* 1. There is a valid (present) pte with no associated HPTE (this is
+*  the most common case)
+* 2. There is a valid (present) pte with an associated HPTE. The
+*  current values of the pp bits in the HPTE prevent access
+*  because we are doing software DIRTY bit management and the
+*  page is currently not DIRTY.
+*/
+
+
+   do {
+   old_pte = pte_val(*ptep);
+		if (old_pte & _PAGE_BUSY)
+   goto out;
+   new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
+   } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+old_pte, new_pte));
+
+	rflags = 0x2 | (!(new_pte & _PAGE_RW));
+	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	sz = ((1UL) << shift);
+   if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+   /* No CPU has hugepages but lacks no execute, so we
+* don't need to worry about that case */
+   rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+  trap, sz);
+
+   /* Check if pte already has an hpte 

[6/6] Bring hugepage PTE accessor functions back into sync with normal accessors

2009-10-26 Thread David Gibson
The hugepage arch code provides a number of hook functions/macros
which mirror the functionality of various normal page pte access
functions.  Various changes in the normal page accessors (in
particular BenH's recent changes to the handling of lazy icache
flushing and PAGE_EXEC) have caused the hugepage versions to get out
of sync with the originals.  In some cases, this is a bug, at least on
some MMU types.

One of the reasons that some hooks were not identical to the normal
page versions is that the fact that we're dealing with a hugepage needed
to be passed down to use the correct dcache-icache flush function.
This patch makes the main flush_dcache_icache_page() function hugepage
aware (by checking for the PageCompound flag).  That in turn means we
can make set_huge_pte_at() just a call to set_pte_at() bringing it
back into sync.  As a bonus, this lets us remove the
hash_huge_page_do_lazy_icache() function, replacing it with a call to
the hash_page_do_lazy_icache() function it was based on.

Some other hugepage pte access hooks - huge_ptep_get_and_clear() and
huge_ptep_clear_flush() - are not so easily unified, but this patch at
least brings them back into sync with the current versions of the
corresponding normal page functions.
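
The core of the flush change can be sketched roughly as follows (a
sketch of the idea only, not the patch text; flush_dcache_icache_hugepage()
is the hugepage-aware variant this patch declares in hugetlb.h):

void flush_dcache_icache_page(struct page *page)
{
#ifdef CONFIG_HUGETLB_PAGE
	if (PageCompound(page)) {	/* hugepages are compound pages */
		flush_dcache_icache_hugepage(page);
		return;
	}
#endif
	/* ... existing single-page flush path, unchanged ... */
}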

Signed-off-by: David Gibson d...@au1.ibm.com

---
 arch/powerpc/include/asm/hugetlb.h|   25 +++--
 arch/powerpc/include/asm/mmu-hash64.h |1 +
 arch/powerpc/mm/hash_utils_64.c   |2 +-
 arch/powerpc/mm/hugetlbpage-hash64.c  |   30 +-
 arch/powerpc/mm/hugetlbpage.c |   31 ++-
 arch/powerpc/mm/mem.c |   17 +
 6 files changed, 45 insertions(+), 61 deletions(-)

Index: working-2.6/arch/powerpc/include/asm/hugetlb.h
===
--- working-2.6.orig/arch/powerpc/include/asm/hugetlb.h 2009-10-27 
14:50:34.0 +1100
+++ working-2.6/arch/powerpc/include/asm/hugetlb.h  2009-10-27 
14:56:31.0 +1100
@@ -6,6 +6,8 @@
 pte_t *huge_pte_offset_and_shift(struct mm_struct *mm,
 unsigned long addr, unsigned *shift);
 
+void flush_dcache_icache_hugepage(struct page *page);
+
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
   unsigned long len);
 
@@ -13,12 +15,6 @@ void hugetlb_free_pgd_range(struct mmu_g
unsigned long end, unsigned long floor,
unsigned long ceiling);
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-pte_t *ptep, pte_t pte);
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep);
-
 /*
  * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs
  * to override the version in mm/hugetlb.c
@@ -44,9 +40,26 @@ static inline void hugetlb_prefault_arch
 {
 }
 
+
+static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+  pte_t *ptep, pte_t pte)
+{
+   set_pte_at(mm, addr, ptep, pte);
+}
+
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+   unsigned long addr, pte_t *ptep)
+{
+   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
+   return __pte(old);
+}
+
 static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
 unsigned long addr, pte_t *ptep)
 {
+   pte_t pte;
+	pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
+   flush_tlb_page(vma, addr);
 }
 
 static inline int huge_pte_none(pte_t pte)
Index: working-2.6/arch/powerpc/include/asm/mmu-hash64.h
===
--- working-2.6.orig/arch/powerpc/include/asm/mmu-hash64.h  2009-10-27 
14:36:36.0 +1100
+++ working-2.6/arch/powerpc/include/asm/mmu-hash64.h   2009-10-27 
14:55:22.0 +1100
@@ -245,6 +245,7 @@ extern int __hash_page_64K(unsigned long
   unsigned long vsid, pte_t *ptep, unsigned long trap,
   unsigned int local, int ssize);
 struct mm_struct;
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap);
 extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap);
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
		     pte_t *ptep, unsigned long trap, int local, int ssize,
Index: working-2.6/arch/powerpc/mm/hash_utils_64.c
===
--- working-2.6.orig/arch/powerpc/mm/hash_utils_64.c2009-10-27 
14:42:47.0 +1100
+++ working-2.6/arch/powerpc/mm/hash_utils_64.c 2009-10-27 14:55:22.0 
+1100
@@ -775,7 +775,7 @@ unsigned int hash_page_do_lazy_icache(un
/* page is dirty */
if (!test_bit(PG_arch_1, 

[PATCH v3] powerpc/ppc64: Use preempt_schedule_irq instead of preempt_schedule

2009-10-26 Thread Benjamin Herrenschmidt

 So I _think_ that the irqs on/off accounting for lockdep isn't quite
 right. What do you think of this slightly modified version ? I've only
 done a quick boot test on a G5 with lockdep enabled and played a bit,
 nothing shows up so far but it's definitely not conclusive.
 
 The main difference is that I call trace_hardirqs_off to advertise
 the fact that we are soft-disabling (it could be a dup, but at this
 stage this is no big deal; it's not always a dup though: on syscall return
 the kernel thinks we have interrupts enabled and could thus get out
 of sync without it).
 
 I also mark the PACA hard disable to reflect the MSR:EE state before
 calling into preempt_schedule_irq().

Alright, second thought :-)

It's probably simpler to just keep hardirqs off. The code is smaller and
simpler, and the scheduler will re-enable them soon enough anyway.

This version of the patch also spaces the code a bit and adds comments
which makes them (the code and the patch) more readable.

Cheers,
Ben.
 
From: Benjamin Herrenschmidt b...@kernel.crashing.org

[PATCH v3] powerpc/ppc64: Use preempt_schedule_irq instead of preempt_schedule

Based on an original patch by Valentine Barshak vbars...@ru.mvista.com

Use preempt_schedule_irq to prevent infinite irq-entry and
eventual stack overflow problems with fast-paced IRQ sources.

This kind of problem has been observed on the PASemi Electra IDE
controller. We have to make sure we are soft-disabled before calling
preempt_schedule_irq and hard disable interrupts after that
to avoid unrecoverable exceptions.

This patch also moves the clrrdi r9,r1,THREAD_SHIFT out of
the #ifdef CONFIG_PPC_BOOK3E scope, since r9 is clobbered
and has to be restored in both cases.
---
 arch/powerpc/kernel/entry_64.S |   41 ---
 1 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index f9fd54b..9763267 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -658,42 +658,43 @@ do_work:
cmpdi   r0,0
crandc  eq,cr1*4+eq,eq
bne restore
-   /* here we are preempting the current task */
-1:
-#ifdef CONFIG_TRACE_IRQFLAGS
-   bl  .trace_hardirqs_on
-   /* Note: we just clobbered r10 which used to contain the previous
-* MSR before the hard-disabling done by the caller of do_work.
-* We don't have that value anymore, but it doesn't matter as
-* we will hard-enable unconditionally, we can just reload the
-* current MSR into r10
+
+   /* Here we are preempting the current task.
+*
+* Ensure interrupts are soft-disabled. We also properly mark
+* the PACA to reflect the fact that they are hard-disabled
+* and trace the change
 */
-   mfmsr   r10
-#endif /* CONFIG_TRACE_IRQFLAGS */
-   li  r0,1
+   li  r0,0
stb r0,PACASOFTIRQEN(r13)
stb r0,PACAHARDIRQEN(r13)
+   TRACE_DISABLE_INTS
+
+   /* Call the scheduler with soft IRQs off */
+1: bl  .preempt_schedule_irq
+
+   /* Hard-disable interrupts again (and update PACA) */
 #ifdef CONFIG_PPC_BOOK3E
-   wrteei  1
-   bl  .preempt_schedule
wrteei  0
 #else
-   ori r10,r10,MSR_EE
-   mtmsrd  r10,1   /* reenable interrupts */
-   bl  .preempt_schedule
mfmsr   r10
-   clrrdi  r9,r1,THREAD_SHIFT
-   rldicl  r10,r10,48,1/* disable interrupts again */
+   rldicl  r10,r10,48,1
rotldi  r10,r10,16
mtmsrd  r10,1
 #endif /* CONFIG_PPC_BOOK3E */
+   li  r0,0
+   stb r0,PACAHARDIRQEN(r13)
+
+   /* Re-test flags and eventually loop */
+   clrrdi  r9,r1,THREAD_SHIFT
ld  r4,TI_FLAGS(r9)
andi.   r0,r4,_TIF_NEED_RESCHED
bne 1b
b   restore
 
 user_work:
-#endif
+#endif /* CONFIG_PREEMPT */
+
/* Enable interrupts */
 #ifdef CONFIG_PPC_BOOK3E
wrteei  1
-- 
1.6.1.2.14.gf26b5


