Re: How to mount rootfs though harddisk when booting?
VFS: Cannot open root device sda3 or unknown-block(0,0) # CONFIG_EXT3_FS is not set Please add the ext2 ext3 filesystem support. On Sun, 25 Oct 2009 11:43:20 +0800 wilbur.chan wilbur...@gmail.com wrote: Sorry, the config I just post is somewhat confusing, here I post my config again.Thx # # Automatically generated make config: don't edit # Linux kernel version: 2.6.21.7-EMBSYS-CGEL-3.06.10.P2.F0.B4 # Sun Oct 25 11:24:25 2009 # # CONFIG_PPC64 is not set CONFIG_PPC32=y CONFIG_PPC_MERGE=y CONFIG_MMU=y CONFIG_GENERIC_TIME=y CONFIG_GENERIC_HARDIRQS=y CONFIG_IRQ_PER_CPU=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_ARCH_HAS_ILOG2_U32=y CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_PPC=y CONFIG_EARLY_PRINTK=y CONFIG_GENERIC_NVRAM=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_PPC_OF=y CONFIG_PPC_UDBG_16550=y CONFIG_GENERIC_TBSYNC=y CONFIG_AUDIT_ARCH=y CONFIG_GENERIC_BUG=y CONFIG_DEFAULT_UIMAGE=y # # Processor support # # CONFIG_CLASSIC32 is not set # CONFIG_PPC_512x is not set # CONFIG_PPC_82xx is not set # CONFIG_PPC_83xx is not set CONFIG_PPC_85xx=y # CONFIG_PPC_86xx is not set # CONFIG_PPC_8xx is not set # CONFIG_40x is not set # CONFIG_44x is not set # CONFIG_E200 is not set CONFIG_85xx=y CONFIG_E500=y # CONFIG_PPC_DCR_NATIVE is not set # CONFIG_PPC_DCR_MMIO is not set CONFIG_BOOKE=y CONFIG_FSL_BOOKE=y # CONFIG_PHYS_64BIT is not set CONFIG_SPE=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_DEFCONFIG_LIST=/lib/modules/$UNAME_RELEASE/.config # # Code maturity level options # CONFIG_EXPERIMENTAL=y CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup # CONFIG_LOCALVERSION= CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y # CONFIG_IPC_NS is not set CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set # CONFIG_UTS_NS is not set CONFIG_AUDIT=y # CONFIG_AUDITSYSCALL is not set # CONFIG_WRS_FCHECK 
is not set # CONFIG_EVLOG is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_CONTAINER_DEBUG is not set # CONFIG_CPUSETS is not set # CONFIG_RSS_CONTAINER is not set CONFIG_FAIR_GROUP_SCHED=y CONFIG_FAIR_USER_SCHED=y # CONFIG_FAIR_CGROUP_SCHED is not set CONFIG_SYSFS_DEPRECATED=y # CONFIG_CONTAINER_CPUACCT is not set # CONFIG_RELAY is not set CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE= CONFIG_HWTIMER_HOOKS=y # CONFIG_HWTIMER_TEST is not set # CONFIG_HIGH_PRIO_OOMKILL is not set # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_EMBEDDED=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y CONFIG_SLAB=y CONFIG_VM_EVENT_COUNTERS=y # CONFIG_ALWAYS_RESTART is not set CONFIG_SIGEXIT=y CONFIG_MEMMON=y # CONFIG_MEMMON_SWAP_SUPPORT is not set CONFIG_NOTIFY_SETTIME=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set # # Loadable module support # CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_KMOD=y CONFIG_STOP_MACHINE=y # # Block layer # CONFIG_BLOCK=y CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_LSF is not set # # IO Schedulers # CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_AS=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y # CONFIG_DEFAULT_AS is not set # CONFIG_DEFAULT_DEADLINE is not set CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED=cfq # CONFIG_WANT_EARLY_SERIAL is not set # # Platform support # # CONFIG_MPC8540_ADS is not set # CONFIG_MPC8560_ADS is not set # CONFIG_MPC85xx_CDS is not set # CONFIG_MPC85xx_MDS is not set # CONFIG_MPC8572_PC is not set CONFIG_MPC85xx_DS=y CONFIG_P2020=y CONFIG_MPC85xx=y CONFIG_PPC_INDIRECT_PCI_BE=y CONFIG_MPIC=y # # Kernel options # CONFIG_HIGHMEM=y # CONFIG_HZ_100 is not set 
CONFIG_HZ_250=y # CONFIG_HZ_300 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=250 # CONFIG_GENERIC_CLOCKEVENTS is not set # CONFIG_TICK_ONESHOT is not set # CONFIG_PREEMPT_NONE is not set # CONFIG_PREEMPT_VOLUNTARY is not set CONFIG_PREEMPT=y # CONFIG_REAL_PREEMPT is not set # CONFIG_PREEMPT_SOFTIRQS is not set # CONFIG_PREEMPT_HARDIRQS is not set CONFIG_PREEMPT_BKL=y CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set CONFIG_MATH_EMULATION=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y # CONFIG_KEXEC is not set # CONFIG_IRQ_ALL_CPUS is not set CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_ARCH_POPULATES_NODE_MAP=y CONFIG_SELECT_MEMORY_MODEL=y CONFIG_FLATMEM_MANUAL=y # CONFIG_DISCONTIGMEM_MANUAL is not set # CONFIG_SPARSEMEM_MANUAL is not set CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y #
Re: [v9 PATCH 4/9]: x86: refactor x86 idle power management code and remove all instances of pm_idle.
* Pavel Machek pa...@ucw.cz [2009-10-23 18:07:11]: On Fri 2009-10-16 15:13:08, Arun R Bharadwaj wrote: * Arun R Bharadwaj a...@linux.vnet.ibm.com [2009-10-16 15:08:50]: This patch cleans up x86 of all instances of pm_idle. pm_idle which was earlier called from cpu_idle() idle loop is replaced by cpuidle_idle_call. x86 also registers to cpuidle when the idle routine is selected, by populating the cpuidle_device data structure for each cpu. This is replicated for apm module and for xen, which also used pm_idle. Signed-off-by: Arun R Bharadwaj a...@linux.vnet.ibm.com --- arch/x86/kernel/apm_32.c | 55 - arch/x86/kernel/process.c | 90 -- arch/x86/kernel/process_32.c |3 - arch/x86/kernel/process_64.c |3 - arch/x86/xen/setup.c | 40 ++ drivers/acpi/processor_core.c |9 ++-- drivers/cpuidle/cpuidle.c | 16 +-- 7 files changed, 182 insertions(+), 34 deletions(-) ... +static int local_idle_loop(struct cpuidle_device *dev, struct cpuidle_state *st) +{ + ktime_t t1, t2; + s64 diff; + int ret; + + t1 = ktime_get(); + local_idle(); + t2 = ktime_get(); + + diff = ktime_to_us(ktime_sub(t2, t1)); + if (diff INT_MAX) + diff = INT_MAX; + ret = (int) diff; + + return ret; +} So we get this routine essentially 3 times. Is there no way to share the code? We can move this code to a common place, but that would mean exporting the idle function pointer to be called from within this routine, which is exactly what we wanted to avoid. Any suggestions are welcome. arun -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [v9 PATCH 4/9]: x86: refactor x86 idle power management code and remove all instances of pm_idle.
+static int local_idle_loop(struct cpuidle_device *dev, struct cpuidle_state *st) +{ + ktime_t t1, t2; + s64 diff; + int ret; + + t1 = ktime_get(); + local_idle(); + t2 = ktime_get(); + + diff = ktime_to_us(ktime_sub(t2, t1)); + if (diff > INT_MAX) + diff = INT_MAX; + ret = (int) diff; + + return ret; +} So we get this routine essentially 3 times. Is there no way to share the code? We can move this code to a common place, but that would mean exporting the idle function pointer to be called from within this routine, which is exactly what we wanted to avoid. Any suggestions are welcome. You can just pass idle routine as a parameter...? int common_idle_loop(struct cpuidle_device *dev, struct cpuidle_state *st, void *idle(void)) ...? Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [v9 PATCH 4/9]: x86: refactor x86 idle power management code and remove all instances of pm_idle.
* Pavel Machek pa...@ucw.cz [2009-10-26 08:58:31]: +static int local_idle_loop(struct cpuidle_device *dev, struct cpuidle_state *st) +{ + ktime_t t1, t2; + s64 diff; + int ret; + + t1 = ktime_get(); + local_idle(); + t2 = ktime_get(); + + diff = ktime_to_us(ktime_sub(t2, t1)); + if (diff INT_MAX) + diff = INT_MAX; + ret = (int) diff; + + return ret; +} So we get this routine essentially 3 times. Is there no way to share the code? We can move this code to a common place, but that would mean exporting the idle function pointer to be called from within this routine, which is exactly what we wanted to avoid. Any suggestions are welcome. You can just pass idle routine as a parameter...? int common_idle_loop(struct cpuidle_device *dev, struct cpuidle_state *st, void *idle(void)) ...? Pavel Yes, this should be fine. I was trying to avoid passing the void function pointer around but i guess this reduces considerable code size. thanks! arun -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Jumbo Frame bug in ibm_newemac driver (was Jumbo Frames, sil24 SATA driver, and kswapd0 page allocation failures)
Okay, I need to revisit this issue. I have had my time taken away for other things the past couple of months, but I am now back at this network issue. Here is what I have done: 1. I modified the ibm_newemac driver to follow scatter-gather chains on the RX path. The idea was to setup the driver to only ever deal with single pages. The MAL in the PPC only supports data transfers of up to 4080 bytes (less than a single page), so it appears that the hardware should support single page chains. I set this up just like the e1000 driver. For whatever reason, this did not work. It is probably because I do not fully understand the Linux network stack yet (as is apparent in the next iteration). 2. I reverted to the original driver and found that, contrary to what I had thought earlier, the driver does allocate a ring of skbs for use in the driver. However, when a jumbo packet is received (larger than 4080 bytes) it uses the skb that was pre-allocated for the jumbo packet and allocates a new skb to replace the one in the ring. This is where the problem is - in that new allocation to replace the one in the stack. So, to remedy this, I pre-allocated the same number of jumbo skbs for the sole purpose of being used as new skbs for the rx ring. Here is some code that shows the idea: Statuc int emaC_open(struct net_device *ndev) { ... /* Allocate RX ring */ for (i = 0; i NUM_RX_BUFF; ++i) { if (emac_alloc_rx_skb(dev, i, GFP_KERNEL)) { printk(KERN_ERR %s: failed to allocate RX ring\n, ndev-name); goto oom; } } ... 
} static inline int emac_alloc_rx_skb2(struct emac_instance *dev, int slot, gfp_t flags) { struct sk_buff *skb = dev-rx_skb_pool[slot]; if (unlikely(!skb)) return -ENOMEM; if(skb_recycle_check(skb, emac_rx_skb_size(dev-rx_skb_size))) { dev-rx_skb[slot] = skb; dev-rx_desc[slot].data_len = 0; skb_reserve(skb, EMAC_RX_SKB_HEADROOM + 2); dev-rx_desc[slot].data_ptr = dma_map_single(dev-ofdev-dev, skb-data - 2, dev-rx_sync_size, DMA_FROM_DEVICE) + 2; wmb(); dev-rx_desc[slot].ctrl = MAL_RX_CTRL_EMPTY | (slot == (NUM_RX_BUFF - 1) ? MAL_RX_CTRL_WRAP : 0); return 0; } else { printk(KERN_NOTICE EMAC: SKB not recycleable\n); return -ENOMEM; } } Static int emac_poll_rx(void *param, int budget) { ... sg: if (ctrl MAL_RX_CTRL_FIRST) { BUG_ON(dev-rx_sg_skb); if (unlikely(emac_alloc_rx_skb2(dev, slot, GFP_ATOMIC))) { DBG(dev, rx OOM %d NL, slot); ++dev-estats.rx_dropped_oom; emac_recycle_rx_skb(dev, slot, 0); } else { dev-rx_sg_skb = skb; emac_recycle_rx_skb(dev,slot,len); skb_put(skb, len); } } else if (!emac_rx_sg_append(dev, slot) (ctrl MAL_RX_CTRL_LAST)) { skb = dev-rx_sg_skb; dev-rx_sg_skb = NULL; ctrl = EMAC_BAD_RX_MASK; if (unlikely(ctrl ctrl != EMAC_RX_TAH_BAD_CSUM)) { emac_parse_rx_error(dev, ctrl); ++dev-estats.rx_dropped_error; dev_kfree_skb(skb); len = 0; } else { /* printk(KERN_NOTICE EMAC: pushing sg packet\n);*/ goto push_packet; } } goto skip; ... } The changes are the allocation of the rx_skb_pool in emac_open(), the function call emac_alloc_rx_skb2() in emac_poll_rx(), and the modifications to emac_alloc_skb to create emac_alloc_rx_skb2. Also, corresponding allocations for rx_skb_pool are found in emac_resize_rx_ring() for when we need to resize the pool. Now the problem that I am having is this - the first time through the ring, things work just fine. But the second time through the loop, the buffers are not cleaned out - they still think they contain data. 
I have tried calling skb_recycle_check() to restore the skb to a new state, however that call fails because apparently the skb cannot be reused for receive. Why is that the case? What am I missing? It seems like I am missing something that allows the skb to be reused? I will admit, I am not a Linux network driver expert, though I am learning. If
INIT: PANIC: segmentation violation! sleeping for 30 seconds.
Hi, I just put an upstream kernel(rc5) on a specific machine I have (Power5), and I got the following error: INIT: PANIC: segmentation violation! sleeping for 30 seconds. init has generated signal 11 but has no handler for it init used greatest stack depth: 6240 bytes left Kernel panic - not syncing: Attempted to kill init! Call Trace: [c001c6e7f920] [c0012588] .show_stack+0x6c/0x194 (unreliable) [c001c6e7f9d0] [c0088bd4] .panic+0x74/0x1c0 [c001c6e7fa60] [c008cbdc] .do_exit+0x43c/0x82c [c001c6e7fb20] [c00286f4] ._exception+0x1d4/0x204 [c001c6e7fcf0] [c04e7dc8] .do_page_fault+0x4fc/0x634 [c001c6e7fe30] [c000560c] handle_page_fault+0x20/0x74 Downgrading to rc2 shows the same result. Interestingly enough, this is the only machine that fails with the upstream kernel. Has anyone seen anything similar? Thanks ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Network Stack SKB Reallocation
Quick question about the network stack in general: Does the stack itself release an SKB allocated by the device driver back to the heap upstream, or does it require that the device driver handle that? Thanks! Jonathan ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: Network Stack SKB Reallocation
So, in my case, I allocate a bunch of skb's that I want to be able to reuse during network operation (256 in fact). When I pass it up the stack, the stack will free that skb back to the system making any further use of it invalid until I call alloc_skb() again? Thanks. On Monday 26 October 2009 19:43:00 Jonathan Haws wrote: Quick question about the network stack in general: Does the stack itself release an SKB allocated by the device driver back to the heap upstream, or does it require that the device driver handle that? There's the concept of passing responsibilities for the frames between the networking layers. So the driver passes the frame and all responsibilities to the networking stack. So if the networking stack accepts the packet in the first place, it needs to free it (or pass it to somebody else to take care of). -- Greetings, Michael. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: Network Stack SKB Reallocation
On Monday 26 October 2009 19:43:00 Jonathan Haws wrote: Quick question about the network stack in general: Does the stack itself release an SKB allocated by the device driver back to the heap upstream, or does it require that the device driver handle that? There's the concept of passing responsibilities for the frames between the networking layers. So the driver passes the frame and all responsibilities to the networking stack. So if the networking stack accepts the packet in the first place, it needs to free it (or pass it to somebody else to take care of). -- Greetings, Michael. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/8] Fix 8xx MMU/TLB
Probably better to walk the kernel page table too. Does this make a difference(needs the tophys() patch I posted earlier): This whole thing would be a -lot- easier to do from C code. Why ? Simply because you could just use get_user() to load the instruction rather than doing this page table walking in asm, which is simpler, faster, and more fool proof (ok, you do pay the price of a kernel entry/exit instead, but I still believe that code simplicity and maintainability wins here). Ben. From 862dda30c3d3d3bedcc605e8520626408a26891c Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund joakim.tjernl...@transmode.se Date: Sat, 17 Oct 2009 13:54:03 +0200 Subject: [PATCH] 8xx: Walk the page table for kernel addresses too. --- arch/powerpc/kernel/head_8xx.S | 25 - 1 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 0e91da4..edc9e9b 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -532,28 +532,27 @@ DARFixed:/* Return from dcbx instruction bug workaround, r10 holds value of DAR * by decoding the registers used by the dcbx instruction and adding them. * DAR is set to the calculated address and r10 also holds the EA on exit. */ -#define NO_SELF_MODIFYING_CODE /* define if you don't want to use self modifying code */ - nop /* A few nops to make the modified_instr: space below cache line aligned */ - nop -139: /* fetch instruction from userspace memory */ + /* define if you don't want to use self modifying code */ +#define NO_SELF_MODIFYING_CODE +FixupDAR:/* Entry point for dcbx workaround. */ + /* fetch instruction from memory. 
*/ + mfspr r10, SPRN_SRR0 DO_8xx_CPU6(0x3780, r3) mtspr SPRN_MD_EPN, r10 mfspr r11, SPRN_M_TWB /* Get level 1 table entry address */ - lwz r11, 0(r11) /* Get the level 1 entry */ + cmplwi cr0, r11, 0x0800 + blt- 3f/* Branch if user space */ + lis r11, swapper_pg_...@h + ori r11, r11, swapper_pg_...@l + rlwimi r11, r11, 0, 2, 19 +3: lwz r11, 0(r11) /* Get the level 1 entry */ DO_8xx_CPU6(0x3b80, r3) mtspr SPRN_MD_TWC, r11 /* Load pte table base address */ mfspr r11, SPRN_MD_TWC /* and get the pte address */ lwz r11, 0(r11) /* Get the pte */ /* concat physical page address(r11) and page offset(r10) */ rlwimi r11, r10, 0, 20, 31 - b 140f -FixupDAR:/* Entry point for dcbx workaround. */ - /* fetch instruction from memory. */ - mfspr r10, SPRN_SRR0 - andis. r11, r10, 0x8000 - tophys (r11, r10) - beq- 139b /* Branch if user space address */ -140: lwz r11,0(r11) + lwz r11,0(r11) /* Check if it really is a dcbx instruction. */ /* dcbt and dcbtst does not generate DTLB Misses/Errors, * no need to include them here */ -- 1.6.4.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/8] Fix 8xx MMU/TLB
On Oct 26, 2009, at 3:47 PM, Benjamin Herrenschmidt wrote: This whole thing would be a -lot- easier to do from C code. Why ? Simply because you could just use get_user() to load the instruction rather than doing this page table walking in asm, Just be careful the get_user() doesn't regenerate the same translation error you are trying to fix by being here.. It is nice doing things in C code, but you have to be aware of the environment and the side effects when in this kind of exception state. Thanks. -- Dan ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] [RFC] PowerPC64: Use preempt_schedule_irq instead of preempt_schedule when returning from exceptions
On Mon, 2009-10-19 at 22:28 +0400, Valentine Barshak wrote: Use preempt_schedule_irq to prevent infinite irq-entry and eventual stack overflow problems with fast-paced IRQ sources. This kind of problems has been observed on the PASemi Electra IDE controller. We have to make sure we are soft-disabled before calling preempt_schedule_irq and hard disable interrupts after that to avoid unrecoverable exceptions. This patch also moves the clrrdi r9,r1,THREAD_SHIFT out of the #ifdef CONFIG_PPC_BOOK3E scope, since r9 is clobbered and has to be restored in both cases. So I _think_ that the irqs on/off accounting for lockdep isn't quite right. What do you think of this slightly modified version ? I've only done a quick boot test on a G5 with lockdep enabled and a played a bit, nothing shows up so far but it's definitely not conclusive. The main difference is that I call trace_hardirqs_off to advertise the fact that we are soft-disabling (it could be a dup, but at this stage this is no big deal, but it's not always, like in syscall return the kernel thinks we have interrupts enabled and could thus get out of sync without that). I also mark the PACA hard disable to reflect the MSR:EE state before calling into preempt_schedule_irq(). --- [PATCH v2] powerpc: Use preempt_schedule_irq instead of preempt_schedule when returning from exceptions Use preempt_schedule_irq to prevent infinite irq-entry and eventual stack overflow problems with fast-paced IRQ sources. This kind of problems has been observed on the PASemi Electra IDE controller. We have to make sure we are soft-disabled before calling preempt_schedule_irq and hard disable interrupts after that to avoid unrecoverable exceptions. This patch also moves the clrrdi r9,r1,THREAD_SHIFT out of the #ifdef CONFIG_PPC_BOOK3E scope, since r9 is clobbered and has to be restored in both cases. 
Signed-off-by: Valentine Barshak vbars...@ru.mvista.com Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org --- arch/powerpc/kernel/entry_64.S | 38 +- 1 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index f9fd54b..b64ae3d 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -659,34 +659,38 @@ do_work: crandc eq,cr1*4+eq,eq bne restore /* here we are preempting the current task */ -1: + /* ensure we are soft-disabled +* */ + li r0,0 + stb r0,PACASOFTIRQEN(r13) + /* Trace the IRQ state change */ #ifdef CONFIG_TRACE_IRQFLAGS - bl .trace_hardirqs_on - /* Note: we just clobbered r10 which used to contain the previous -* MSR before the hard-disabling done by the caller of do_work. -* We don't have that value anymore, but it doesn't matter as -* we will hard-enable unconditionally, we can just reload the -* current MSR into r10 -*/ + bl .trace_hardirqs_off +#endif +1: /* And make sure we are hard-enabled */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else mfmsr r10 -#endif /* CONFIG_TRACE_IRQFLAGS */ + ori r10,r10,MSR_EE + mtmsrd r10,1 +#endif li r0,1 - stb r0,PACASOFTIRQEN(r13) stb r0,PACAHARDIRQEN(r13) + /* Call the scheduler with soft IRQs off */ + bl .preempt_schedule_irq + /* hard-disable interrupts again */ #ifdef CONFIG_PPC_BOOK3E - wrteei 1 - bl .preempt_schedule wrteei 0 #else - ori r10,r10,MSR_EE - mtmsrd r10,1 /* reenable interrupts */ - bl .preempt_schedule mfmsr r10 - clrrdi r9,r1,THREAD_SHIFT - rldicl r10,r10,48,1/* disable interrupts again */ + rldicl r10,r10,48,1 rotldi r10,r10,16 mtmsrd r10,1 #endif /* CONFIG_PPC_BOOK3E */ + li r0,0 + stb r0,PACAHARDIRQEN(r13) + clrrdi r9,r1,THREAD_SHIFT ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_NEED_RESCHED bne 1b -- 1.6.1.2.14.gf26b5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/8] Fix 8xx MMU/TLB
On Mon, 2009-10-26 at 16:26 -0700, Dan Malek wrote: Just be careful the get_user() doesn't regenerate the same translation error you are trying to fix by being here.. It shouldn't since it will always come up with a proper DAR but you may want to double check before hand that your instruction address you are loading from is -not- your marker value for bad DAR. It is nice doing things in C code, but you have to be aware of the environment and the side effects when in this kind Yup. Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [2/6] Cleanup management of kmem_caches for pagetables
On Fri, 2009-10-16 at 16:22 +1100, David Gibson wrote: Minor nits... if you can respin today I should push it out to -next +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) +{ + char *name; + unsigned long table_size = sizeof(void *) shift; + unsigned long align = table_size; This is a bit thick.. could use some air. Just separate the definitions from the assignments so you can make the code breath a bit :-) Also the above warrants a comment explaining that this won't work for PTE pages since sizeof(PTE) = sizeof(void *) and the day we finally move out of pte page == struct page, the code here will have to be adapted. + /* When batching pgtable pointers for RCU freeing, we store + * the index size in the low bits. Table alignment must be + * big enough to fit it */ + unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1; + struct kmem_cache *new; + + /* It would be nice if this was a BUILD_BUG_ON(), but at the + * moment, gcc doesn't seem to recognize is_power_of_2 as a + * constant expression, so so much for that. 
*/ + BUG_ON(!is_power_of_2(minalign)); + BUG_ON((shift 1) || (shift MAX_PGTABLE_INDEX_SIZE)); + + if (PGT_CACHE(shift)) + return; /* Already have a cache of this size */ Blank line here too + align = max_t(unsigned long, align, minalign); + name = kasprintf(GFP_KERNEL, pgtable-2^%d, shift); + new = kmem_cache_create(name, table_size, align, 0, ctor); + PGT_CACHE(shift) = new; And here + pr_debug(Allocated pgtable cache for order %d\n, shift); +} + void pgtable_cache_init(void) { - pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); - pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); + pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) + panic(Couldn't allocate pgtable caches); + BUG_ON(PUD_INDEX_SIZE !PGT_CACHE(PUD_INDEX_SIZE)); } panic vs. BUG_ON() ... could be a bit more consistent. #ifdef CONFIG_SPARSEMEM_VMEMMAP Index: working-2.6/arch/powerpc/include/asm/pgalloc-64.h === --- working-2.6.orig/arch/powerpc/include/asm/pgalloc-64.h2009-10-16 12:53:45.0 +1100 +++ working-2.6/arch/powerpc/include/asm/pgalloc-64.h 2009-10-16 12:53:51.0 +1100 @@ -11,27 +11,30 @@ #include linux/cpumask.h #include linux/percpu.h +/* + * This needs to be big enough to allow any pagetable sizes we need, + * but small enough to fit in the low bits of any page table pointer. + * In other words all pagetables, even tiny ones, must be aligned to + * allow at least enough low 0 bits to contain this value. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf This also has the constraint of being a (power of 2) - 1... worth mentioning somewhere ? Also if you could comment somewhere that index size == 0 means a PTE page ? Not totally obvious at first. Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [3/6] Allow more flexible layouts for hugepage pagetables
On Fri, 2009-10-16 at 16:22 +1100, David Gibson wrote: So far haven't seen anything blatantly wrong, in fact, this patch results in some nice cleanups. One thing tho... -#ifdef CONFIG_HUGETLB_PAGE - /* Handle hugepage regions */ - if (HPAGE_SHIFT mmu_huge_psizes[psize]) { - DBG_LOW( - huge page !\n); - return hash_huge_page(mm, access, ea, vsid, local, trap); - } -#endif /* CONFIG_HUGETLB_PAGE */ - #ifndef CONFIG_PPC_64K_PAGES /* If we use 4K pages and our psize is not 4K, then we are hitting * a special driver mapping, we need to align the address before @@ -961,12 +954,18 @@ int hash_page(unsigned long ea, unsigned #endif /* CONFIG_PPC_64K_PAGES */ You basically made the above code be run with huge pages. This may not be what you want ... It will result in cropping the low EA bits probably at a stage where you don't want that (it might also be a non-issue, I just want you to double check :-) I suppose one option would be to remove that alignment and duplicate the PTEs when creating those special mappings (afaik the only user is spufs using 64K pages to map the local store) Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 10/16] percpu: make percpu symbols in powerpc unique
On Wed, 2009-10-14 at 15:01 +0900, Tejun Heo wrote: This patch updates percpu related symbols in powerpc such that percpu symbols are unique and don't clash with local symbols. This serves two purposes of decreasing the possibility of global percpu symbol collision and allowing dropping per_cpu__ prefix from percpu symbols. * arch/powerpc/kernel/perf_callchain.c: s/callchain/cpu_perf_callchain/ * arch/powerpc/kernel/setup-common.c: s/pvr/cpu_pvr/ * arch/powerpc/platforms/pseries/dtl.c: s/dtl/cpu_dtl/ * arch/powerpc/platforms/cell/interrupt.c: s/iic/cpu_iic/ Partly based on Rusty Russell's alloc_percpu: rename percpu vars which cause name clashes patch. Signed-off-by: Tejun Heo t...@kernel.org Acked-by: Arnd Bergmann a...@arndb.de Cc: Rusty Russell ru...@rustcorp.com.au Acked-by: Benjamin Herrenschmidt b...@kernel.crashing.org Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v4 4/4] pseries: Serialize cpu hotplug operations during deactivate Vs deallocate
On Fri, 2009-10-09 at 14:01 +0530, Gautham R Shenoy wrote: Currently the cpu-allocation/deallocation process comprises of two steps: - Set the indicators and to update the device tree with DLPAR node information. - Online/offline the allocated/deallocated CPU. This is achieved by writing to the sysfs tunables probe during allocation and release during deallocation. At the sametime, the userspace can independently online/offline the CPUs of the system using the sysfs tunable online. It is quite possible that when a userspace tool offlines a CPU for the purpose of deallocation and is in the process of updating the device tree, some other userspace tool could bring the CPU back online by writing to the online sysfs tunable thereby causing the deallocate process to fail. The solution to this is to serialize writes to the probe/release sysfs tunable with the writes to the online sysfs tunable. This patch employs a mutex to provide this serialization, which is a no-op on all architectures except PPC_PSERIES Signed-off-by: Gautham R Shenoy e...@in.ibm.com Peter, did you get a chance to review this one ? Cheers, Ben. --- arch/powerpc/platforms/pseries/dlpar.c | 26 ++ drivers/base/cpu.c |2 ++ include/linux/cpu.h| 13 + 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 9752386..fc261e6 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -644,6 +644,18 @@ static ssize_t memory_release_store(struct class *class, const char *buf, return rc ? 
-1 : count; } +static DEFINE_MUTEX(pseries_cpu_hotplug_mutex); + +void cpu_hotplug_driver_lock() +{ + mutex_lock(pseries_cpu_hotplug_mutex); +} + +void cpu_hotplug_driver_unlock() +{ + mutex_unlock(pseries_cpu_hotplug_mutex); +} + static ssize_t cpu_probe_store(struct class *class, const char *buf, size_t count) { @@ -656,14 +668,15 @@ static ssize_t cpu_probe_store(struct class *class, const char *buf, if (rc) return -EINVAL; + cpu_hotplug_driver_lock(); rc = acquire_drc(drc_index); if (rc) - return rc; + goto out; dn = configure_connector(drc_index); if (!dn) { release_drc(drc_index); - return rc; + goto out; } /* fixup dn name */ @@ -672,7 +685,8 @@ static ssize_t cpu_probe_store(struct class *class, const char *buf, if (!cpu_name) { free_cc_nodes(dn); release_drc(drc_index); - return -ENOMEM; + rc = -ENOMEM; + goto out; } sprintf(cpu_name, /cpus/%s, dn-full_name); @@ -684,6 +698,8 @@ static ssize_t cpu_probe_store(struct class *class, const char *buf, release_drc(drc_index); rc = online_node_cpus(dn); +out: + cpu_hotplug_driver_unlock(); return rc ? rc : count; } @@ -705,6 +721,7 @@ static ssize_t cpu_release_store(struct class *class, const char *buf, return -EINVAL; } + cpu_hotplug_driver_lock(); rc = offline_node_cpus(dn); if (rc) @@ -713,7 +730,7 @@ static ssize_t cpu_release_store(struct class *class, const char *buf, rc = release_drc(*drc_index); if (rc) { of_node_put(dn); - return rc; + goto out; } rc = remove_device_tree_nodes(dn); @@ -723,6 +740,7 @@ static ssize_t cpu_release_store(struct class *class, const char *buf, of_node_put(dn); out: + cpu_hotplug_driver_unlock(); return rc ? 
rc : count; } diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index e62a4cc..07c3f05 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -35,6 +35,7 @@ static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribut struct cpu *cpu = container_of(dev, struct cpu, sysdev); ssize_t ret; + cpu_hotplug_driver_lock(); switch (buf[0]) { case '0': ret = cpu_down(cpu-sysdev.id); @@ -49,6 +50,7 @@ static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribut default: ret = -EINVAL; } + cpu_hotplug_driver_unlock(); if (ret = 0) ret = count; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 4753619..b0ad4e1 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -115,6 +115,19 @@ extern void put_online_cpus(void); #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) int cpu_down(unsigned int cpu); +#ifdef CONFIG_PPC_PSERIES +extern void cpu_hotplug_driver_lock(void); +extern void
Is there a patch for MPC8548 XOR?
I want to use its XOR engine to compute raid5 parity, but I can't find this function in 2.6.30 downloaded from kernel.org; does someone know if there is a patch? -- The simplest is not all best but the best is surely the simplest! ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [2/6] Cleanup management of kmem_caches for pagetables
On Tue, Oct 27, 2009 at 01:28:19PM +1100, Benjamin Herrenschmidt wrote: On Fri, 2009-10-16 at 16:22 +1100, David Gibson wrote: Minor nits... if you can respin today I should push it out to -next +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) +{ + char *name; + unsigned long table_size = sizeof(void *) shift; + unsigned long align = table_size; This is a bit thick.. could use some air. Just separate the definitions from the assignments so you can make the code breath a bit :-) Ok. Also the above warrants a comment explaining that this won't work for PTE pages since sizeof(PTE) = sizeof(void *) and the day we finally move out of pte page == struct page, the code here will have to be adapted. Ok. [snip] void pgtable_cache_init(void) { - pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); - pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); + pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) + panic(Couldn't allocate pgtable caches); + BUG_ON(PUD_INDEX_SIZE !PGT_CACHE(PUD_INDEX_SIZE)); } panic vs. BUG_ON() ... could be a bit more consistent. Uh.. there is actually a rationale for the difference here. The panic() is due to a a runtime error - couldn't allocate the caches - which isn't necessarily a kernel bug (could be hardware error, or ludicrously short on memory). The trick is that allocating the PGD and PMD caches is supposed to also create the PUD cache, because the PUD index size is always the same as either the PGD or PUD cache. If that's not true, we've broken the assumptions the code is based on, hence BUG(). 
#ifdef CONFIG_SPARSEMEM_VMEMMAP Index: working-2.6/arch/powerpc/include/asm/pgalloc-64.h === --- working-2.6.orig/arch/powerpc/include/asm/pgalloc-64.h 2009-10-16 12:53:45.0 +1100 +++ working-2.6/arch/powerpc/include/asm/pgalloc-64.h 2009-10-16 12:53:51.0 +1100 @@ -11,27 +11,30 @@ #include linux/cpumask.h #include linux/percpu.h +/* + * This needs to be big enough to allow any pagetable sizes we need, + * but small enough to fit in the low bits of any page table pointer. + * In other words all pagetables, even tiny ones, must be aligned to + * allow at least enough low 0 bits to contain this value. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf This also has the constraint of being a (power of 2) - 1... worth mentioning somewhere ? Also if you could comment somewhere that index size == 0 means a PTE page ? Not totally obvious at first. Ok, I've expanded on this comment. -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [2/6] Cleanup management of kmem_caches for pagetables
On Tue, 2009-10-27 at 14:46 +1100, David Gibson wrote: The trick is that allocating the PGD and PMD caches is supposed to also create the PUD cache, because the PUD index size is always the same as either the PGD or PUD cache. If that's not true, we've broken the assumptions the code is based on, hence BUG(). Ok, so maybe a little comment with the above explanation concerning the PUD index size being the same as the PGD or PMD one would be useful :-) Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [3/6] Allow more flexible layouts for hugepage pagetables
On Tue, Oct 27, 2009 at 02:10:59PM +1100, Benjamin Herrenschmidt wrote: On Fri, 2009-10-16 at 16:22 +1100, David Gibson wrote: So far haven't seen anything blatantly wrong, in fact, this patch results in some nice cleanups. One thing tho... -#ifdef CONFIG_HUGETLB_PAGE - /* Handle hugepage regions */ - if (HPAGE_SHIFT mmu_huge_psizes[psize]) { - DBG_LOW( - huge page !\n); - return hash_huge_page(mm, access, ea, vsid, local, trap); - } -#endif /* CONFIG_HUGETLB_PAGE */ - #ifndef CONFIG_PPC_64K_PAGES /* If we use 4K pages and our psize is not 4K, then we are hitting * a special driver mapping, we need to align the address before @@ -961,12 +954,18 @@ int hash_page(unsigned long ea, unsigned #endif /* CONFIG_PPC_64K_PAGES */ You basically made the above code be run with huge pages. This may not be what you want ... It will result in cropping the low EA bits probably at a stage where you don't want that (it might also be a non-issue, I just want you to double check :-) Ok, I've done that, and adjusted the comment accordingly. -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 6/6] powerpc: Export powerpc_debugfs_root
Kernel modules should be able to place their debug output inside our powerpc debugfs directory. Signed-off-by: Anton Blanchard an...@samba.org --- Index: linux.trees.git/arch/powerpc/kernel/setup-common.c === --- linux.trees.git.orig/arch/powerpc/kernel/setup-common.c 2009-10-27 12:59:00.0 +1100 +++ linux.trees.git/arch/powerpc/kernel/setup-common.c 2009-10-27 12:59:15.0 +1100 @@ -660,6 +660,7 @@ late_initcall(check_cache_coherency); #ifdef CONFIG_DEBUG_FS struct dentry *powerpc_debugfs_root; +EXPORT_SYMBOL(powerpc_debugfs_root); static int powerpc_debugfs_init(void) { ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/6] powerpc: tracing: Add hypervisor call tracepoints
Add hcall_entry and hcall_exit tracepoints. This replaces the inline assembly HCALL_STATS code and converts it to use the new tracepoints. To keep the disabled case as quick as possible, we embed a status word in the TOC so we can get at it with a single load. By doing so we keep the overhead at a minimum. Time taken for a null hcall: No tracepoint code: 135.79 cycles Disabled tracepoints: 137.95 cycles For reference, before this patch enabling HCALL_STATS resulted in a null hcall of 201.44 cycles! Signed-off-by: Anton Blanchard an...@samba.org --- Index: linux.trees.git/arch/powerpc/platforms/pseries/hvCall.S === --- linux.trees.git.orig/arch/powerpc/platforms/pseries/hvCall.S 2009-10-27 13:36:05.0 +1100 +++ linux.trees.git/arch/powerpc/platforms/pseries/hvCall.S 2009-10-27 14:53:21.0 +1100 @@ -14,20 +14,54 @@ #define STK_PARM(i) (48 + ((i)-3)*8) -#ifdef CONFIG_HCALL_STATS +#ifdef CONFIG_TRACEPOINTS + + .section.toc,aw + + .globl hcall_tracepoint_refcount +hcall_tracepoint_refcount: + .llong 0 + + .section.text + /* * precall must preserve all registers. use unused STK_PARM() - * areas to save snapshots and opcode. + * areas to save snapshots and opcode. We branch around this + * in early init (eg when populating the MMU hashtable) by using an + * unconditional cpu feature. 
*/ #define HCALL_INST_PRECALL \ - std r3,STK_PARM(r3)(r1);/* save opcode */ \ - mftbr0; /* get timebase and */ \ - std r0,STK_PARM(r5)(r1);/* save for later */\ BEGIN_FTR_SECTION; \ - mfspr r0,SPRN_PURR; /* get PURR and */ \ - std r0,STK_PARM(r6)(r1);/* save for later */\ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); - + b 1f; \ +END_FTR_SECTION(0, 1); \ + ld r12,hcall_tracepoint_refco...@toc(r2); \ + cmpdi r12,0; \ + beq+1f; \ + mflrr0; \ + std r3,STK_PARM(r3)(r1);\ + std r4,STK_PARM(r4)(r1);\ + std r5,STK_PARM(r5)(r1);\ + std r6,STK_PARM(r6)(r1);\ + std r7,STK_PARM(r7)(r1);\ + std r8,STK_PARM(r8)(r1);\ + std r9,STK_PARM(r9)(r1);\ + std r10,STK_PARM(r10)(r1); \ + std r0,16(r1); \ + stdur1,-STACK_FRAME_OVERHEAD(r1); \ + bl .__trace_hcall_entry; \ + addir1,r1,STACK_FRAME_OVERHEAD; \ + ld r0,16(r1); \ + ld r3,STK_PARM(r3)(r1);\ + ld r4,STK_PARM(r4)(r1);\ + ld r5,STK_PARM(r5)(r1);\ + ld r6,STK_PARM(r6)(r1);\ + ld r7,STK_PARM(r7)(r1);\ + ld r8,STK_PARM(r8)(r1);\ + ld r9,STK_PARM(r9)(r1);\ + ld r10,STK_PARM(r10)(r1); \ + mtlrr0; \ +1: + /* * postcall is performed immediately before function return which * allows liberal use of volatile registers. We branch around this @@ -38,40 +72,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_PURR); BEGIN_FTR_SECTION; \ b 1f; \ END_FTR_SECTION(0, 1); \ - ld r4,STK_PARM(r3)(r1);/* validate opcode */ \ - cmpldi cr7,r4,MAX_HCALL_OPCODE;\ - bgt-cr7,1f; \ - \ - /* get time and PURR snapshots after hcall */ \ - mftbr7; /* timebase after */\ -BEGIN_FTR_SECTION; \ - mfspr r8,SPRN_PURR; /* PURR after */\ - ld r6,STK_PARM(r6)(r1);/* PURR before */ \ - subfr6,r6,r8; /* delta */ \ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); \ - ld r5,STK_PARM(r5)(r1);/*
[PATCH 1/6] powerpc: tracing: Add powerpc tracepoints for interrupt entry and exit
This patch adds powerpc specific tracepoints for interrupt entry and exit. While we already have generic irq_handler_entry and irq_handler_exit tracepoints there are cases on our virtualised powerpc machines where an interrupt is presented to the OS, but subsequently handled by the hypervisor. This means no OS interrupt handler is invoked. Here is an example on a POWER6 machine with the patch below applied: idle-0 [006] 3243.949840744: irq_entry: pt_regs=c000ce31fb10 idle-0 [006] 3243.949850520: irq_exit: pt_regs=c000ce31fb10 idle-0 [007] 3243.950218208: irq_entry: pt_regs=c000ce323b10 idle-0 [007] 3243.950224080: irq_exit: pt_regs=c000ce323b10 idle-0 [000] 3244.021879320: irq_entry: pt_regs=c0a63aa0 idle-0 [000] 3244.021883616: irq_handler_entry: irq=87 handler=eth0 idle-0 [000] 3244.021887328: irq_handler_exit: irq=87 return=handled idle-0 [000] 3244.021897408: irq_exit: pt_regs=c0a63aa0 Here we see two phantom interrupts (no handler was invoked), followed by a real interrupt for eth0. Without the tracepoints in this patch we would have missed the phantom interrupts. Signed-off-by: Anton Blanchard an...@samba.org Acked-by: Steven Rostedt rost...@goodmis.org -- No change to this patch. 
Index: linux.trees.git/arch/powerpc/include/asm/trace.h === --- /dev/null 1970-01-01 00:00:00.0 + +++ linux.trees.git/arch/powerpc/include/asm/trace.h2009-10-17 08:45:08.0 +1100 @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM powerpc + +#if !defined(_TRACE_POWERPC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_POWERPC_H + +#include linux/tracepoint.h + +struct pt_regs; + +TRACE_EVENT(irq_entry, + + TP_PROTO(struct pt_regs *regs), + + TP_ARGS(regs), + + TP_STRUCT__entry( + __field(struct pt_regs *, regs) + ), + + TP_fast_assign( + __entry-regs = regs; + ), + + TP_printk(pt_regs=%p, __entry-regs) +); + +TRACE_EVENT(irq_exit, + + TP_PROTO(struct pt_regs *regs), + + TP_ARGS(regs), + + TP_STRUCT__entry( + __field(struct pt_regs *, regs) + ), + + TP_fast_assign( + __entry-regs = regs; + ), + + TP_printk(pt_regs=%p, __entry-regs) +); + +#endif /* _TRACE_POWERPC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE + +#define TRACE_INCLUDE_PATH asm +#define TRACE_INCLUDE_FILE trace + +#include trace/define_trace.h Index: linux.trees.git/arch/powerpc/kernel/irq.c === --- linux.trees.git.orig/arch/powerpc/kernel/irq.c 2009-10-17 08:44:32.0 +1100 +++ linux.trees.git/arch/powerpc/kernel/irq.c 2009-10-17 08:45:44.0 +1100 @@ -70,6 +70,8 @@ #include asm/firmware.h #include asm/lv1call.h #endif +#define CREATE_TRACE_POINTS +#include asm/trace.h int __irq_offset_value; static int ppc_spurious_interrupts; @@ -325,6 +327,8 @@ void do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); unsigned int irq; + trace_irq_entry(regs); + irq_enter(); check_stack_overflow(); @@ -348,6 +352,8 @@ void do_IRQ(struct pt_regs *regs) timer_interrupt(regs); } #endif + + trace_irq_exit(regs); } void __init init_IRQ(void) ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 5/6] powerpc: Disable HCALL_STATS by default
The overhead of HCALL_STATS is quite high and the functionality is very rarely used. Key statistics are also missing (eg min/max). With the new hcall tracepoints much more powerful tracing can be done in a kernel module. Lets disable this by default. Signed-off-by: Anton Blanchard an...@samba.org --- Index: linux.trees.git/arch/powerpc/configs/pseries_defconfig === --- linux.trees.git.orig/arch/powerpc/configs/pseries_defconfig 2009-10-27 14:56:58.0 +1100 +++ linux.trees.git/arch/powerpc/configs/pseries_defconfig 2009-10-27 14:57:11.0 +1100 @@ -1683,7 +1683,7 @@ CONFIG_HAVE_ARCH_KGDB=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_STACK_USAGE is not set # CONFIG_DEBUG_PAGEALLOC is not set -CONFIG_HCALL_STATS=y +# CONFIG_HCALL_STATS is not set # CONFIG_CODE_PATCHING_SELFTEST is not set # CONFIG_FTR_FIXUP_SELFTEST is not set # CONFIG_MSI_BITMAP_SELFTEST is not set ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/6] powerpc: tracing: Add powerpc tracepoints for timer entry and exit
We can monitor the effectiveness of our power management of both the kernel and hypervisor by probing the timer interrupt. For example, on this box we see 10.37s timer interrupts on an idle core: idle-0 [010] 3900.671297: timer_interrupt_entry: pt_regs=c000ce1e7b10 idle-0 [010] 3900.671302: timer_interrupt_exit: pt_regs=c000ce1e7b10 idle-0 [010] 3911.042963: timer_interrupt_entry: pt_regs=c000ce1e7b10 idle-0 [010] 3911.042968: timer_interrupt_exit: pt_regs=c000ce1e7b10 idle-0 [010] 3921.414630: timer_interrupt_entry: pt_regs=c000ce1e7b10 idle-0 [010] 3921.414635: timer_interrupt_exit: pt_regs=c000ce1e7b10 Since we have a 207MHz decrementer it will go negative and fire every 10.37s even if Linux is completely idle. Signed-off-by: Anton Blanchard an...@samba.org --- Index: linux.trees.git/arch/powerpc/kernel/time.c === --- linux.trees.git.orig/arch/powerpc/kernel/time.c 2009-10-07 17:21:21.0 +1100 +++ linux.trees.git/arch/powerpc/kernel/time.c 2009-10-07 17:21:52.0 +1100 @@ -54,6 +54,7 @@ #include linux/irq.h #include linux/delay.h #include linux/perf_event.h +#include asm/trace.h #include asm/io.h #include asm/processor.h @@ -571,6 +572,8 @@ void timer_interrupt(struct pt_regs * re struct clock_event_device *evt = decrementer-event; u64 now; + trace_timer_interrupt_entry(regs); + /* Ensure a positive value is written to the decrementer, or else * some CPUs will continuue to take decrementer exceptions */ set_dec(DECREMENTER_MAX); @@ -590,6 +593,7 @@ void timer_interrupt(struct pt_regs * re now = decrementer-next_tb - now; if (now = DECREMENTER_MAX) set_dec((int)now); + trace_timer_interrupt_exit(regs); return; } old_regs = set_irq_regs(regs); @@ -620,6 +624,8 @@ void timer_interrupt(struct pt_regs * re irq_exit(); set_irq_regs(old_regs); + + trace_timer_interrupt_exit(regs); } void wakeup_decrementer(void) Index: linux.trees.git/arch/powerpc/include/asm/trace.h === --- linux.trees.git.orig/arch/powerpc/include/asm/trace.h 2009-10-07 17:22:25.0 +1100 +++ 
linux.trees.git/arch/powerpc/include/asm/trace.h2009-10-07 17:23:20.0 +1100 @@ -42,6 +42,40 @@ TRACE_EVENT(irq_exit, TP_printk(pt_regs=%p, __entry-regs) ); +TRACE_EVENT(timer_interrupt_entry, + + TP_PROTO(struct pt_regs *regs), + + TP_ARGS(regs), + + TP_STRUCT__entry( + __field(struct pt_regs *, regs) + ), + + TP_fast_assign( + __entry-regs = regs; + ), + + TP_printk(pt_regs=%p, __entry-regs) +); + +TRACE_EVENT(timer_interrupt_exit, + + TP_PROTO(struct pt_regs *regs), + + TP_ARGS(regs), + + TP_STRUCT__entry( + __field(struct pt_regs *, regs) + ), + + TP_fast_assign( + __entry-regs = regs; + ), + + TP_printk(pt_regs=%p, __entry-regs) +); + #endif /* _TRACE_POWERPC_H */ #undef TRACE_INCLUDE_PATH ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 4/6] powerpc: tracing: Give hypervisor call tracepoints access to arguments
While most users of the hcall tracepoints will only want the opcode and return code, some will want all the arguments. To avoid the complexity of using varargs we pass a pointer to the register save area which contain all arguments. Signed-off-by: Anton Blanchard an...@samba.org --- Index: linux.trees.git/arch/powerpc/platforms/pseries/hvCall.S === --- linux.trees.git.orig/arch/powerpc/platforms/pseries/hvCall.S 2009-10-27 14:29:09.0 +1100 +++ linux.trees.git/arch/powerpc/platforms/pseries/hvCall.S 2009-10-27 14:29:16.0 +1100 @@ -30,7 +30,7 @@ hcall_tracepoint_refcount: * in early init (eg when populating the MMU hashtable) by using an * unconditional cpu feature. */ -#define HCALL_INST_PRECALL \ +#define HCALL_INST_PRECALL(FIRST_REG) \ BEGIN_FTR_SECTION; \ b 1f; \ END_FTR_SECTION(0, 1); \ @@ -47,6 +47,7 @@ END_FTR_SECTION(0, 1); \ std r9,STK_PARM(r9)(r1);\ std r10,STK_PARM(r10)(r1); \ std r0,16(r1); \ + addir4,r1,STK_PARM(FIRST_REG); \ stdur1,-STACK_FRAME_OVERHEAD(r1); \ bl .__trace_hcall_entry; \ addir1,r1,STACK_FRAME_OVERHEAD; \ @@ -68,7 +69,7 @@ END_FTR_SECTION(0, 1); \ * in early init (eg when populating the MMU hashtable) by using an * unconditional cpu feature. 
*/ -#define HCALL_INST_POSTCALL\ +#define __HCALL_INST_POSTCALL \ BEGIN_FTR_SECTION; \ b 1f; \ END_FTR_SECTION(0, 1); \ @@ -88,9 +89,19 @@ END_FTR_SECTION(0, 1); \ ld r3,STK_PARM(r3)(r1);\ mtlrr0; \ 1: + +#define HCALL_INST_POSTCALL_NORETS \ + li r5,0; \ + __HCALL_INST_POSTCALL + +#define HCALL_INST_POSTCALL(BUFREG)\ + mr r5,BUFREG; \ + __HCALL_INST_POSTCALL + #else -#define HCALL_INST_PRECALL -#define HCALL_INST_POSTCALL +#define HCALL_INST_PRECALL(FIRST_ARG) +#define HCALL_INST_POSTCALL_NORETS +#define HCALL_INST_POSTCALL(BUFREG) #endif .text @@ -101,11 +112,11 @@ _GLOBAL(plpar_hcall_norets) mfcrr0 stw r0,8(r1) - HCALL_INST_PRECALL + HCALL_INST_PRECALL(r4) HVSC/* invoke the hypervisor */ - HCALL_INST_POSTCALL + HCALL_INST_POSTCALL_NORETS lwz r0,8(r1) mtcrf 0xff,r0 @@ -117,7 +128,7 @@ _GLOBAL(plpar_hcall) mfcrr0 stw r0,8(r1) - HCALL_INST_PRECALL + HCALL_INST_PRECALL(r5) std r4,STK_PARM(r4)(r1) /* Save ret buffer */ @@ -136,7 +147,7 @@ _GLOBAL(plpar_hcall) std r6, 16(r12) std r7, 24(r12) - HCALL_INST_POSTCALL + HCALL_INST_POSTCALL(r12) lwz r0,8(r1) mtcrf 0xff,r0 @@ -183,7 +194,7 @@ _GLOBAL(plpar_hcall9) mfcrr0 stw r0,8(r1) - HCALL_INST_PRECALL + HCALL_INST_PRECALL(r5) std r4,STK_PARM(r4)(r1) /* Save ret buffer */ @@ -211,7 +222,7 @@ _GLOBAL(plpar_hcall9) std r11,56(r12) std r0, 64(r12) - HCALL_INST_POSTCALL + HCALL_INST_POSTCALL(r12) lwz r0,8(r1) mtcrf 0xff,r0 Index: linux.trees.git/arch/powerpc/include/asm/trace.h === --- linux.trees.git.orig/arch/powerpc/include/asm/trace.h 2009-10-27 14:28:15.0 +1100 +++ linux.trees.git/arch/powerpc/include/asm/trace.h2009-10-27 14:29:16.0 +1100 @@ -81,9 +81,9 @@ extern void hcall_tracepoint_unregfunc(v TRACE_EVENT_FN(hcall_entry, - TP_PROTO(unsigned long opcode), + TP_PROTO(unsigned long opcode, unsigned long *args), - TP_ARGS(opcode), + TP_ARGS(opcode, args), TP_STRUCT__entry( __field(unsigned long, opcode) @@ -100,9 +100,10 @@ TRACE_EVENT_FN(hcall_entry, TRACE_EVENT_FN(hcall_exit, - TP_PROTO(unsigned long opcode, unsigned 
long retval), + TP_PROTO(unsigned long opcode, unsigned long retval, +
hypervisor call trace module
Here is an example of using the hcall tracepoints. This kernel module provides strace like functionality for hypervisor hcalls: - 0x64(ff02, 1, 2, d34d7a71, f, c0a6f388, 1, c0989008, c0a3f480) - 0x64() Which was an EOI (opcode 0x64) of 0xff02 There are a number of drivers that carry a lot of hcall related debug code just in case we have to chase down a bug. I'm hoping hcall tracepoints could replace it all and allow for much more powerful debugging. Anton obj-m := hcall_trace.o KDIR := /lib/modules/$(shell uname -r)/build PWD := $(shell pwd) default: $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules clean: rm -rf *.mod.c *.ko *.o .*.cmd .tmp_versions Module.markers modules.order Module.symvers /* * Hypervisor hcall trace * * Look for output in /sys/kernel/debug/powerpc/hcall_trace/ * * Copyright (C) 2009 Anton Blanchard an...@au.ibm.com, IBM * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ #include linux/module.h #include linux/debugfs.h #include linux/relay.h #include asm/trace.h #define SUBBUF_SIZE 131072 #define N_SUBBUFS 8 #define BUFLEN 512 static struct rchan *log_chan; static void probe_hcall_entry(unsigned long opcode, unsigned long *args) { char buf[BUFLEN]; /* Don't log H_CEDE */ if (opcode == H_CEDE) return; snprintf(buf, BUFLEN, - 0x%lx(%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx)\n, opcode, *args, *(args+1), *(args+2), *(args+3), *(args+4), *(args+5), *(args+6), *(args+7), *(args+8)); relay_write(log_chan, buf, strlen(buf)); } static void probe_hcall_exit(unsigned long opcode, unsigned long retval, unsigned long *retbuf) { char buf[BUFLEN]; /* Don't log H_CEDE */ if (opcode == H_CEDE) return; if (retbuf) snprintf(buf, BUFLEN, - 0x%lx(%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx)\n, opcode, *retbuf, *(retbuf+1), *(retbuf+2), *(retbuf+3), *(retbuf+4), *(retbuf+5), *(retbuf+6), *(retbuf+7), *(retbuf+8)); else sprintf(buf, - 0x%lx()\n, opcode); relay_write(log_chan, buf, strlen(buf)); } static struct dentry *create_buf_file_handler(const char *filename, struct dentry *parent, int mode, struct rchan_buf *buf, int *is_global) { return debugfs_create_file(filename, mode, parent, buf, relay_file_operations); } static int remove_buf_file_handler(struct dentry *dentry) { debugfs_remove(dentry); return 0; } static int subbuf_start(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, size_t prev_padding) { return 1; } static struct rchan_callbacks relay_callbacks = { .create_buf_file = create_buf_file_handler, .remove_buf_file = remove_buf_file_handler, .subbuf_start = subbuf_start, }; static struct dentry *debugfs_root; static int __init hcall_trace_init(void) { debugfs_root = debugfs_create_dir(hcall_trace, powerpc_debugfs_root); if (debugfs_root == ERR_PTR(-ENODEV)) { printk(KERN_ERR Debugfs not configured\n); goto err_out; } if (!debugfs_root) { printk(KERN_ERR Could not create debugfs directory\n); goto err_out; } log_chan = 
relay_open(cpu, debugfs_root, SUBBUF_SIZE, N_SUBBUFS, relay_callbacks, NULL); if (!log_chan) { printk(KERN_ERR relay_open failed\n); goto err_relay_open; } if (register_trace_hcall_entry(probe_hcall_entry)) { printk(KERN_ERR probe_hcall_entry probe point failed\n); goto err_probe_hcall_entry; } if (register_trace_hcall_exit(probe_hcall_exit)) { printk(KERN_ERR probe_hcall_exit probe point failed\n); goto err_probe_hcall_exit; } return 0; err_probe_hcall_exit: unregister_trace_hcall_entry(probe_hcall_entry); err_probe_hcall_entry: relay_close(log_chan); err_relay_open: debugfs_remove(debugfs_root); err_out: return -ENODEV; } static void __exit hcall_trace_exit(void) { unregister_trace_hcall_exit(probe_hcall_exit); unregister_trace_hcall_entry(probe_hcall_entry); relay_close(log_chan); debugfs_remove(debugfs_root); } module_init(hcall_trace_init) module_exit(hcall_trace_exit) MODULE_LICENSE(GPL); MODULE_AUTHOR(Anton Blanchard); ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[0/6] Assorted hugepage cleanups (v4)
Currently, ordinary pages use one pagetable layout, and each different hugepage size uses a slightly different variant layout. A number of places which need to walk the pagetable must first check the slice map to see what the pagetable layout is, then handle the various different forms. New hardware, like Book3E, is liable to introduce more possible variants. This patch series, therefore, is designed to simplify the matter by limiting knowledge of the pagetable layout to only the allocation path. With this patch, ordinary pages are handled as ever, with a fixed 4 (or 3) level tree. All other variants branch off from some layer of that with a specially marked PGD/PUD/PMD pointer which also contains enough information to interpret the directories below that point. This means that things walking the pagetables (without allocating) don't need to look up the slice map, they can just step down the tree in the usual way, branching off to the non-standard layout path for hugepages, which uses the embedded information to interpret the tree from that point on. This reduces the source size in a number of places, and means that newer variants on the pagetable layout to handle new hardware and new features will need to alter the existing code in fewer places. In addition we split out the hash / classic MMU specific code into a separate hugetlbpage-hash64.c file. This will make adding support for other MMUs (like 440 and/or Book3E) easier. I've used the libhugetlbfs testsuite to test these patches on a Power5+ machine, but they could certainly do with more testing. In particular, I don't have any suitable hardware to test 16G pages. V2: Made the tweaks that BenH suggested to patch 2 of the original series. Some corresponding tweaks in patch 3 to match. V3: Fix a bug in the creation of the pgtable caches. Slightly extend the initialization cleanup. Add a new patch cleaning up the hugepage pte accessor functions. 
V4: Revisions based on BenH's comments, fix compile breakage for !CONFIG_HUGETLB_PAGE. -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[2/6] Cleanup management of kmem_caches for pagetables
Currently we have a fair bit of rather fiddly code to manage the various kmem_caches used to store page tables of various levels. We generally have two caches holding some combination of PGD, PUD and PMD tables, plus several more for the special hugepage pagetables. This patch cleans this all up by taking a different approach. Rather than the caches being designated as for PUDs or for hugeptes for 16M pages, the caches are simply allocated to be a specific size. Thus sharing of caches between different types/levels of pagetables happens naturally. The pagetable size, where needed, is passed around encoded in the same way as {PGD,PUD,PMD}_INDEX_SIZE; that is n where the pagetable contains 2^n pointers. Signed-off-by: David Gibson d...@au1.ibm.com --- arch/powerpc/include/asm/pgalloc-64.h| 60 +++--- arch/powerpc/include/asm/pgalloc.h | 30 + arch/powerpc/include/asm/pgtable-ppc64.h |1 arch/powerpc/mm/hugetlbpage.c| 45 +-- arch/powerpc/mm/init_64.c| 70 +-- arch/powerpc/mm/pgtable.c| 25 +++ 6 files changed, 117 insertions(+), 114 deletions(-) Index: working-2.6/arch/powerpc/mm/init_64.c === --- working-2.6.orig/arch/powerpc/mm/init_64.c 2009-10-27 15:30:17.0 +1100 +++ working-2.6/arch/powerpc/mm/init_64.c 2009-10-27 15:37:04.0 +1100 @@ -119,30 +119,58 @@ static void pmd_ctor(void *addr) memset(addr, 0, PMD_TABLE_SIZE); } -static const unsigned int pgtable_cache_size[2] = { - PGD_TABLE_SIZE, PMD_TABLE_SIZE -}; -static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { -#ifdef CONFIG_PPC_64K_PAGES - pgd_cache, pmd_cache, -#else - pgd_cache, pud_pmd_cache, -#endif /* CONFIG_PPC_64K_PAGES */ -}; - -#ifdef CONFIG_HUGETLB_PAGE -/* Hugepages need an extra cache per hugepagesize, initialized in - * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT - * is not compile time constant. 
*/ -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; -#else -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; -#endif +struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; + +/* + * Create a kmem_cache() for pagetables. This is not used for PTE + * pages - they're linked to struct page, come from the normal free + * pages pool and have a different entry size (see real_pte_t) to + * everything else. Caches created by this function are used for all + * the higher level pagetables, and for hugepage pagetables. + */ +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) +{ + char *name; + unsigned long table_size = sizeof(void *) shift; + unsigned long align = table_size; + + /* When batching pgtable pointers for RCU freeing, we store +* the index size in the low bits. Table alignment must be +* big enough to fit it */ + unsigned long minalign = MAX_PGTABLE_INDEX_SIZE + 1; + struct kmem_cache *new; + + /* It would be nice if this was a BUILD_BUG_ON(), but at the +* moment, gcc doesn't seem to recognize is_power_of_2 as a +* constant expression, so so much for that. 
*/ + BUG_ON(!is_power_of_2(minalign)); + BUG_ON((shift 1) || (shift MAX_PGTABLE_INDEX_SIZE)); + + if (PGT_CACHE(shift)) + return; /* Already have a cache of this size */ + + align = max_t(unsigned long, align, minalign); + name = kasprintf(GFP_KERNEL, pgtable-2^%d, shift); + new = kmem_cache_create(name, table_size, align, 0, ctor); + PGT_CACHE(shift) = new; + + pr_debug(Allocated pgtable cache for order %d\n, shift); +} + void pgtable_cache_init(void) { - pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); - pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); + pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) + panic(Couldn't allocate pgtable caches); + + /* In all current configs, when the PUD index exists it's the +* same size as either the pgd or pmd index. Verify that the +* initialization above has also created a PUD cache. This +* will need re-examiniation if we add new possibilities for +* the pagetable layout. */ + BUG_ON(PUD_INDEX_SIZE !PGT_CACHE(PUD_INDEX_SIZE)); } #ifdef CONFIG_SPARSEMEM_VMEMMAP Index: working-2.6/arch/powerpc/include/asm/pgalloc-64.h === --- working-2.6.orig/arch/powerpc/include/asm/pgalloc-64.h 2009-10-27 15:30:16.0 +1100 +++
[1/6] Make hpte_need_flush() correctly mask for multiple page sizes
Currently, hpte_need_flush() only correctly flushes the given address for normal pages. Callers for hugepages are required to mask the address themselves. But hpte_need_flush() already looks up the page sizes for its own reasons, so this is a rather silly imposition on the callers. This patch alters it to mask based on the pagesize it has looked up itself, and removes the awkward masking code in the hugepage caller. Signed-off-by: David Gibson d...@au1.ibm.com --- arch/powerpc/mm/hugetlbpage.c |6 +- arch/powerpc/mm/tlb_hash64.c |8 +++- 2 files changed, 4 insertions(+), 10 deletions(-) Index: working-2.6/arch/powerpc/mm/tlb_hash64.c === --- working-2.6.orig/arch/powerpc/mm/tlb_hash64.c 2009-09-04 14:35:30.0 +1000 +++ working-2.6/arch/powerpc/mm/tlb_hash64.c2009-09-04 14:36:12.0 +1000 @@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *m i = batch-index; - /* We mask the address for the base page size. Huge pages will -* have applied their own masking already -*/ - addr = PAGE_MASK; - /* Get page size (maybe move back to caller). 
* * NOTE: when using special 64K mappings in 4K environment like @@ -75,6 +70,9 @@ void hpte_need_flush(struct mm_struct *m } else psize = pte_pagesize_index(mm, addr, pte); + /* Mask the address for the correct page size */ + addr = ~((1UL mmu_psize_defs[psize].shift) - 1); + /* Build full vaddr */ if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); Index: working-2.6/arch/powerpc/mm/hugetlbpage.c === --- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c 2009-09-04 14:35:30.0 +1000 +++ working-2.6/arch/powerpc/mm/hugetlbpage.c 2009-09-04 14:36:12.0 +1000 @@ -445,11 +445,7 @@ void set_huge_pte_at(struct mm_struct *m * necessary anymore if we make hpte_need_flush() get the * page size from the slices */ - unsigned int psize = get_slice_psize(mm, addr); - unsigned int shift = mmu_psize_to_shift(psize); - unsigned long sz = ((1UL) shift); - struct hstate *hstate = size_to_hstate(sz); - pte_update(mm, addr hstate-mask, ptep, ~0UL, 1); + pte_update(mm, addr, ptep, ~0UL, 1); } *ptep = __pte(pte_val(pte) ~_PAGE_HPTEFLAGS); } ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[3/6] Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different pagetable layout: that is, the bottem level table of pointers to hugepages is a different size, and may branch off from the normal page tables at a different level. Every hugepage aware path that needs to walk the pagetables must therefore look up the hugepage size from the slice info first, and work out the correct way to walk the pagetables accordingly. Future hardware is likely to add more possible hugepage sizes, more layout options and more mess. This patch, therefore reworks the handling of hugepage pagetables to reduce this complexity. In the new scheme, instead of having to consult the slice mask, pagetable walking code can check a flag in the PGD/PUD/PMD entries to see where to branch off to hugepage pagetables, and the entry also contains the information (eseentially hugepage shift) necessary to then interpret that table without recourse to the slice mask. This scheme can be extended neatly to handle multiple levels of self-describing special hugepage pagetables, although for now we assume only one level exists. This approach means that only the pagetable allocation path needs to know how the pagetables should be set out. All other (hugepage) pagetable walking paths can just interpret the structure as they go. There already was a flag bit in PGD/PUD/PMD entries for hugepage directory pointers, but it was only used for debug. We alter that flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable pointer (normally it would be 1 since the pointer lies in the linear mapping). This means that asm pagetable walking can test for (and punt on) hugepage pointers with the same test that checks for unpopulated page directory entries (beq becomes bge), since hugepage pointers will always be positive, and normal pointers always negative. While we're at it, we get rid of the confusing (and grep defeating) #defining of hugepte_shift to be the same thing as mmu_huge_psizes. 
Signed-off-by: David Gibson d...@au1.ibm.com --- arch/powerpc/include/asm/hugetlb.h |1 arch/powerpc/include/asm/mmu-hash64.h| 14 arch/powerpc/include/asm/page.h | 14 arch/powerpc/include/asm/pgtable-ppc64.h | 13 arch/powerpc/include/asm/pgtable.h |3 arch/powerpc/kernel/perf_callchain.c | 20 - arch/powerpc/mm/gup.c| 149 + arch/powerpc/mm/hash_utils_64.c | 26 - arch/powerpc/mm/hugetlbpage.c| 473 ++- arch/powerpc/mm/init_64.c| 10 10 files changed, 313 insertions(+), 410 deletions(-) Index: working-2.6/arch/powerpc/mm/hugetlbpage.c === --- working-2.6.orig/arch/powerpc/mm/hugetlbpage.c 2009-10-27 15:35:27.0 +1100 +++ working-2.6/arch/powerpc/mm/hugetlbpage.c 2009-10-27 15:37:08.0 +1100 @@ -40,25 +40,11 @@ static unsigned nr_gpages; /* Array of valid huge page sizes - non-zero value(hugepte_shift) is * stored for the huge page sizes that are valid. */ -unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ - -#define hugepte_shift mmu_huge_psizes -#define HUGEPTE_INDEX_SIZE(psize) (mmu_huge_psizes[(psize)]) -#define PTRS_PER_HUGEPTE(psize)(1 mmu_huge_psizes[psize]) - -#define HUGEPD_SHIFT(psize)(mmu_psize_to_shift(psize) \ -+ HUGEPTE_INDEX_SIZE(psize)) -#define HUGEPD_SIZE(psize) (1UL HUGEPD_SHIFT(psize)) -#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1)) +static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() * will choke on pointers to hugepte tables, which is handy for * catching screwups early. 
*/ -#define HUGEPD_OK 0x1 - -typedef struct { unsigned long pd; } hugepd_t; - -#define hugepd_none(hpd) ((hpd).pd == 0) static inline int shift_to_mmu_psize(unsigned int shift) { @@ -82,71 +68,126 @@ static inline unsigned int mmu_psize_to_ BUG(); } +#define hugepd_none(hpd) ((hpd).pd == 0) + static inline pte_t *hugepd_page(hugepd_t hpd) { - BUG_ON(!(hpd.pd HUGEPD_OK)); - return (pte_t *)(hpd.pd ~HUGEPD_OK); + BUG_ON(!hugepd_ok(hpd)); + return (pte_t *)((hpd.pd ~HUGEPD_SHIFT_MASK) | 0xc000); } -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, - struct hstate *hstate) +static inline unsigned int hugepd_shift(hugepd_t hpd) { - unsigned int shift = huge_page_shift(hstate); - int psize = shift_to_mmu_psize(shift); - unsigned long idx = ((addr shift) (PTRS_PER_HUGEPTE(psize)-1)); + return hpd.pd HUGEPD_SHIFT_MASK; +} + +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) +{
[4/6] Cleanup initialization of hugepages on powerpc
This patch simplifies the logic used to initialize hugepages on powerpc. The somewhat oddly named set_huge_psize() is renamed to add_huge_page_size() and now does all necessary verification of whether it's given a valid hugepage sizes (instead of just some) and instantiates the generic hstate structure (but no more). hugetlbpage_init() now steps through the available pagesizes, checks if they're valid for hugepages by calling add_huge_page_size() and initializes the kmem_caches for the hugepage pagetables. This means we can now eliminate the mmu_huge_psizes array, since we no longer need to pass the sizing information for the pagetable caches from set_huge_psize() into hugetlbpage_init() Determination of the default huge page size is also moved from the hash code into the general hugepage code. Signed-off-by: David Gibson d...@au1.ibm.com --- arch/powerpc/include/asm/page_64.h |2 arch/powerpc/mm/hash_utils_64.c| 10 -- arch/powerpc/mm/hugetlbpage.c | 130 + 3 files changed, 64 insertions(+), 78 deletions(-) Index: linux-a2/arch/powerpc/mm/hugetlbpage.c === --- linux-a2.orig/arch/powerpc/mm/hugetlbpage.c 2009-10-15 16:40:49.0 +1100 +++ linux-a2/arch/powerpc/mm/hugetlbpage.c 2009-10-15 16:41:33.0 +1100 @@ -37,27 +37,17 @@ static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; static unsigned nr_gpages; -/* Array of valid huge page sizes - non-zero value(hugepte_shift) is - * stored for the huge page sizes that are valid. - */ -static unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ - /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() * will choke on pointers to hugepte tables, which is handy for * catching screwups early. 
*/ static inline int shift_to_mmu_psize(unsigned int shift) { - switch (shift) { -#ifndef CONFIG_PPC_64K_PAGES - case PAGE_SHIFT_64K: - return MMU_PAGE_64K; -#endif - case PAGE_SHIFT_16M: - return MMU_PAGE_16M; - case PAGE_SHIFT_16G: - return MMU_PAGE_16G; - } + int psize; + + for (psize = 0; psize MMU_PAGE_COUNT; ++psize) + if (mmu_psize_defs[psize].shift == shift) + return psize; return -1; } @@ -502,8 +492,6 @@ unsigned long hugetlb_get_unmapped_area( struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); - if (!mmu_huge_psizes[mmu_psize]) - return -EINVAL; return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); } @@ -666,47 +654,46 @@ repeat: return err; } -static void __init set_huge_psize(int psize) +static int __init add_huge_page_size(unsigned long long size) { - unsigned pdshift; + int shift = __ffs(size); + int mmu_psize; /* Check that it is a page size supported by the hardware and -* that it fits within pagetable limits. */ - if (mmu_psize_defs[psize].shift - mmu_psize_defs[psize].shift SID_SHIFT_1T - (mmu_psize_defs[psize].shift MIN_HUGEPTE_SHIFT || -mmu_psize_defs[psize].shift == PAGE_SHIFT_64K || -mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) { - /* Return if huge page size has already been setup or is the -* same as the base page size. */ - if (mmu_huge_psizes[psize] || - mmu_psize_defs[psize].shift == PAGE_SHIFT) - return; - hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); +* that it fits within pagetable and slice limits. 
*/ + if (!is_power_of_2(size) + || (shift SLICE_HIGH_SHIFT) || (shift = PAGE_SHIFT)) + return -EINVAL; - if (mmu_psize_defs[psize].shift PMD_SHIFT) - pdshift = PMD_SHIFT; - else if (mmu_psize_defs[psize].shift PUD_SHIFT) - pdshift = PUD_SHIFT; - else - pdshift = PGDIR_SHIFT; - mmu_huge_psizes[psize] = pdshift - mmu_psize_defs[psize].shift; - } + if ((mmu_psize = shift_to_mmu_psize(shift)) 0) + return -EINVAL; + +#ifdef CONFIG_SPU_FS_64K_LS + /* Disable support for 64K huge pages when 64K SPU local store +* support is enabled as the current implementation conflicts. +*/ + if (shift == PAGE_SHIFT_64K) + return -EINVAL; +#endif /* CONFIG_SPU_FS_64K_LS */ + + BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); + + /* Return if huge page size has already been setup */ + if (size_to_hstate(size)) + return 0; + + hugetlb_add_hstate(shift - PAGE_SHIFT); + + return 0; } static int __init hugepage_setup_sz(char *str) {
[5/6] Split hash MMU specific hugepage code into a new file
This patch separates the parts of hugetlbpage.c which are inherently specific to the hash MMU into a new hugetlbpage-hash64.c file. Signed-off-by: David Gibson d...@au1.ibm.com --- arch/powerpc/include/asm/hugetlb.h |3 arch/powerpc/mm/Makefile |5 - arch/powerpc/mm/hugetlbpage-hash64.c | 167 ++ arch/powerpc/mm/hugetlbpage.c| 168 --- 4 files changed, 176 insertions(+), 167 deletions(-) Index: working-2.6/arch/powerpc/mm/Makefile === --- working-2.6.orig/arch/powerpc/mm/Makefile 2009-10-27 15:07:38.0 +1100 +++ working-2.6/arch/powerpc/mm/Makefile2009-10-27 15:08:09.0 +1100 @@ -28,7 +28,10 @@ obj-$(CONFIG_44x)+= 44x_mmu.o obj-$(CONFIG_FSL_BOOKE)+= fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES)+= slice.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +ifeq ($(CONFIG_HUGETLB_PAGE),y) +obj-y += hugetlbpage.o +obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +endif obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o Index: working-2.6/arch/powerpc/mm/hugetlbpage-hash64.c === --- /dev/null 1970-01-01 00:00:00.0 + +++ working-2.6/arch/powerpc/mm/hugetlbpage-hash64.c2009-10-27 15:08:09.0 +1100 @@ -0,0 +1,167 @@ +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. 
+ * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth rohit.s...@intel.com + */ + +#include linux/mm.h +#include linux/hugetlb.h +#include asm/pgtable.h +#include asm/pgalloc.h +#include asm/cacheflush.h +#include asm/machdep.h + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, + pte_t pte, int trap, unsigned long sz) +{ + struct page *page; + int i; + + if (!pfn_valid(pte_pfn(pte))) + return rflags; + + page = pte_page(pte); + + /* page is dirty */ + if (!test_bit(PG_arch_1, page-flags) !PageReserved(page)) { + if (trap == 0x400) { + for (i = 0; i (sz / PAGE_SIZE); i++) + __flush_dcache_icache(page_address(page+i)); + set_bit(PG_arch_1, page-flags); + } else { + rflags |= HPTE_R_N; + } + } + return rflags; +} + +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, +pte_t *ptep, unsigned long trap, int local, int ssize, +unsigned int shift, unsigned int mmu_psize) +{ + unsigned long old_pte, new_pte; + unsigned long va, rflags, pa, sz; + long slot; + int err = 1; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + va = hpt_va(ea, vsid, ssize); + + /* +* Check the user's access rights to the page. If access should be +* prevented then send the problem up to do_page_fault. +*/ + if (unlikely(access ~pte_val(*ptep))) + goto out; + /* +* At this point, we have a pte (old_pte) which can be used to build +* or update an HPTE. There are 2 cases: +* +* 1. There is a valid (present) pte with no associated HPTE (this is +* the most common case) +* 2. There is a valid (present) pte with an associated HPTE. The +* current values of the pp bits in the HPTE prevent access +* because we are doing software DIRTY bit management and the +* page is currently not DIRTY. 
+*/ + + + do { + old_pte = pte_val(*ptep); + if (old_pte _PAGE_BUSY) + goto out; + new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, +old_pte, new_pte)); + + rflags = 0x2 | (!(new_pte _PAGE_RW)); + /* _PAGE_EXEC - HW_NO_EXEC since it's inverted */ + rflags |= ((new_pte _PAGE_EXEC) ? 0 : HPTE_R_N); + sz = ((1UL) shift); + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* No CPU has hugepages but lacks no execute, so we +* don't need to worry about that case */ + rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte), + trap, sz); + + /* Check if pte already has an hpte
[6/6] Bring hugepage PTE accessor functions back into sync with normal accessors
The hugepage arch code provides a number of hook functions/macros which mirror the functionality of various normal page pte access functions. Various changes in the normal page accessors (in particular BenH's recent changes to the handling of lazy icache flushing and PAGE_EXEC) have caused the hugepage versions to get out of sync with the originals. In some cases, this is a bug, at least on some MMU types. One of the reasons that some hooks were not identical to the normal page versions, is that the fact we're dealing with a hugepage needed to be passed down to use the correct dcache-icache flush function. This patch makes the main flush_dcache_icache_page() function hugepage aware (by checking for the PageCompound flag). That in turn means we can make set_huge_pte_at() just a call to set_pte_at() bringing it back into sync. As a bonus, this lets us remove the hash_huge_page_do_lazy_icache() function, replacing it with a call to the hash_page_do_lazy_icache() function it was based on. Some other hugepage pte access hooks - huge_ptep_get_and_clear() and huge_ptep_clear_flush() - are not so easily unified, but this patch at least brings them back into sync with the current versions of the corresponding normal page functions.
Signed-off-by: David Gibson d...@au1.ibm.com --- arch/powerpc/include/asm/hugetlb.h| 25 +++-- arch/powerpc/include/asm/mmu-hash64.h |1 + arch/powerpc/mm/hash_utils_64.c |2 +- arch/powerpc/mm/hugetlbpage-hash64.c | 30 +- arch/powerpc/mm/hugetlbpage.c | 31 ++- arch/powerpc/mm/mem.c | 17 + 6 files changed, 45 insertions(+), 61 deletions(-) Index: working-2.6/arch/powerpc/include/asm/hugetlb.h === --- working-2.6.orig/arch/powerpc/include/asm/hugetlb.h 2009-10-27 14:50:34.0 +1100 +++ working-2.6/arch/powerpc/include/asm/hugetlb.h 2009-10-27 14:56:31.0 +1100 @@ -6,6 +6,8 @@ pte_t *huge_pte_offset_and_shift(struct mm_struct *mm, unsigned long addr, unsigned *shift); +void flush_dcache_icache_hugepage(struct page *page); + int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len); @@ -13,12 +15,6 @@ void hugetlb_free_pgd_range(struct mmu_g unsigned long end, unsigned long floor, unsigned long ceiling); -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, -pte_t *ptep, pte_t pte); - -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - /* * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs * to override the version in mm/hugetlb.c @@ -44,9 +40,26 @@ static inline void hugetlb_prefault_arch { } + +static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + set_pte_at(mm, addr, ptep, pte); +} + +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1); + return __pte(old); +} + static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + pte_t pte; + pte = huge_ptep_get_and_clear(vma-vm_mm, addr, ptep); + flush_tlb_page(vma, addr); } static inline int huge_pte_none(pte_t pte) Index: working-2.6/arch/powerpc/include/asm/mmu-hash64.h === --- 
working-2.6.orig/arch/powerpc/include/asm/mmu-hash64.h 2009-10-27 14:36:36.0 +1100 +++ working-2.6/arch/powerpc/include/asm/mmu-hash64.h 2009-10-27 14:55:22.0 +1100 @@ -245,6 +245,7 @@ extern int __hash_page_64K(unsigned long unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned int local, int ssize); struct mm_struct; +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap); extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap); int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, int local, int ssize, Index: working-2.6/arch/powerpc/mm/hash_utils_64.c === --- working-2.6.orig/arch/powerpc/mm/hash_utils_64.c2009-10-27 14:42:47.0 +1100 +++ working-2.6/arch/powerpc/mm/hash_utils_64.c 2009-10-27 14:55:22.0 +1100 @@ -775,7 +775,7 @@ unsigned int hash_page_do_lazy_icache(un /* page is dirty */ if (!test_bit(PG_arch_1,
[PATCH v3] powerpc/ppc64: Use preempt_schedule_irq instead of preempt_schedule
So I _think_ that the irqs on/off accounting for lockdep isn't quite right. What do you think of this slightly modified version ? I've only done a quick boot test on a G5 with lockdep enabled and played a bit, nothing shows up so far but it's definitely not conclusive. The main difference is that I call trace_hardirqs_off to advertise the fact that we are soft-disabling (it could be a dup, but at this stage this is no big deal, but it's not always, like in syscall return the kernel thinks we have interrupts enabled and could thus get out of sync without that). I also mark the PACA hard disable to reflect the MSR:EE state before calling into preempt_schedule_irq(). Alright, second thought :-) It's probably simpler to just keep hardirqs off. Code is smaller and simpler and the scheduler will re-enable them soon enough anyways. This version of the patch also spaces the code a bit and adds comments which makes them (the code and the patch) more readable. Cheers, Ben. From: Benjamin Herrenschmidt b...@kernel.crashing.org [PATCH v3] powerpc/ppc64: Use preempt_schedule_irq instead of preempt_schedule Based on an original patch by Valentine Barshak vbars...@ru.mvista.com Use preempt_schedule_irq to prevent infinite irq-entry and eventual stack overflow problems with fast-paced IRQ sources. This kind of problem has been observed on the PASemi Electra IDE controller. We have to make sure we are soft-disabled before calling preempt_schedule_irq and hard disable interrupts after that to avoid unrecoverable exceptions. This patch also moves the clrrdi r9,r1,THREAD_SHIFT out of the #ifdef CONFIG_PPC_BOOK3E scope, since r9 is clobbered and has to be restored in both cases. 
--- arch/powerpc/kernel/entry_64.S | 41 --- 1 files changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index f9fd54b..9763267 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -658,42 +658,43 @@ do_work: cmpdi r0,0 crandc eq,cr1*4+eq,eq bne restore - /* here we are preempting the current task */ -1: -#ifdef CONFIG_TRACE_IRQFLAGS - bl .trace_hardirqs_on - /* Note: we just clobbered r10 which used to contain the previous -* MSR before the hard-disabling done by the caller of do_work. -* We don't have that value anymore, but it doesn't matter as -* we will hard-enable unconditionally, we can just reload the -* current MSR into r10 + + /* Here we are preempting the current task. +* +* Ensure interrupts are soft-disabled. We also properly mark +* the PACA to reflect the fact that they are hard-disabled +* and trace the change */ - mfmsr r10 -#endif /* CONFIG_TRACE_IRQFLAGS */ - li r0,1 + li r0,0 stb r0,PACASOFTIRQEN(r13) stb r0,PACAHARDIRQEN(r13) + TRACE_DISABLE_INTS + + /* Call the scheduler with soft IRQs off */ +1: bl .preempt_schedule_irq + + /* Hard-disable interrupts again (and update PACA) */ #ifdef CONFIG_PPC_BOOK3E - wrteei 1 - bl .preempt_schedule wrteei 0 #else - ori r10,r10,MSR_EE - mtmsrd r10,1 /* reenable interrupts */ - bl .preempt_schedule mfmsr r10 - clrrdi r9,r1,THREAD_SHIFT - rldicl r10,r10,48,1/* disable interrupts again */ + rldicl r10,r10,48,1 rotldi r10,r10,16 mtmsrd r10,1 #endif /* CONFIG_PPC_BOOK3E */ + li r0,0 + stb r0,PACAHARDIRQEN(r13) + + /* Re-test flags and eventually loop */ + clrrdi r9,r1,THREAD_SHIFT ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_NEED_RESCHED bne 1b b restore user_work: -#endif +#endif /* CONFIG_PREEMPT */ + /* Enable interrupts */ #ifdef CONFIG_PPC_BOOK3E wrteei 1 -- 1.6.1.2.14.gf26b5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev