[PATCH v2] powerpc: mpc5200: Add a3m071 board support
This patch adds the MPC5200B based a3m071 board. Signed-off-by: Stefan Roese s...@denx.de Cc: Anatolij Gustschin ag...@denx.de --- v2: - Remove cdm@200 DT node as it's not used - Disable i2c controller in dts as its unused on this board arch/powerpc/boot/dts/a3m071.dts | 144 +++ arch/powerpc/platforms/52xx/mpc5200_simple.c | 1 + 2 files changed, 145 insertions(+) create mode 100644 arch/powerpc/boot/dts/a3m071.dts diff --git a/arch/powerpc/boot/dts/a3m071.dts b/arch/powerpc/boot/dts/a3m071.dts new file mode 100644 index 000..877a28c --- /dev/null +++ b/arch/powerpc/boot/dts/a3m071.dts @@ -0,0 +1,144 @@ +/* + * a3m071 board Device Tree Source + * + * Copyright 2012 Stefan Roese s...@denx.de + * + * Copyright (C) 2011 DENX Software Engineering GmbH + * Heiko Schocher h...@denx.de + * + * Copyright (C) 2007 Semihalf + * Marian Balakowicz m...@semihalf.com + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ */ + +/include/ mpc5200b.dtsi + +/ { + model = anonymous,a3m071; + compatible = anonymous,a3m071; + + soc5200@f000 { + #address-cells = 1; + #size-cells = 1; + compatible = fsl,mpc5200b-immr; + ranges = 0 0xf000 0xc000; + reg = 0xf000 0x0100; + bus-frequency = 0; /* From boot loader */ + system-frequency = 0; /* From boot loader */ + + timer@600 { + fsl,has-wdt; + }; + + spi@f00 { + status = disabled; + }; + + usb: usb@1000 { + status = disabled; + }; + + psc@2000 { + compatible = fsl,mpc5200b-psc-uart,fsl,mpc5200-psc-uart; + reg = 0x2000 0x100; + interrupts = 2 1 0; + }; + + psc@2200 { + status = disabled; + }; + + psc@2400 { + status = disabled; + }; + + psc@2600 { + status = disabled; + }; + + psc@2800 { + status = disabled; + }; + + psc@2c00 { // PSC6 + compatible = fsl,mpc5200b-psc-uart,fsl,mpc5200-psc-uart; + reg = 0x2c00 0x100; + interrupts = 2 4 0; + }; + + ethernet@3000 { + phy-handle = phy0; + }; + + mdio@3000 { + phy0: ethernet-phy@3 { + reg = 0x03; + }; + }; + + ata@3a00 { + status = disabled; + }; + + i2c@3d00 { + status = disabled; + }; + + i2c@3d40 { + status = disabled; + }; + }; + + localbus { + compatible = fsl,mpc5200b-lpb,simple-bus; + #address-cells = 2; + #size-cells = 1; + ranges = 0 0 0xfc00 0x0200 + 3 0 0xe900 0x0008 + 5 0 0xe800 0x0001; + + flash@0,0 { + #address-cells = 1; + #size-cells = 1; + reg = 0 0x0 0x0200; + compatible = cfi-flash; + bank-width = 2; + partition@0x0 { + label = u-boot; + reg = 0x 0x0004; + read-only; + }; + partition@0x0004 { + label = env; + reg = 0x0004 0x0002; + }; + partition@0x0006 { + label = dtb; + reg = 0x0006 0x0002; + }; + partition@0x0008 { + label = kernel; + reg = 0x0008 0x0050; + }; + partition@0x0058 { + label = root; + reg = 0x0058 0x00A8; + }; + }; + + fpga@3,0 { + compatible = anonymous,a3m071-fpga; + reg = 3 0x0 0x0008 + 5 0x0 0x0001; +
Re: [linuxppc-release][PATCH] powerpc/pci-hotplug: fix init issue of rescanned pci device
On 12/06/2012 05:30 AM, Bjorn Helgaas wrote: On Wed, Dec 5, 2012 at 2:29 AM, Chen Yuanquan-B41889 b41...@freescale.com wrote: On 12/05/2012 04:26 PM, Benjamin Herrenschmidt wrote: On Wed, 2012-12-05 at 16:20 +0800, Chen Yuanquan-B41889 wrote: On 12/05/2012 03:17 PM, Benjamin Herrenschmidt wrote: On Wed, 2012-12-05 at 10:31 +0800, Yuanquan Chen wrote: On powerpc arch, some fixup work of PCI/PCI-e device is just done during the first scan at booting time. For the PCI/PCI-e device rescanned after linux OS booting up, the fixup work won't be done, which leads to dma_set_mask error or irq related issue in rescanned PCI/PCI-e device's driver. So, it does the same fixup work for the rescanned device to avoid this issue. Hrm, the patch is a bit gross. First the code shouldn't be copy/pasted that way but factored out. Please, at least format your email properly so I can try to undertand without needing aspirin. There's a judgement if (!bus-is_added) before calling of pcibios_fixup_bus in pci_scan_child_bus, so for the rescanned device, the fixup won't execute, which leads to fatal error in driver of rescanned device on freescale powerpc, no this issues on x86 arch. First, none of that invalidates my statement that you shouldn't duplicate a whole block of code like this. Even if your approach is correct (which is debated separately), at the very least you should factor the code out into a common function between the two copies. Remove the judgement, let it to do the pcibios_fixup_bus directly, the error won't occur for the rescanned device. But it's general code, not proper to change here, so copy the pcibios_fixup_bus work to pcibios_enable_device. I'm surprised also that is_added is false when pcibios_enable_device() gets called ... that looks strange to me. At what point is that enable happening in the hotplug sequence ? All devices are rescanned and then call the pci_enable_devices and pci_bus_add_devices. Where ? How ? What is the sequence happening ? 
In any case, I think if we need a proper fixup done per-device like that after scan we ought to create a new hook at the generic level rather than that sort of hack. echo 1 rescan to trigger dev_rescan_store: dev_rescan_store-pci_rescan_bus-pci_scan_child_bus, pci_assign_unassigned_bus_resources, pci_enable_bridges, pci_bus_add_devices pci_enable_bridges-pci_enable_device-__pci_enable_device_flags-do_pci_enable_device- pcibios_enable_device pci_bus_add_devices-pci_bus_add_device-dev-is_added = 1 Yeah, it's general fixup code for every rescanned PCI/PCI-e device on powerpc at runtime. So if we want to call it in a ppc_md member, we need to wrap it as a function and assign it in every ppc_md, it isn't proper for the general code. Regards, yuanquan The patch code will be called by pci_enable_devices. The dev-is_added is set in pci_bus_add_device which is called by pci_bus_add_devices. So dev-is_added is false when checking it in pcibios_enable_device for the rescanned device. Who calls pci_enable_device() in the rescan case ? Why isn't it left to the driver ? I don't think we can rely on that behaviour not to change. How do you trigger the rescan anyway ? Use the interface under /sys : echo 1 /sys/bus/pci/devices/xxx/remove then echo 1 to the pci device which is the bus of the removed device echo 1 /sys/bus/pci/devices//rescan the removed device will be scanned and it's driver module will be loaded automatically. Yeah this code path are known to be fishy. I think the problem is at the generic abstraction level and that's where it needs to be fixed. Cheers, Ben. Regards, yuanquan I think the problem needs to be solve at a higher level, I'm adding linux-pci Bjorn to the CC list. Cheers, Ben. 
Signed-off-by: Yuanquan Chen b41...@freescale.com --- arch/powerpc/kernel/pci-common.c | 20 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 7f94f76..f0fb070 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1496,6 +1496,26 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) if (ppc_md.pcibios_enable_device_hook(dev)) return -EINVAL; +if (!dev-is_added) { + /* +* Fixup NUMA node as it may not be setup yet by the generic +* code and is needed by the DMA init +*/ + set_dev_node(dev-dev, pcibus_to_node(dev-bus)); + + /* Hook up default DMA ops */ + set_dma_ops(dev-dev, pci_dma_ops); + set_dma_offset(dev-dev, PCI_DRAM_OFFSET); + + /* Additional platform DMA/iommu setup */ + if (ppc_md.pci_dma_dev_setup) + ppc_md.pci_dma_dev_setup(dev); + + /* Read default IRQs and fixup if necessary */ + pci_read_irq_line(dev); + if
Re: [stable] [PATCH] powerpc/ptrace: Fix build with gcc 4.6
Michael Ellerman mich...@ellerman.id.au wrote: On Thu, 2011-11-17 at 13:31 +1100, Michael Neuling wrote: From: Benjamin Herrenschmidt b...@kernel.crashing.org powerpc/ptrace: Fix build with gcc 4.6 gcc (rightfully) complains that we are accessing beyond the end of the fpr array (we do, to access the fpscr). This patch is still missing from the 3.0 stable series. Do we need to resend? And resent to the correct stable address. cheers -- Sent from my Android phone with K-9 Mail. Please excuse my brevity. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: Understanding how kernel updates MMU hash table
On Wed, 2012-12-05 at 23:57 -0800, Pegasus11 wrote: Hi Ben. Got it..no more quoting replies... Quoting is fine ... as long as you quote the bits your reply to, not your actual reply part :) You mentioned the MMU looking into a hash table if it misses a translation entry in the TLB. This means that there is a hardware TLB for sure. Sure, nobody sane would design a CPU without one nowadays :-) By your words, I understand that the hash table is an in-memory cache of translations meaning it is implemented in software. Well, it's populated by software and read by HW. IE. On x86, the MMU will walk a radix tree of page tables, on powerpc it will walk an in memory hash table. The main difference is that on x86, there is usually a tree per process while the powerpc hash table tends to be global. So whenever the MMU wishes to translate a virtual address, it first checks the TLB and if it isn't found there, it looks for it in the hash table. Now this seems fine to me when looked at from the perspective of the MMU. Now when I look at it from the kernel's perspective, I am a bit confused. So when we (the kernel) encounter a virtual address, we walk the page tables and if we find that there is no valid entry for this address, we page fault which causes an exception right? Hrm ... not sure what we mean by the kernel. There are two different path here, but let's focus on the usual case... the processor encounters an address, whether it's trying to fetch an instruction, or having done that, is performing a load or a store. This will use what we call in powerpc lingua an effective address. This gets in turn turned into a virtual address after an SLB lookup. I refer you to the architecture here, it's a bit tricky but basically the principle is that the virtual address space is *somewhat* the effective address space along with the process id. 
Except that on powerpc, we do that per-segment (we divide the address space into segments) so each segment has its top bits transformed into something larger called the VSID. In any case, this results in a virtual address which is then looked up in the TLB (I'm ignoring the ERAT here which is the 1-st level TLB but let's not complicate things even more). If that misses, the CPU looks up in the hash table. If that misses, it causes an exception (0x300 for data accesses, 0x400 for instruction accesses). There, Linux will usually go into hash_page which looks for the Linux PTE. If the PTE is absent (or has any other reason to be unusable such as being read-only for a write access), we get to do_page_fault. Else, we populate the hash table with a translation, set the HASHPTE bit in the PTE, and retry the access. And this exception then takes us to the exception handler which I guess is 'do_page_fault'. On checking this function I see that it gets the PGD, allocates a PMD, allocates a PTE and then it calls handle_pte_fault. The comment banner for handle_pte_fault reads: 1638 /* These routines also need to handle stuff like marking pages dirty 1639 * and/or accessed for architectures that don't do it in hardware (most 1640 * RISC architectures). The early dirtying is also good on the i386. 1641 * 1642 * There is also a hook called update_mmu_cache() that architectures 1643 * with external mmu caches can use to update those (ie the Sparc or 1644 * PowerPC hashed page tables that act as extended TLBs) . */ Yes, when we go to do_page_fault() because the PTE wasn't populated in the first place, we have a hook to pre-fill the hash table instead of taking a fault again which will fill it the second time around. It's just a shortcut. It is from such comments that I inferred that the hash tables were being used as extended TLBs. However the above also infers (atleast to me) that these caches are in hardware as theyve used the word 'extended'. 
Pardon me if I am being nitpicky but these things are confusing me a bit. So to clear this confusion, there are three things I would like to know. 1. Is the MMU cache implemented in hardware or software? I trust you on it being software but it would be great if you could address my concern in the above paragraph. The TLB is a piece of HW. (there's really three in fact, the I-ERAT, the D-ERAT and the TLB ;-) The Hash Table is a piece of RAM (pointed to by the SDR1 register) setup by the OS and populated by the OS but read by the HW. Just like the page tables on x86. 2. The kernel, it looks from the do_page_fault sequence, is updating its internal page table first and then it goes on to update the mmu cache. So this only means it is satisfying the requirement of someone else, perhaps the MMU here. update_mmu_cache() is just a shortcut. As I explained above, we populate the hash table lazily on fault. However, when taking an actual high level page fault (do_page_fault), we *know* the hash doesn't have an appropriate translation, so rather than just filling up the linux PTE and then
[PATCH 13/20] ALSA: sound/ps3: remove __dev* attributes
CONFIG_HOTPLUG is going away as an option. As result the __dev* markings will be going away. Remove use of __devinit, __devexit_p, __devinitdata, __devinitconst, and __devexit. Signed-off-by: Bill Pemberton wf...@virginia.edu Cc: Geoff Levand ge...@infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: cbe-oss-...@lists.ozlabs.org --- sound/ppc/snd_ps3.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sound/ppc/snd_ps3.c b/sound/ppc/snd_ps3.c index 9b18b52..8c7dcbe 100644 --- a/sound/ppc/snd_ps3.c +++ b/sound/ppc/snd_ps3.c @@ -786,7 +786,7 @@ static struct snd_pcm_ops snd_ps3_pcm_spdif_ops = { }; -static int __devinit snd_ps3_map_mmio(void) +static int snd_ps3_map_mmio(void) { the_card.mapped_mmio_vaddr = ioremap(the_card.ps3_dev-m_region-bus_addr, @@ -808,7 +808,7 @@ static void snd_ps3_unmap_mmio(void) the_card.mapped_mmio_vaddr = NULL; } -static int __devinit snd_ps3_allocate_irq(void) +static int snd_ps3_allocate_irq(void) { int ret; u64 lpar_addr, lpar_size; @@ -866,7 +866,7 @@ static void snd_ps3_free_irq(void) ps3_irq_plug_destroy(the_card.irq_no); } -static void __devinit snd_ps3_audio_set_base_addr(uint64_t ioaddr_start) +static void snd_ps3_audio_set_base_addr(uint64_t ioaddr_start) { uint64_t val; int ret; @@ -882,7 +882,7 @@ static void __devinit snd_ps3_audio_set_base_addr(uint64_t ioaddr_start) ret); } -static void __devinit snd_ps3_audio_fixup(struct snd_ps3_card_info *card) +static void snd_ps3_audio_fixup(struct snd_ps3_card_info *card) { /* * avsetting driver seems to never change the followings @@ -906,7 +906,7 @@ static void __devinit snd_ps3_audio_fixup(struct snd_ps3_card_info *card) PS3_AUDIO_AO_3WMCTRL_ASOPLRCK_DEFAULT); } -static int __devinit snd_ps3_init_avsetting(struct snd_ps3_card_info *card) +static int snd_ps3_init_avsetting(struct snd_ps3_card_info *card) { int ret; pr_debug(%s: start\n, __func__); @@ -928,7 +928,7 @@ static int __devinit snd_ps3_init_avsetting(struct snd_ps3_card_info *card) return ret; } 
-static int __devinit snd_ps3_driver_probe(struct ps3_system_bus_device *dev) +static int snd_ps3_driver_probe(struct ps3_system_bus_device *dev) { int i, ret; u64 lpar_addr, lpar_size; -- 1.8.0.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 13/20] ALSA: sound/ps3: remove __dev* attributes
On Thu, 2012-12-06 at 12:35 -0500, Bill Pemberton wrote: CONFIG_HOTPLUG is going away as an option. As a result, the __dev* markings will be going away. Remove use of __devinit, __devexit_p, __devinitdata, __devinitconst, and __devexit. Signed-off-by: Bill Pemberton wf...@virginia.edu Cc: Geoff Levand ge...@infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: cbe-oss-...@lists.ozlabs.org --- sound/ppc/snd_ps3.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) Looks OK for PS3. Acked-by: Geoff Levand ge...@infradead.org ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap
Hi Wu, I met some problems when I was digging into the code. It's very kind of you if you could help me with that. :) If I misunderstood your code, please tell me. Please see below. :) On 12/03/2012 10:23 AM, Jianguo Wu wrote: Signed-off-by: Jianguo Wuwujian...@huawei.com Signed-off-by: Jiang Liujiang@huawei.com --- include/linux/mm.h |1 + mm/sparse-vmemmap.c | 231 +++ mm/sparse.c |3 +- 3 files changed, 234 insertions(+), 1 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5657670..1f26af5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long size); +void vmemmap_free(struct page *memmap, unsigned long nr_pages); enum mf_flags { MF_COUNT_INCREASED = 1 0, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1b7e22a..748732d 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -29,6 +29,10 @@ #includeasm/pgalloc.h #includeasm/pgtable.h +#ifdef CONFIG_MEMORY_HOTREMOVE +#includeasm/tlbflush.h +#endif + /* * Allocate a block of memory to be used to back the virtual memory map * or to back the page tables that are used to create the mapping. 
@@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, vmemmap_buf_end = NULL; } } + +#ifdef CONFIG_MEMORY_HOTREMOVE + +#define PAGE_INUSE 0xFD + +static void vmemmap_free_pages(struct page *page, int order) +{ + struct zone *zone; + unsigned long magic; + + magic = (unsigned long) page-lru.next; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + put_page_bootmem(page); + + zone = page_zone(page); + zone_span_writelock(zone); + zone-present_pages++; + zone_span_writeunlock(zone); + totalram_pages++; + } else + free_pages((unsigned long)page_address(page), order); Here, I think SECTION_INFO and MIX_SECTION_INFO pages are all allocated by bootmem, so I put this function this way. I'm not sure if parameter order is necessary here. It will always be 0 in your code. Is this OK to you ? static void free_pagetable(struct page *page) { struct zone *zone; bool bootmem = false; unsigned long magic; /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); bootmem = true; } magic = (unsigned long) page-lru.next; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) put_page_bootmem(page); else __free_page(page); /* * SECTION_INFO pages and MIX_SECTION_INFO pages * are all allocated by bootmem. */ if (bootmem) { zone = page_zone(page); zone_span_writelock(zone); zone-present_pages++; zone_span_writeunlock(zone); totalram_pages++; } } (snip) + +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + unsigned long next; + void *page_addr; + + pte = pte_offset_kernel(pmd, addr); + for (; addr end; pte++, addr += PAGE_SIZE) { + next = (addr + PAGE_SIZE) PAGE_MASK; + if (next end) + next = end; + + if (pte_none(*pte)) Here, you checked xxx_none() in your vmemmap_xxx_remove(), but you used !xxx_present() in your x86_64 patches. Is it OK if I only check !xxx_present() ? 
+ continue; + if (IS_ALIGNED(addr, PAGE_SIZE) + IS_ALIGNED(next, PAGE_SIZE)) { + vmemmap_free_pages(pte_page(*pte), 0); + spin_lock(init_mm.page_table_lock); + pte_clear(init_mm, addr, pte); + spin_unlock(init_mm.page_table_lock); + } else { + /* +* Removed page structs are filled with 0xFD. +*/ + memset((void *)addr, PAGE_INUSE, next - addr); + page_addr = page_address(pte_page(*pte)); + + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { + spin_lock(init_mm.page_table_lock); + pte_clear(init_mm, addr, pte); + spin_unlock(init_mm.page_table_lock); Here, since we clear pte, we should also free the page, right ? + } + } + } + +
[RFC] Add IBM Blue Gene/Q Platform
Rather than flood the mailing list with the patches, I've arranged for a git repo to hold the changesets. You can find the repo here: https://github.com/jimix/linux-bgq They are against GregKH's linux-stable.git long-term 3.4.y (y=22) branch. The first 9 (6e58088f..) effect common code and the rest are BGQ specific. Here is a are the summary logs: $ git log --reverse linux-stable/linux-3.4.y.. commit 5a8edb2bdd914597693eed299119ff4c2e6d31f2 Author: Jimi Xenidis ji...@pobox.com Date: Fri Nov 9 09:26:00 2012 -0600 powerpc: Fix cputable #ifdef where CONFIG_PPC_A2 is used for CONFIG_PPC_BOOK3E_64 Signed-off-by: Jimi Xenidis ji...@pobox.com commit ea51920d7035c8d23801d6de46261e7d0a537dfd Author: Jimi Xenidis ji...@pobox.com Date: Fri Nov 9 08:58:27 2012 -0600 powerpc/book3e: Remove config for PPC_A2_DD2 since there is no reference to it This must have been leftover from early DD1 days which is not present in any current kernel code. Signed-off-by: Jimi Xenidis ji...@pobox.com commit 08151401a5db4ff0d441a1b7bf8ad92bd92b14c5 Author: Jimi Xenidis ji...@pobox.com Date: Mon Nov 5 09:38:01 2012 -0600 powerpc/dcr: Some native DCR fixes The following fixes have been made: - dcr_read/write_native() must use the indexed version of the m[ft]dcrx since the non-indexed version only allows a 10-bit numerical space, but the C interface allows a full 32-bits. - C bindings for m[ft]dcrx, and the table versions, should use unsigned long so that they are 64/32 bit neutral. - The table versions (__m[ft]cdr) should obtain the table address with LOAD_REG_ADDR(), this will also make it 64/32bit neutral. Signed-off-by: Jimi Xenidis ji...@pobox.com commit c8320a5daaceed03992d763302020834ea8e17dd Author: Jimi Xenidis ji...@pobox.com Date: Mon Nov 5 09:12:00 2012 -0600 powerpc/dcr: Add 64-bit DCR access methods. This patch adds the ability to make 64-bit Device Control Register (DCR) accesses. 
Signed-off-by: Jimi Xenidis ji...@pobox.com commit a763b3f8453b3bd83d7dded8c6644939863af430 Author: Jimi Xenidis ji...@pobox.com Date: Thu Nov 29 12:49:24 2012 -0500 powerpc/boot: Add a spin_threads hook to platform_ops It is useful for the boot program to arrange for all secondary cpus and threads to enter the kernel in a kexec fashion. This hook makes it possible. Signed-off-by: Jimi Xenidis ji...@pobox.com commit 391e43393380b514d4d02a42d059619542c7597b Author: Jimi Xenidis ji...@pobox.com Date: Thu Nov 29 13:01:23 2012 -0500 powerpc/kexec: Add kexec hold support for Book3e processors This patch add two items: 1) Book3e requires that GPR4 survive the hold process, so we make sure that happens. 2) Book3e has no real mode, and the hold code exploits this. Since these processors ares always translated, we arrange for the kexeced threads to enter the hold code using the normal kernel linear mapping. Signed-off-by: Jimi Xenidis ji...@pobox.com commit f6e3c1f706cb6922349d639a74ff6c50acc8b9f8 Author: Jimi Xenidis ji...@pobox.com Date: Wed Dec 5 13:41:25 2012 -0500 powerpc: Remove unecessary VSX symbols The symbol THREAD_VSR0 is defined to be the same as THREAD_FPR0. Its presence causes build issues with more complex configurations. 
Signed-off-by: Jimi Xenidis ji...@pobox.com commit 4e817bb42ec8e3d3689877528dd97c4286a870eb Author: Jimi Xenidis ji...@pobox.com Date: Tue Nov 20 10:10:52 2012 -0600 Blue Gene/Q wicked optimizing compiler does not know the rfdi instruction yet Signed-off-by: Jimi Xenidis ji...@pobox.com commit 2071aa58b2f3b33d97c94e3a127f7c5d4ffaeb34 Author: Jimi Xenidis ji...@pobox.com Date: Tue Nov 20 10:14:22 2012 -0600 Blue Gene/Q wicked optimizing compiler does not know the mfdcrx instruction yet Signed-off-by: Jimi Xenidis ji...@pobox.com commit 6e58088fabedbb2d724637b539ba180c03ed8b68 Author: Jimi Xenidis ji...@pobox.com Date: Wed Oct 31 16:33:21 2012 -0500 powerpc/book3e: IBM Blue Gene/Q Boot This patch specifically deals with the initial program load environment so that a boot image (dtbImage.bgq) can be loaded by the BGQ management tools. The boot code is a little odd because it has to deal with the following issues: - Linux boot image wrappers are 32-bit programs - BGQ Tools only load 64bit ELF programs - BGQ Firmware information is typically loaded at an address 4G - BGQ FW information contains 64-bit ABI function pointers (which are actually function descriptors) to access firmware methods - BGQ FW methods must be called in 64-bit mode Includes code contributed from: Andrew Tauferner atau...@us.ibm.com Todd Inglett tingl...@us.ibm.com Eric Van Hensbergen eri...@gmail.com Signed-off-by: Jimi Xenidis ji...@pobox.com commit
Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap
Hi Tang, On 2012/12/7 9:42, Tang Chen wrote: Hi Wu, I met some problems when I was digging into the code. It's very kind of you if you could help me with that. :) If I misunderstood your code, please tell me. Please see below. :) On 12/03/2012 10:23 AM, Jianguo Wu wrote: Signed-off-by: Jianguo Wuwujian...@huawei.com Signed-off-by: Jiang Liujiang@huawei.com --- include/linux/mm.h |1 + mm/sparse-vmemmap.c | 231 +++ mm/sparse.c |3 +- 3 files changed, 234 insertions(+), 1 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5657670..1f26af5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long size); +void vmemmap_free(struct page *memmap, unsigned long nr_pages); enum mf_flags { MF_COUNT_INCREASED = 1 0, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1b7e22a..748732d 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -29,6 +29,10 @@ #includeasm/pgalloc.h #includeasm/pgtable.h +#ifdef CONFIG_MEMORY_HOTREMOVE +#includeasm/tlbflush.h +#endif + /* * Allocate a block of memory to be used to back the virtual memory map * or to back the page tables that are used to create the mapping. 
@@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, vmemmap_buf_end = NULL; } } + +#ifdef CONFIG_MEMORY_HOTREMOVE + +#define PAGE_INUSE 0xFD + +static void vmemmap_free_pages(struct page *page, int order) +{ +struct zone *zone; +unsigned long magic; + +magic = (unsigned long) page-lru.next; +if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { +put_page_bootmem(page); + +zone = page_zone(page); +zone_span_writelock(zone); +zone-present_pages++; +zone_span_writeunlock(zone); +totalram_pages++; +} else +free_pages((unsigned long)page_address(page), order); Here, I think SECTION_INFO and MIX_SECTION_INFO pages are all allocated by bootmem, so I put this function this way. I'm not sure if parameter order is necessary here. It will always be 0 in your code. Is this OK to you ? parameter order is necessary in cpu_has_pse case: vmemmap_pmd_remove free_pagetable(pmd_page(*pmd), get_order(PMD_SIZE)) static void free_pagetable(struct page *page) { struct zone *zone; bool bootmem = false; unsigned long magic; /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); bootmem = true; } magic = (unsigned long) page-lru.next; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) put_page_bootmem(page); else __free_page(page); /* * SECTION_INFO pages and MIX_SECTION_INFO pages * are all allocated by bootmem. */ if (bootmem) { zone = page_zone(page); zone_span_writelock(zone); zone-present_pages++; zone_span_writeunlock(zone); totalram_pages++; } } (snip) + +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long end) +{ +pte_t *pte; +unsigned long next; +void *page_addr; + +pte = pte_offset_kernel(pmd, addr); +for (; addr end; pte++, addr += PAGE_SIZE) { +next = (addr + PAGE_SIZE) PAGE_MASK; +if (next end) +next = end; + +if (pte_none(*pte)) Here, you checked xxx_none() in your vmemmap_xxx_remove(), but you used !xxx_present() in your x86_64 patches. 
Is it OK if I only check !xxx_present() ? It is Ok. +continue; +if (IS_ALIGNED(addr, PAGE_SIZE) +IS_ALIGNED(next, PAGE_SIZE)) { +vmemmap_free_pages(pte_page(*pte), 0); +spin_lock(init_mm.page_table_lock); +pte_clear(init_mm, addr, pte); +spin_unlock(init_mm.page_table_lock); +} else { +/* + * Removed page structs are filled with 0xFD. + */ +memset((void *)addr, PAGE_INUSE, next - addr); +page_addr = page_address(pte_page(*pte)); + +if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { +spin_lock(init_mm.page_table_lock); +pte_clear(init_mm, addr, pte); +spin_unlock(init_mm.page_table_lock); Here, since we clear pte, we should also free the page, right ? Right, I forgot here, sorry. +} +
Re: [RFC] Add IBM Blue Gene/Q Platform
commit f6e3c1f706cb6922349d639a74ff6c50acc8b9f8 Author: Jimi Xenidis ji...@pobox.com Date: Wed Dec 5 13:41:25 2012 -0500 powerpc: Remove unnecessary VSX symbols The symbol THREAD_VSR0 is defined to be the same as THREAD_FPR0. Its presence causes build issues with more complex configurations. Signed-off-by: Jimi Xenidis ji...@pobox.com Can you explain what these complex configurations are? Mikey ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC] Add IBM Blue Gene/Q Platform
commit 279c0615917b959a652e81f4ad0d886e2d426d85 Author: Jimi Xenidis ji...@pobox.com Date: Wed Dec 5 13:43:22 2012 -0500 powerpc/book3e: IBM Blue Gene/Q Quad Processing eXtention (QPX) This enables kernel support for the QPX extention and is intended for processors that support it, usually an IBM Blue Gene processor. Turning it on does not effect other processors but it does add code and will quadruple the per thread save and restore area for the FPU (hense the name). If you have enabled VSX it will only double the space. Signed-off-by: Jimi Xenidis ji...@pobox.com Can you give a diagram of how the QPX registers are layed out. +#if defined(CONFIG_PPC_QPX) +#define TS_FPRWIDTH 4 +#elif defined(CONFIG_VSX) Are they 256 bits wide? +#define QVLFDXA(QRT,RA,RB) \ + .long (0x7c00048f | ((QRT) 21) | ((RA) 16) | ((RB) 11)) Put this in ppc-opcode.h. +#if defined(CONFIG_VSX) || defined(CONFIG_PPC_QPX) + /* they are the same MSR bit */ OMG! +BEGIN_FTR_SECTION \ + SAVE_32VSRS(n,c,base); \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX);\ +BEGIN_FTR_SECTION \ + SAVE_32QRS(n,c,base); \ +END_FTR_SECTION_IFSET(CPU_FTR_QPX); I don't think we want to do this. We are going to end up with 64 NOPS here somewhere. I'd like to see this patch broken into different parts. Also, have you boot tested this change on a VSX enabled box? Mikey ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC] Add IBM Blue Gene/Q Platform
Michael Neuling mi...@neuling.org wrote: commit 279c0615917b959a652e81f4ad0d886e2d426d85 Author: Jimi Xenidis ji...@pobox.com Date: Wed Dec 5 13:43:22 2012 -0500 powerpc/book3e: IBM Blue Gene/Q Quad Processing eXtention (QPX) This enables kernel support for the QPX extention and is intended for processors that support it, usually an IBM Blue Gene processor. Turning it on does not effect other processors but it does add code and will quadruple the per thread save and restore area for the FPU (hense the name). If you have enabled VSX it will only double the space. Signed-off-by: Jimi Xenidis ji...@pobox.com Can you give a diagram of how the QPX registers are layed out. +#if defined(CONFIG_PPC_QPX) +#define TS_FPRWIDTH 4 +#elif defined(CONFIG_VSX) Are they 256 bits wide? +#define QVLFDXA(QRT,RA,RB) \ + .long (0x7c00048f | ((QRT) 21) | ((RA) 16) | ((RB) 11)) Put this in ppc-opcode.h. +#if defined(CONFIG_VSX) || defined(CONFIG_PPC_QPX) + /* they are the same MSR bit */ OMG! +BEGIN_FTR_SECTION\ + SAVE_32VSRS(n,c,base); \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ +BEGIN_FTR_SECTION\ + SAVE_32QRS(n,c,base); \ +END_FTR_SECTION_IFSET(CPU_FTR_QPX); I don't think we want to do this. We are going to end up with 64 NOPS here somewhere. I'd like to see this patch broken into different parts. Also, have you boot tested this change on a VSX enabled box? Also, this is going to clash with the transactional memory patches. Mikey ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC] Add IBM Blue Gene/Q Platform
Jimi Xenidis ji...@pobox.com wrote: Rather than flood the mailing list with the patches, I've arranged for a git repo to hold the changesets. You can find the repo here: https://github.com/jimix/linux-bgq They are against GregKH's linux-stable.git long-term 3.4.y (y=22) branch. The first 9 (6e58088f..) effect common code and the rest are BGQ specific. Do you actually want this upstream? I assume no. Mikey Here is a are the summary logs: $ git log --reverse linux-stable/linux-3.4.y.. commit 5a8edb2bdd914597693eed299119ff4c2e6d31f2 Author: Jimi Xenidis ji...@pobox.com Date: Fri Nov 9 09:26:00 2012 -0600 powerpc: Fix cputable #ifdef where CONFIG_PPC_A2 is used for CONFIG_PPC_BOOK3E_64 Signed-off-by: Jimi Xenidis ji...@pobox.com commit ea51920d7035c8d23801d6de46261e7d0a537dfd Author: Jimi Xenidis ji...@pobox.com Date: Fri Nov 9 08:58:27 2012 -0600 powerpc/book3e: Remove config for PPC_A2_DD2 since there is no reference to it This must have been leftover from early DD1 days which is not present in any current kernel code. Signed-off-by: Jimi Xenidis ji...@pobox.com commit 08151401a5db4ff0d441a1b7bf8ad92bd92b14c5 Author: Jimi Xenidis ji...@pobox.com Date: Mon Nov 5 09:38:01 2012 -0600 powerpc/dcr: Some native DCR fixes The following fixes have been made: - dcr_read/write_native() must use the indexed version of the m[ft]dcrx since the non-indexed version only allows a 10-bit numerical space, but the C interface allows a full 32-bits. - C bindings for m[ft]dcrx, and the table versions, should use unsigned long so that they are 64/32 bit neutral. - The table versions (__m[ft]cdr) should obtain the table address with LOAD_REG_ADDR(), this will also make it 64/32bit neutral. Signed-off-by: Jimi Xenidis ji...@pobox.com commit c8320a5daaceed03992d763302020834ea8e17dd Author: Jimi Xenidis ji...@pobox.com Date: Mon Nov 5 09:12:00 2012 -0600 powerpc/dcr: Add 64-bit DCR access methods. This patch adds the ability to make 64-bit Device Control Register (DCR) accesses. 
Signed-off-by: Jimi Xenidis ji...@pobox.com commit a763b3f8453b3bd83d7dded8c6644939863af430 Author: Jimi Xenidis ji...@pobox.com Date: Thu Nov 29 12:49:24 2012 -0500 powerpc/boot: Add a spin_threads hook to platform_ops It is useful for the boot program to arrange for all secondary cpus and threads to enter the kernel in a kexec fashion. This hook makes it possible. Signed-off-by: Jimi Xenidis ji...@pobox.com commit 391e43393380b514d4d02a42d059619542c7597b Author: Jimi Xenidis ji...@pobox.com Date: Thu Nov 29 13:01:23 2012 -0500 powerpc/kexec: Add kexec hold support for Book3e processors This patch adds two items: 1) Book3e requires that GPR4 survive the hold process, so we make sure that happens. 2) Book3e has no real mode, and the hold code exploits this. Since these processors are always translated, we arrange for the kexeced threads to enter the hold code using the normal kernel linear mapping. Signed-off-by: Jimi Xenidis ji...@pobox.com commit f6e3c1f706cb6922349d639a74ff6c50acc8b9f8 Author: Jimi Xenidis ji...@pobox.com Date: Wed Dec 5 13:41:25 2012 -0500 powerpc: Remove unnecessary VSX symbols The symbol THREAD_VSR0 is defined to be the same as THREAD_FPR0. Its presence causes build issues with more complex configurations.
Signed-off-by: Jimi Xenidis ji...@pobox.com commit 4e817bb42ec8e3d3689877528dd97c4286a870eb Author: Jimi Xenidis ji...@pobox.com Date: Tue Nov 20 10:10:52 2012 -0600 Blue Gene/Q wicked optimizing compiler does not know the rfdi instruction yet Signed-off-by: Jimi Xenidis ji...@pobox.com commit 2071aa58b2f3b33d97c94e3a127f7c5d4ffaeb34 Author: Jimi Xenidis ji...@pobox.com Date: Tue Nov 20 10:14:22 2012 -0600 Blue Gene/Q wicked optimizing compiler does not know the mfdcrx instruction yet Signed-off-by: Jimi Xenidis ji...@pobox.com commit 6e58088fabedbb2d724637b539ba180c03ed8b68 Author: Jimi Xenidis ji...@pobox.com Date: Wed Oct 31 16:33:21 2012 -0500 powerpc/book3e: IBM Blue Gene/Q Boot This patch specifically deals with the initial program load environment so that a boot image (dtbImage.bgq) can be loaded by the BGQ management tools. The boot code is a little odd because it has to deal with the following issues: - Linux boot image wrappers are 32-bit programs - BGQ Tools only load 64bit ELF programs - BGQ Firmware information is typically loaded at an address 4G - BGQ FW information contains 64-bit ABI function pointers (which are actually function descriptors) to access firmware methods - BGQ FW methods must be called in 64-bit mode
Re: [Patch v4 09/12] memory-hotplug: remove page table of x86_64 architecture
On 11/27/2012 06:00 PM, Wen Congyang wrote: For hot removing memory, we sholud remove page table about the memory. So the patch searches a page table about the removed memory, and clear page table. (snip) +void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ + unsigned long next; + bool pgd_changed = false; + + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); Hi Wu, Here, you expect start and end are physical addresses. But in phys_xxx_remove() function, I think using virtual addresses is just fine. Functions like pmd_addr_end() and pud_index() only calculate an offset. So, would you please tell me if we have to use physical addresses here ? Thanks. :) + + for (; start end; start = next) { + pgd_t *pgd = pgd_offset_k(start); + pud_t *pud; + + next = pgd_addr_end(start, end); + + if (!pgd_present(*pgd)) + continue; + + pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + phys_pud_remove(pud, __pa(start), __pa(next)); + if (free_pud_table(pud, pgd)) + pgd_changed = true; + unmap_low_page(pud); + } + + if (pgd_changed) + sync_global_pgds(start, end - 1); + + flush_tlb_all(); +} + #ifdef CONFIG_MEMORY_HOTREMOVE int __ref arch_remove_memory(u64 start, u64 size) { @@ -692,6 +921,8 @@ int __ref arch_remove_memory(u64 start, u64 size) ret = __remove_pages(zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + kernel_physical_mapping_remove(start, start + size); + return ret; } #endif ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [Patch v4 09/12] memory-hotplug: remove page table of x86_64 architecture
On 2012/12/7 14:43, Tang Chen wrote: On 11/27/2012 06:00 PM, Wen Congyang wrote: For hot removing memory, we sholud remove page table about the memory. So the patch searches a page table about the removed memory, and clear page table. (snip) +void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ +unsigned long next; +bool pgd_changed = false; + +start = (unsigned long)__va(start); +end = (unsigned long)__va(end); Hi Wu, Here, you expect start and end are physical addresses. But in phys_xxx_remove() function, I think using virtual addresses is just fine. Functions like pmd_addr_end() and pud_index() only calculate an offset. Hi Tang, Virtual addresses will work fine, I used physical addresses in order to keep consistent with phys_pud[pmd/pte]_init(), So I think we should keep this. Thanks, Jianguo Wu So, would you please tell me if we have to use physical addresses here ? Thanks. :) + +for (; start end; start = next) { +pgd_t *pgd = pgd_offset_k(start); +pud_t *pud; + +next = pgd_addr_end(start, end); + +if (!pgd_present(*pgd)) +continue; + +pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); +phys_pud_remove(pud, __pa(start), __pa(next)); +if (free_pud_table(pud, pgd)) +pgd_changed = true; +unmap_low_page(pud); +} + +if (pgd_changed) +sync_global_pgds(start, end - 1); + +flush_tlb_all(); +} + #ifdef CONFIG_MEMORY_HOTREMOVE int __ref arch_remove_memory(u64 start, u64 size) { @@ -692,6 +921,8 @@ int __ref arch_remove_memory(u64 start, u64 size) ret = __remove_pages(zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); +kernel_physical_mapping_remove(start, start + size); + return ret; } #endif . ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] vfio powerpc: implemented IOMMU driver for VFIO
VFIO implements platform independent stuff such as a PCI driver, BAR access (via read/write on a file descriptor or direct mapping when possible) and IRQ signaling. The platform dependent part includes IOMMU initialization and handling. This patch implements an IOMMU driver for VFIO which does mapping/unmapping pages for the guest IO and provides information about DMA window (required by a POWERPC guest). The counterpart in QEMU is required to support this functionality. Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/Kconfig|6 + drivers/vfio/Makefile |1 + drivers/vfio/vfio_iommu_spapr_tce.c | 348 +++ include/linux/vfio.h| 30 +++ 4 files changed, 385 insertions(+) create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 7cd5dec..b464687 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1 depends on VFIO default n +config VFIO_IOMMU_SPAPR_TCE + tristate + depends on VFIO SPAPR_TCE_IOMMU + default n + menuconfig VFIO tristate VFIO Non-Privileged userspace driver framework depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. 
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 2398d4a..72bfabc 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_VFIO) += vfio.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c new file mode 100644 index 000..b0f81fe --- /dev/null +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -0,0 +1,348 @@ +/* + * VFIO: IOMMU DMA mapping support for TCE on POWER + * + * Copyright (C) 2012 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy a...@ozlabs.ru + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio_iommu_type1.c: + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 
+ * Author: Alex Williamson alex.william...@redhat.com + */ + +#include linux/module.h +#include linux/pci.h +#include linux/slab.h +#include linux/uaccess.h +#include linux/err.h +#include linux/vfio.h +#include asm/iommu.h + +#define DRIVER_VERSION 0.1 +#define DRIVER_AUTHOR a...@ozlabs.ru +#define DRIVER_DESC VFIO IOMMU SPAPR TCE + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group); + +/* + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation + */ + +/* + * This code handles mapping and unmapping of user data buffers + * into DMA'ble space using the IOMMU + */ + +#define NPAGE_TO_SIZE(npage) ((size_t)(npage) PAGE_SHIFT) + +struct vwork { + struct mm_struct*mm; + longnpage; + struct work_struct work; +}; + +/* delayed decrement/increment for locked_vm */ +static void lock_acct_bg(struct work_struct *work) +{ + struct vwork *vwork = container_of(work, struct vwork, work); + struct mm_struct *mm; + + mm = vwork-mm; + down_write(mm-mmap_sem); + mm-locked_vm += vwork-npage; + up_write(mm-mmap_sem); + mmput(mm); + kfree(vwork); +} + +static void lock_acct(long npage) +{ + struct vwork *vwork; + struct mm_struct *mm; + + if (!current-mm) + return; /* process exited */ + + if (down_write_trylock(current-mm-mmap_sem)) { + current-mm-locked_vm += npage; + up_write(current-mm-mmap_sem); + return; + } + + /* +* Couldn't get mmap_sem lock, so must setup to update +* mm-locked_vm later. If locked_vm were atomic, we +* wouldn't need this silliness +*/ + vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); + if (!vwork) + return; + mm = get_task_mm(current); + if (!mm) { + kfree(vwork); + return; + } + INIT_WORK(vwork-work, lock_acct_bg); + vwork-mm = mm; + vwork-npage = npage; + schedule_work(vwork-work); +} + +/* + * The container descriptor supports only a single group per container. + * Required by the API as the container is not supplied with the IOMMU group + * at the moment of initialization. 
+ */ +struct tce_container { + struct mutex lock; + struct iommu_table *tbl; +}; + +static void
[PATCH] vfio powerpc: enabled on powernv platform
This patch initializes IOMMU groups based on the IOMMU configuration discovered during the PCI scan on POWERNV (POWER non virtualized) platform. The IOMMU groups are to be used later by VFIO driver (PCI pass through). It also implements an API for mapping/unmapping pages for guest PCI drivers and providing DMA window properties. This API is going to be used later by QEMU-VFIO to handle h_put_tce hypercalls from the KVM guest. Although this driver has been tested only on the POWERNV platform, it should work on any platform which supports TCE tables. To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option and configure VFIO as required. Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 10 ++ arch/powerpc/kernel/iommu.c | 214 ++ arch/powerpc/platforms/powernv/pci.c | 134 + drivers/iommu/Kconfig|8 ++ 4 files changed, 366 insertions(+) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index cbfe678..be3b11b 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -76,6 +76,9 @@ struct iommu_table { struct iommu_pool large_pool; struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ +#ifdef CONFIG_IOMMU_API + struct iommu_group *it_group; +#endif }; struct scatterlist; @@ -147,5 +150,12 @@ static inline void iommu_restore(void) } #endif +extern void iommu_reset_table(struct iommu_table *tbl, bool release); +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry, + unsigned long pages); +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry, + uint64_t tce, enum dma_data_direction direction, + unsigned long pages); + #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index ff5a6ce..123431a 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ 
-44,6 +44,7 @@ #include asm/kdump.h #include asm/fadump.h #include asm/vio.h +#include asm/tce.h #define DBG(...) @@ -856,3 +857,216 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } } + +#ifdef CONFIG_IOMMU_API +/* + * SPAPR TCE API + */ + +/* + * iommu_reset_table is called when it started/stopped being used + */ +void iommu_reset_table(struct iommu_table *tbl, bool release) +{ + /* +* Page at 0 is marked as used in iommu_init_table, +* so here we clear it when called with release=false... +*/ + if (!release (tbl-it_offset == 0)) + clear_bit(0, tbl-it_map); + + iommu_clear_tces(tbl, tbl-it_offset, tbl-it_size); + + memset(tbl-it_map, 0, (tbl-it_size + 7) 3); + + /* +* ... or restore when release=true +*/ + if (release (tbl-it_offset == 0)) + set_bit(0, tbl-it_map); +} +EXPORT_SYMBOL_GPL(iommu_reset_table); + +/* + * Returns the number of used IOMMU pages (4K) within + * the same system page (4K or 64K). + * bitmap_weight is not used as it does not support bigendian maps. + * offset is an IOMMU page number relative to DMA window start. 
+ */ +static int syspage_weight(unsigned long *map, unsigned long offset) +{ + int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE; + + /* Aligns TCE entry number to system page boundary */ + offset = PAGE_MASK IOMMU_PAGE_SHIFT; + + /* Count used 4K pages */ + while (nbits) { + if (test_bit(offset, map)) + ++ret; + --nbits; + ++offset; + } + + return ret; +} + +static void tce_flush(struct iommu_table *tbl) +{ + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + /* Make sure updates are seen by hardware */ + mb(); +} + +/* + * iommu_clear_tces clears tces and returned the number of system pages + * which it called put_page() on + */ +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry, + unsigned long pages) +{ + int i, retpages = 0, clr; + unsigned long oldtce, oldweight; + struct page *page; + + for (i = 0; i pages; ++i) { + if (!test_bit(entry + i - tbl-it_offset, tbl-it_map)) + continue; + + oldtce = ppc_md.tce_get(tbl, entry + i); + ppc_md.tce_free(tbl, entry + i, 1); + + oldweight = syspage_weight(tbl-it_map, + entry + i - tbl-it_offset); + clr = __test_and_clear_bit(entry + i - tbl-it_offset, + tbl-it_map); + +
[PATCH 0/6] powerpc: SMT priority (PPR) save and restore
Ben, This patch-set is created against your tree (next branch) and fixes the build failure that you pointed out. Changes from the previous version: - Changes for PPR save/restore in denorm_exception_hv, data_access_slb_relon_pSeries and instruction_access_slb_relon_pSeries exception vectors (P8) code. - Fix build failure with ppc64e_defconfig - Macro name changes (HMT_MEDIUM_PPR_DISCARD and HMT_MEDIUM_PPR_SAVE) and other fixes as Michael Neuling suggested [PATCH 0/6] powerpc: SMT priority (PPR) save and restore On P7/P8 systems, users can define SMT priority levels 2,3 and 4 for processes so that some can run at higher priority than the other ones. In the current kernel, the default priority is set to 4 which prohibits processes from using higher priority. Also the kernel boosts the priority to 4 during exceptions without saving the user defined priorities when the task enters the kernel. So we will be losing the process PPR value and cannot restore it back when the task exits the kernel. This patch-set implements saving and restoring the user defined PPR value for all tasks. With the null_syscall testcase (http://ozlabs.org/~anton/junkcode/null_syscall.c), this feature takes around an extra 10 CPU cycles on average for 25 samples. 
Haren Myneni (6): powerpc: Move branch instruction from ACCOUNT_CPU_USER_ENTRY to caller powerpc: Enable PPR save/restore powerpc: Increase exceptions arrays in paca struct to save PPR powerpc: Define ppr in thread_struct powerpc: Macros for saving/restore PPR powerpc: Implement PPR save/restore arch/powerpc/include/asm/cputable.h |7 +++- arch/powerpc/include/asm/exception-64s.h | 59 +- arch/powerpc/include/asm/paca.h |6 ++-- arch/powerpc/include/asm/ppc_asm.h | 27 +- arch/powerpc/include/asm/processor.h | 12 ++ arch/powerpc/include/asm/reg.h |1 + arch/powerpc/kernel/asm-offsets.c|1 + arch/powerpc/kernel/entry_64.S |6 +++- arch/powerpc/kernel/exceptions-64e.S |3 +- arch/powerpc/kernel/exceptions-64s.S | 23 +++- arch/powerpc/kernel/process.c|2 + 11 files changed, 119 insertions(+), 28 deletions(-) ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/6] powerpc: Move branch instruction from ACCOUNT_CPU_USER_ENTRY to caller
[PATCH 1/6] powerpc: Move branch instruction from ACCOUNT_CPU_USER_ENTRY to caller The first instruction in ACCOUNT_CPU_USER_ENTRY is 'beq' which checks for exceptions coming from kernel mode. PPR value will be saved immediately after ACCOUNT_CPU_USER_ENTRY and is also for user level exceptions. So moved this branch instruction in the caller code. Signed-off-by: Haren Myneni ha...@us.ibm.com --- arch/powerpc/include/asm/exception-64s.h |3 ++- arch/powerpc/include/asm/ppc_asm.h |2 -- arch/powerpc/kernel/entry_64.S |3 ++- arch/powerpc/kernel/exceptions-64e.S |3 ++- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index ad708dd..697de09 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -224,8 +224,9 @@ do_kvm_##n: \ std r10,0(r1); /* make stack chain pointer */ \ std r0,GPR0(r1);/* save r0 in stackframe*/ \ std r10,GPR1(r1); /* save r1 in stackframe*/ \ + beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r9, r10); \ - std r2,GPR2(r1);/* save r2 in stackframe*/ \ +4: std r2,GPR2(r1);/* save r2 in stackframe*/ \ SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe*/ \ ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index ea2a86e..376e36d 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -30,7 +30,6 @@ #define ACCOUNT_STOLEN_TIME #else #define ACCOUNT_CPU_USER_ENTRY(ra, rb) \ - beq 2f; /* if from kernel mode */ \ MFTB(ra); /* get timebase */ \ ld rb,PACA_STARTTIME_USER(r13);\ std ra,PACA_STARTTIME(r13); \ @@ -38,7 +37,6 @@ ld ra,PACA_USER_TIME(r13); \ add ra,ra,rb; /* add on to user time */ \ std ra,PACA_USER_TIME(r13); \ -2: #define ACCOUNT_CPU_USER_EXIT(ra, rb) \ MFTB(ra); /* get timebase */ \ diff --git 
a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index ad7..4e78247 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -62,8 +62,9 @@ system_call_common: std r12,_MSR(r1) std r0,GPR0(r1) std r10,GPR1(r1) + beq 2f /* if from kernel mode */ ACCOUNT_CPU_USER_ENTRY(r10, r11) - std r2,GPR2(r1) +2: std r2,GPR2(r1) std r3,GPR3(r1) mfcrr2 std r4,GPR4(r1) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 4684e33..ae54553 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -159,8 +159,9 @@ exc_##n##_common: \ std r9,GPR9(r1);/* save r9 in stackframe */ \ std r10,_NIP(r1); /* save SRR0 to stackframe */ \ std r11,_MSR(r1); /* save SRR1 to stackframe */ \ + beq 2f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r10,r11);/* accounting (uses cr0+eq) */ \ - ld r3,excf+EX_R10(r13);/* get back r10 */ \ +2: ld r3,excf+EX_R10(r13);/* get back r10 */ \ ld r4,excf+EX_R11(r13);/* get back r11 */ \ mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 */ \ std r12,GPR12(r1); /* save r12 in stackframe */\ -- 1.7.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/6] powerpc: Enable PPR save/restore
[PATCH 2/6] powerpc: Enable PPR save/restore SMT thread status register (PPR) is used to set thread priority. This patch enables PPR save/restore feature (CPU_FTR_HAS_PPR) on POWER7 and POWER8 systems. Signed-off-by: Haren Myneni ha...@us.ibm.com --- arch/powerpc/include/asm/cputable.h |7 +-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 76f81bd..241d65d 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -171,6 +171,7 @@ extern const char *powerpc_base_platform; #define CPU_FTR_POPCNTD LONG_ASM_CONST(0x0800) #define CPU_FTR_ICSWX LONG_ASM_CONST(0x1000) #define CPU_FTR_VMX_COPY LONG_ASM_CONST(0x2000) +#defineCPU_FTR_HAS_PPR LONG_ASM_CONST(0x4000) #ifndef __ASSEMBLY__ @@ -400,7 +401,8 @@ extern const char *powerpc_base_platform; CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY) + CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | \ + CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR) #define CPU_FTRS_POWER8 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\ CPU_FTR_MMCRA | CPU_FTR_SMT | \ @@ -408,7 +410,8 @@ extern const char *powerpc_base_platform; CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY) + CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | \ + CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ -- 1.7.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/6] powerpc: Increase exceptions arrays in paca struct to save PPR
[PATCH 3/6] powerpc: Increase exceptions arrays in paca struct to save PPR Using paca to save user defined PPR value in the first level exception vector. Signed-off-by: Haren Myneni ha...@us.ibm.com --- arch/powerpc/include/asm/exception-64s.h |1 + arch/powerpc/include/asm/paca.h |6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 697de09..3b24ca9 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -47,6 +47,7 @@ #define EX_R3 64 #define EX_LR 72 #define EX_CFAR80 +#define EX_PPR 88 /* SMT thread status register (priority) */ #ifdef CONFIG_RELOCATABLE #define EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index e9e7a69..c47d687 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -93,9 +93,9 @@ struct paca_struct { * Now, starting in cacheline 2, the exception save areas */ /* used for most interrupts/exceptions */ - u64 exgen[11] __attribute__((aligned(0x80))); - u64 exmc[11]; /* used for machine checks */ - u64 exslb[11]; /* used for SLB/segment table misses + u64 exgen[12] __attribute__((aligned(0x80))); + u64 exmc[12]; /* used for machine checks */ + u64 exslb[12]; /* used for SLB/segment table misses * on the linear mapping */ /* SLB related definitions */ u16 vmalloc_sllp; -- 1.7.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 4/6] powerpc: Define ppr in thread_struct
[PATCH 4/6] powerpc: Define ppr in thread_struct ppr in thread_struct is used to save PPR and restore it before process exits from kernel. This patch sets the default priority to 3 when tasks are created such that users can use 4 for higher priority tasks. Signed-off-by: Haren Myneni ha...@us.ibm.com --- arch/powerpc/include/asm/processor.h | 12 arch/powerpc/kernel/asm-offsets.c|1 + arch/powerpc/kernel/process.c|2 ++ 3 files changed, 15 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 8750204..37f87f0 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -18,6 +18,16 @@ #define TS_FPRWIDTH 1 #endif +#ifdef CONFIG_PPC64 +/* Default SMT priority is set to 3. Use 11- 13bits to save priority. */ +#define PPR_PRIORITY 3 +#ifdef __ASSEMBLY__ +#define INIT_PPR (PPR_PRIORITY 50) +#else +#define INIT_PPR ((u64)PPR_PRIORITY 50) +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PPC64 */ + #ifndef __ASSEMBLY__ #include linux/compiler.h #include linux/cache.h @@ -245,6 +255,7 @@ struct thread_struct { #ifdef CONFIG_PPC64 unsigned long dscr; int dscr_inherit; + unsigned long ppr;/* used to save/restore SMT priority */ #endif }; @@ -278,6 +289,7 @@ struct thread_struct { .fpr = {{0}}, \ .fpscr = { .val = 0, }, \ .fpexc_mode = 0, \ + .ppr = INIT_PPR, \ } #endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 7523539..41f65ec 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -77,6 +77,7 @@ int main(void) DEFINE(NMI_MASK, NMI_MASK); DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr)); DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit)); + DEFINE(TASKTHREADPPR, offsetof(struct task_struct, thread.ppr)); #else DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c 
index ba48233..2563acc 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -814,6 +814,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, p-thread.dscr_inherit = current-thread.dscr_inherit; p-thread.dscr = current-thread.dscr; } + if (cpu_has_feature(CPU_FTR_HAS_PPR)) + p-thread.ppr = INIT_PPR; #endif /* * The PPC64 ABI makes use of a TOC to contain function -- 1.7.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 5/6] powerpc: Macros for saving/restore PPR
[PATCH 5/6] powerpc: Macros for saving/restore PPR Several macros are defined for saving and restore user defined PPR value. Signed-off-by: Haren Myneni ha...@us.ibm.com --- arch/powerpc/include/asm/exception-64s.h | 37 ++ arch/powerpc/include/asm/ppc_asm.h | 25 arch/powerpc/include/asm/reg.h |1 + 3 files changed, 63 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 3b24ca9..090fcd1 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -108,6 +108,43 @@ #define RESTORE_LR(reg, area) #endif +/* + * PPR save/restore macros used in exceptions_64s.S + * Used for P7 or later processors + */ +#define SAVE_PPR(area, ra, rb) \ +BEGIN_FTR_SECTION_NESTED(940) \ + ld ra,PACACURRENT(r13);\ + ld rb,area+EX_PPR(r13);/* Read PPR from paca */\ + std rb,TASKTHREADPPR(ra); \ +END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,940) + +#define RESTORE_PPR_PACA(area, ra) \ +BEGIN_FTR_SECTION_NESTED(941) \ + ld ra,area+EX_PPR(r13);\ + mtspr SPRN_PPR,ra;\ +END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,941) + +/* + * Increase the priority on systems where PPR save/restore is not + * implemented/ supported. + */ +#define HMT_MEDIUM_PPR_DISCARD \ +BEGIN_FTR_SECTION_NESTED(942) \ + HMT_MEDIUM; \ +END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,0,942) /*non P7*/ + +/* + * Save PPR in paca whenever some register is available to use. + * Then increase the priority. 
+ */ +#define HMT_MEDIUM_PPR_SAVE(area, ra) \ +BEGIN_FTR_SECTION_NESTED(943) \ + mfspr ra,SPRN_PPR;\ + std ra,area+EX_PPR(r13);\ + HMT_MEDIUM; \ +END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,943) + #define __EXCEPTION_PROLOG_1(area, extra, vec) \ GET_PACA(r13); \ std r9,area+EX_R9(r13); /* save r9 - r12 */ \ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 376e36d..c2d0e58 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -389,6 +389,31 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) FTR_SECTION_ELSE_NESTED(848); \ mtocrf (FXM), RS; \ ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_NOEXECUTE, 848) + +/* + * PPR restore macros used in entry_64.S + * Used for P7 or later processors + */ +#define HMT_MEDIUM_LOW_HAS_PPR \ +BEGIN_FTR_SECTION_NESTED(944) \ + HMT_MEDIUM_LOW; \ +END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,944) + +#define SET_DEFAULT_THREAD_PPR(ra, rb) \ +BEGIN_FTR_SECTION_NESTED(945) \ + lis ra,INIT_PPR@highest;/* default ppr=3 */ \ + ld rb,PACACURRENT(r13);\ + sldira,ra,32; /* 11- 13 bits are used for ppr */ \ + std ra,TASKTHREADPPR(rb); \ +END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,945) + +#define RESTORE_PPR(ra, rb)\ +BEGIN_FTR_SECTION_NESTED(946) \ + ld ra,PACACURRENT(r13);\ + ld rb,TASKTHREADPPR(ra); \ + mtspr SPRN_PPR,rb;/* Restore PPR */ \ +END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,946) + #endif /* diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1b853f7..d395426 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -289,6 +289,7 @@ #define SPRN_DBAT6U0x23C /* Data BAT 6 Upper Register */ #define SPRN_DBAT7L0x23F /* Data BAT 7 Lower Register */ #define SPRN_DBAT7U0x23E /* Data BAT 7 Upper Register */ +#define SPRN_PPR 0x380 /* SMT Thread status Register */ #define SPRN_DEC 0x016
[PATCH 6/6] powerpc: Implement PPR save/restore
[PATCH 6/6] powerpc: Implement PPR save/restore When the task enters kernel space, the user-defined priority (PPR) will be saved into the PACA at the beginning of the first-level exception vector and then copied from the PACA to thread_info in the second-level vector. PPR will be restored from thread_info before the task exits kernel space. P7/P8 temporarily raises the thread priority to a higher level during an exception until the program executes HMT_* calls. But it will not modify the PPR register. So we save the PPR value whenever some register is available to use and then call HMT_MEDIUM to increase the priority. This feature is supported on P7 or later processors. We save/restore PPR for all exception vectors except system call entry. glibc will save/restore PPR for system calls. So the default PPR value (3) will be set for the system call exit when the task returns to user space. Signed-off-by: Haren Myneni ha...@us.ibm.com --- arch/powerpc/include/asm/exception-64s.h | 18 ++ arch/powerpc/kernel/entry_64.S |3 +++ arch/powerpc/kernel/exceptions-64s.S | 23 +-- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 090fcd1..c235867 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -147,8 +147,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,943) #define __EXCEPTION_PROLOG_1(area, extra, vec) \ GET_PACA(r13); \ - std r9,area+EX_R9(r13); /* save r9 - r12 */ \ - std r10,area+EX_R10(r13); \ + std r9,area+EX_R9(r13); /* save r9 */ \ + HMT_MEDIUM_PPR_SAVE(area, r9); \ + std r10,area+EX_R10(r13); /* save r10 - r12 */\ BEGIN_FTR_SECTION_NESTED(66); \ mfspr r10,SPRN_CFAR; \ std r10,area+EX_CFAR(r13); \ @@ -264,6 +265,7 @@ do_kvm_##n: \ std r10,GPR1(r1); /* save r1 in stackframe*/ \ beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r9, r10); \ + SAVE_PPR(area, r9, r10); \ 4: std r2,GPR2(r1);/* save r2 in stackframe*/ \ 
SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe*/ \ @@ -305,7 +307,7 @@ do_kvm_##n: \ . = loc;\ .globl label##_pSeries; \ label##_pSeries: \ - HMT_MEDIUM; \ + HMT_MEDIUM_PPR_DISCARD; \ SET_SCRATCH0(r13); /* save r13 */ \ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,\ EXC_STD, KVMTEST_PR, vec) @@ -314,7 +316,7 @@ label##_pSeries:\ . = loc;\ .globl label##_hv; \ label##_hv:\ - HMT_MEDIUM; \ + HMT_MEDIUM_PPR_DISCARD; \ SET_SCRATCH0(r13); /* save r13 */ \ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,\ EXC_HV, KVMTEST, vec) @@ -323,7 +325,7 @@ label##_hv: \ . = loc;\ .globl label##_relon_pSeries; \ label##_relon_pSeries: \ - HMT_MEDIUM; \ + HMT_MEDIUM_PPR_DISCARD; \ /* No guest interrupts come through here */ \ SET_SCRATCH0(r13); /* save r13 */ \ EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \ @@ -333,7 +335,7 @@ label##_relon_pSeries: \ . = loc;\ .globl label##_relon_hv;\ label##_relon_hv: \ - HMT_MEDIUM; \ + HMT_MEDIUM_PPR_DISCARD; \ /* No guest interrupts come through here */ \ SET_SCRATCH0(r13); /* save r13