Re: [RFC v9 PATCH 16/21] memory-hotplug: free memmap of sparse-vmemmap

2012-10-04 Thread Yasuaki Ishimatsu

Hi Chen,

Sorry for late reply.

2012/10/02 13:21, Ni zhan Chen wrote:

On 09/05/2012 05:25 PM, we...@cn.fujitsu.com wrote:

From: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com

Not all pages of the virtual mapping in removed memory can be freed, since some
pages used as PGD/PUD include not only the removed memory but also other memory.
So the patch checks whether a page can be freed or not.

How to check whether a page can be freed or not?
  1. When removing memory, the page structs of the removed memory are filled
 with 0xFD.
  2. If all page structs on a PT/PMD page are filled with 0xFD, the PT/PMD can
 be cleared. In this case, the page used as PT/PMD can be freed.

With this patch applied, the two variants of __remove_section() are integrated
into one, so the CONFIG_SPARSEMEM_VMEMMAP version of __remove_section() is deleted.

Note:  vmemmap_kfree() and vmemmap_free_bootmem() are not implemented for ia64,
ppc, s390, and sparc.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Benjamin Herrenschmidt b...@kernel.crashing.org
CC: Paul Mackerras pau...@samba.org
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
---
  arch/ia64/mm/discontig.c  |8 +++
  arch/powerpc/mm/init_64.c |8 +++
  arch/s390/mm/vmem.c   |8 +++
  arch/sparc/mm/init_64.c   |8 +++
  arch/x86/mm/init_64.c |  119 +
  include/linux/mm.h|2 +
  mm/memory_hotplug.c   |   17 +--
  mm/sparse.c   |5 +-
  8 files changed, 158 insertions(+), 17 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 33943db..0d23b69 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -823,6 +823,14 @@ int __meminit vmemmap_populate(struct page *start_page,
  return vmemmap_populate_basepages(start_page, size, node);
  }
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
  void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
  {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 3690c44..835a2b3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -299,6 +299,14 @@ int __meminit vmemmap_populate(struct page *start_page,
  return 0;
  }
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
  void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
  {
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index eda55cd..4b42b0b 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -227,6 +227,14 @@ out:
  return ret;
  }
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
  void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
  {
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index add1cc7..1384826 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2078,6 +2078,14 @@ void __meminit vmemmap_populate_print_last(void)
  }
  }
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
  void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
  {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0075592..4e8f8a4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1138,6 +1138,125 @@ vmemmap_populate(struct page *start_page, unsigned long 
size, int node)
  return 0;
  }
+#define PAGE_INUSE 0xFD
+
+unsigned long find_and_clear_pte_page(unsigned long addr, unsigned long end,
+struct page **pp, int *page_size)
+{
+pgd_t *pgd;
+pud_t *pud;
+pmd_t *pmd;
+pte_t *pte;
+void *page_addr;
+unsigned long next;
+
+*pp = NULL;
+
+pgd = pgd_offset_k(addr);
+if (pgd_none(*pgd))
+return pgd_addr_end(addr, end);
+
+pud = pud_offset(pgd, addr);
+if (pud_none(*pud))
+return pud_addr_end(addr, end);
+
+if (!cpu_has_pse) {
+next = (addr + PAGE_SIZE)  PAGE_MASK;
+pmd = pmd_offset(pud, addr);
+if (pmd_none(*pmd))
+return next;
+
+pte = pte_offset_kernel(pmd, addr);
+if (pte_none(*pte))
+return next;
+
+*page_size = PAGE_SIZE;
+*pp = 

Re: [PATCH] powerpc/iommu: Fix multiple issues with IOMMU pools code

2012-10-04 Thread Alexander Graf
Hi Anton,

On 04.10.2012, at 06:57, Anton Blanchard wrote:

 
 Hi Alex,
 
 Looks to be a preempt issue with the iommu pools code. I did find a
 couple more bugs along the way too.
 
 Anton
 --
 
 There are a number of issues in the recent IOMMU pools code:
 
 - On a preempt kernel we might switch CPUs in the middle of building
  a scatter gather list. When this happens the handle hint passed in
  no longer falls within the local CPU's pool. Check for this and
  fall back to the pool hint.
 
 - We were missing a spin_unlock/spin_lock in one spot where we
  switch pools.
 
 - We need to provide locking around dart_tlb_invalidate_all and
  dart_tlb_invalidate_one now that the global lock is gone.
 
 Reported-by: Alexander Graf ag...@suse.de
 Signed-off-by: Anton Blanchard an...@samba.org
 ---
 
 There is still an issue with the lazy u3 flushing, but I wanted
 to get this out for testing.

Yup. It fixes the nfs problem on my U4 based machine.

Tested-by: Alexander Graf ag...@suse.de

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] PPC: Enable the Watchdog vector for 405

2012-10-04 Thread Benjamin Herrenschmidt
On Sun, 2012-09-30 at 17:27 -0600, Jason Gunthorpe wrote:
 diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
 index 4989661..7edd7b1 100644
 --- a/arch/powerpc/kernel/head_40x.S
 +++ b/arch/powerpc/kernel/head_40x.S
 @@ -431,29 +431,19 @@ label:
  
  /* 0x1000 - Programmable Interval Timer (PIT) Exception */
   START_EXCEPTION(0x1000, Decrementer)
 - NORMAL_EXCEPTION_PROLOG
 - lis r0,TSR_PIS@h
 - mtspr   SPRN_TSR,r0 /* Clear the PIT exception */
 - addir3,r1,STACK_FRAME_OVERHEAD
 - EXC_XFER_LITE(0x1000, timer_interrupt)
 + b pit_longer

Looks like you indeed have no choice but to move it down, though I dislike
the label name :-)

Look at how we do a similar thing in exceptions-64.S, we basically just
don't use START_EXCEPTION at that location. We put a .= and a
branch, and use the real exception name at the target label.

Or just name it pit_exception if you want to keep things simple. I
just don't like pit_longer :-)
 
 -#if 0
  /* NOTE:
 - * FIT and WDT handlers are not implemented yet.
 + * FIT handler is not implemented yet.
   */

Any reason to comment that out ? Better off also branching out of line
to a stub similar to the PIT one that then calls unknown_exception. That
way if it triggers by accident, you'll get a clean trace.

  /* 0x1010 - Fixed Interval Timer (FIT) Exception
  */
 - STND_EXCEPTION(0x1010,  FITException,   unknown_exception)
 +//   STND_EXCEPTION(0x1010,  FITException,   unknown_exception)
  
  /* 0x1020 - Watchdog Timer (WDT) Exception
  */
 -#ifdef CONFIG_BOOKE_WDT
   CRITICAL_EXCEPTION(0x1020, WDTException, WatchdogException)
 -#else
 - CRITICAL_EXCEPTION(0x1020, WDTException, unknown_exception)
 -#endif
 -#endif

Move it out of line too please. When a given vector slot gets crowded,
I prefer moving everything in it out of line to keep things consistent.
 
  /* 0x1100 - Data TLB Miss Exception
   * As the name implies, translation is not in the MMU, so search the
 @@ -738,6 +728,16 @@ label:
   (MSR_KERNEL  ~(MSR_ME|MSR_DE|MSR_CE)), \
   NOCOPY, crit_transfer_to_handler, ret_from_crit_exc)
  
 + /* Programmable Interval Timer (PIT) Exception. The PIT runs into
 +the space reserved for other exceptions, so we branch down
 +to here. */
 +pit_longer:
 + NORMAL_EXCEPTION_PROLOG
 + lis r0,TSR_PIS@h
 + mtspr   SPRN_TSR,r0 /* Clear the PIT exception */
 + addir3,r1,STACK_FRAME_OVERHEAD
 + EXC_XFER_LITE(0x1000, timer_interrupt)
 +
  /*
   * The other Data TLB exceptions bail out to this point
   * if they can't resolve the lightweight TLB fault.
 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
 index ae0843f..0701ec1 100644
 --- a/arch/powerpc/kernel/traps.c
 +++ b/arch/powerpc/kernel/traps.c
 @@ -1514,7 +1514,7 @@ void unrecoverable_exception(struct pt_regs *regs)
   die(Unrecoverable exception, regs, SIGABRT);
  }
  
 -#ifdef CONFIG_BOOKE_WDT
 +#if defined(CONFIG_BOOKE_WDT) | defined(CONFIG_40x)
  /*
   * Default handler for a Watchdog exception,
   * spins until a reboot occurs

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] PPC: Correct the tophys/tovirt macros

2012-10-04 Thread Benjamin Herrenschmidt
On Sun, 2012-09-30 at 17:28 -0600, Jason Gunthorpe wrote:
 asm/page.h discusses the calculation for v2p and p2v, it should be:
  va = pa + KERNELBASE - PHYSICAL_START
 which is the same as:
  va = pa + LOAD_OFFSET
 
 tophys/tovirt were using PAGE_OFFSET, which as page.h says, is almost
 always the same thing.
 
 Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com

It's a bit gross tho in that KERNEL_BASE, PHYSICAL_START and LOAD_OFFSET
are about where the kernel is linked/running, and while the value ends
up happening to also be the p-v offset (and indeed not by accident), it
makes the code less clear and more confusing.

I think the main issue is that we did things wrong when implementing
non-0 based setups. PAGE_OFFSET should have remained what its name
implies which is the offset between p and v.

I don't have the bandwidth to revisit all that, but I really think that
whole area where PAGE_OFFSET doesn't map 0 needs revisiting.

Ben.

 ---
  arch/powerpc/include/asm/ppc_asm.h |4 ++--
  1 files changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/ppc_asm.h 
 b/arch/powerpc/include/asm/ppc_asm.h
 index ea2a86e..44edc3a 100644
 --- a/arch/powerpc/include/asm/ppc_asm.h
 +++ b/arch/powerpc/include/asm/ppc_asm.h
 @@ -461,14 +461,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
  #define fromreal(rd) tovirt(rd,rd)
  
  #define tophys(rd,rs)\
 -0:   addis   rd,rs,-PAGE_OFFSET@h;   \
 +0:   addis   rd,rs,-LOAD_OFFSET@h;   \
   .section .vtop_fixup,aw;\
   .align  1;  \
   .long   0b; \
   .previous
  
  #define tovirt(rd,rs)\
 -0:   addis   rd,rs,PAGE_OFFSET@h;\
 +0:   addis   rd,rs,LOAD_OFFSET@h;\
   .section .ptov_fixup,aw;\
   .align  1;  \
   .long   0b; \


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/3 v2] iommu/fsl: Store iommu domain information pointer in archdata.

2012-10-04 Thread b16395
From: Varun Sethi varun.se...@freescale.com

Add a new field in the device (powerpc) archdata structure for storing iommu 
domain
information pointer. This pointer is stored when the device is attached to a 
particular
domain.

Signed-off-by: Varun Sethi varun.se...@freescale.com
---
 arch/powerpc/include/asm/device.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h 
b/arch/powerpc/include/asm/device.h
index 77e97dd..6dc79fe 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -28,6 +28,10 @@ struct dev_archdata {
void*iommu_table_base;
} dma_data;
 
+   /* IOMMU domain information pointer. This would be set
+* when this device is attached to an iommu_domain.
+*/
+   void*iommu_domain;
 #ifdef CONFIG_SWIOTLB
dma_addr_t  max_direct_dma_addr;
 #endif
-- 
1.7.4.1


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 0/3 v2] iommu/fsl: Freescale PAMU driver and IOMMU API implementation.

2012-10-04 Thread b16395
From: Varun Sethi varun.se...@freescale.com

This patchset provides the Freescale PAMU (Peripheral Access Management Unit) 
driver
and the corresponding IOMMU API implementation. PAMU is the IOMMU present on 
Freescale
QorIQ platforms. PAMU can authorize memory access, remap the memory address, 
and remap 
the I/O transaction type.

This set consists of the following patches:
1. Addition of new field in the device (powerpc) archdata structure for storing 
iommu domain information
   pointer. This pointer is stored when the device is attached to a particular 
iommu domain.
2. Addition of domain attributes required by the PAMU driver IOMMU API.
3. PAMU driver and IOMMU API implementation.

This patch set is based on the next branch of the iommu git tree maintained by 
Joerg.

Varun Sethi (3):
  Store iommu domain information pointer in archdata.
  Add iommu domain attributes required by fsl PAMU driver.
  FSL PAMU driver and IOMMU API implementation.

 arch/powerpc/include/asm/device.h |4 +
 drivers/iommu/Kconfig |7 +
 drivers/iommu/Makefile|1 +
 drivers/iommu/fsl_pamu.c  | 1033 +
 drivers/iommu/fsl_pamu.h  |  377 ++
 drivers/iommu/fsl_pamu_domain.c   |  990 +++
 drivers/iommu/fsl_pamu_domain.h   |   94 
 drivers/iommu/fsl_pamu_proto.h|   49 ++
 include/linux/iommu.h |   30 ++
 9 files changed, 2585 insertions(+), 0 deletions(-)
 create mode 100644 drivers/iommu/fsl_pamu.c
 create mode 100644 drivers/iommu/fsl_pamu.h
 create mode 100644 drivers/iommu/fsl_pamu_domain.c
 create mode 100644 drivers/iommu/fsl_pamu_domain.h
 create mode 100644 drivers/iommu/fsl_pamu_proto.h

-- 
1.7.4.1


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/3 v2] iommu/fsl: Add iommu domain attributes required by fsl PAMU driver.

2012-10-04 Thread b16395
From: Varun Sethi varun.se...@freescale.com

Added the following domain attributes required by FSL PAMU driver:
1. Subwindows field added to the iommu domain geometry attribute.
2. Added new iommu stash attribute, which allows setting of the
   LIODN specific stash id parameter through IOMMU API.
3. Added an attribute for enabling/disabling DMA to a particular
   memory window.

Signed-off-by: Varun Sethi varun.se...@freescale.com
---
 include/linux/iommu.h |   35 +++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index f3b99e1..62e22f0 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -44,6 +44,33 @@ struct iommu_domain_geometry {
dma_addr_t aperture_start; /* First address that can be mapped*/
dma_addr_t aperture_end;   /* Last address that can be mapped */
bool force_aperture;   /* DMA only allowed in mappable range? */
+
+   /* The subwindows field indicates number of DMA subwindows supported
+* by the geometry. Following is the interpretation of
+* values for this field:
+* 0 : This implies that the supported geometry size is 1 MB
+ * with each subwindow size being 4KB. Thus number of subwindows
+* being = 1MB/4KB = 256.
+* 1 : Only one DMA window i.e. no subwindows.
+* value other than 0 or 1 would indicate actual number of subwindows.
+*/
+   u32 subwindows;
+};
+
+/* cache stash targets */
+#define L1_CACHE 1
+#define L2_CACHE 2
+#define L3_CACHE 3
+
+/* This attribute corresponds to IOMMUs capable of generating
+ * a stash transaction. A stash transaction is typically a
+ * hardware initiated prefetch of data from memory to cache.
+ * This attribute allows configuring stashing specific parameters
+ * in the IOMMU hardware.
+ */
+struct iommu_stash_attribute {
+   u32 cpu;/* cpu number */
+   u32 cache;  /* cache to stash to: L1,L2,L3 */
 };
 
 struct iommu_domain {
@@ -60,6 +87,14 @@ struct iommu_domain {
 enum iommu_attr {
DOMAIN_ATTR_MAX,
DOMAIN_ATTR_GEOMETRY,
+   /* Set the IOMMU hardware stashing
+* parameters.
+*/
+   DOMAIN_ATTR_STASH,
+   /* Explicitly enable/disable DMA for a
+ * particular memory window.
+ */
+   DOMAIN_ATTR_ENABLE,
 };
 
 #ifdef CONFIG_IOMMU_API
-- 
1.7.4.1


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


R: Re: PCI device not working

2012-10-04 Thread Davide Viti
Hi,
it turns out that if I define CONFIG_PCI_NOSCAN in u-boot (as per [1]), the 
device behind the second controller is detected by the Linux kernel.

Would 
you suggest any particular patch I should apply to fix this (I'm using kernel 
2.6.34)

thanx alot in advance
Davide

[1] http://permalink.gmane.org/gmane.
linux.ports.ppc.embedded/20140

Messaggio originale
Da: 
zino...@tiscali.it
Data: 28/09/2012 16.48
A: ga...@kernel.crashing.org
Cc: 
linuxppc-dev@lists.ozlabs.org
Ogg: R: Re: PCI device not working

Hi 
Kumar,


It was, can you figure out in u-boot what exact config read on 

the bus would return the correct thing.

The fact that when we probe the 

device at 0001:03 we should get back something like cfg_data=0xabba1b65



here 
follow some details about what is going on inside u-boot; verbosity 
increases 
from [1] to [3]

 [1] PCI printouts when the board come up
 [2] 
output of pci 
[0-3] long u-boot command
 [3] same as [1] but with debug 
print inside 
indirect_read_config_##size() [drivers/pci/pci_indirect.c]

if 
you were curious 
about our u-boot board settings, please refer to:
http:
//www.mail-archive.
com/linuxppc-dev@lists.ozlabs.org/msg62007.html

thanx 
alot,
Davide



*
*[1]*
*
PCIE1 
used as Root Complex (base 
addr ffe09000)
   Scanning PCI bus 01

01  00  1b65  abba  
0280  00
cfg_addr:ffe09000  cfg_data:
ffe09004  indirect_type:0

PCIE1 on bus 00 - 01


PCIE2 used as 
Root Complex (base addr ffe0a000)

   Scanning PCI bus 03

03  00  1b65  abba  0280  00

cfg_addr:ffe0a000  cfg_data:
ffe0a004  indirect_type:0
PCIE2 on bus 02 - 03


*
*
[2]*
*

= pci 0 long
Scanning PCI devices 
on bus 0


Found PCI device 00.00.00:
  vendor ID =   0x1957
  
device 
ID =   0x0100
  command register =0x0006
  

status register = 0x0010
  revision ID = 0x11
  

class code =  0x0b (Processor)
  sub class code 
=  
0x20
  programming interface =   0x00
  cache line 
=  0x08

  latency time =0x00
  header type 
= 0x01
  
BIST =0x00
  base address 
0 =  0xfff0
  
base address 1 =  0x
  
primary bus number =  0x00
  
secondary bus number =0x01
  
subordinate bus number =  0x01
  
secondary latency timer = 0x00
  
IO base = 0x00
  IO 
limit =0x00
  
secondary status =0x
  memory 
base = 0xa000

  memory limit =0xa000
  prefetch 
memory base =
0x1001
  prefetch memory limit =   0x0001
  prefetch 
memory base upper 
=  0x
  prefetch memory limit upper = 0x
  IO 
base upper 16 
bits =   0x
  IO limit upper 16 bits =  0x
  
expansion ROM 
base address =  0x
  interrupt line =  0x00
  
interrupt 
pin =   0x00
  bridge control =  0x

= 
pci 1 
long
Scanning PCI devices on bus 1

Found PCI device 01.00.00:kk
  vendor 

ID =   0x1b65
  device ID =   0xabba
  
command 
register =0x0006
  status register = 0x0010

  revision 
ID = 0x01
  class code =  0x02 
(Network 
controller)
  sub class code =  0x80
  programming 
interface 
=   0x00
  cache line =  0x08
  latency time 

=0x00
  header type = 0x00
  BIST 

=0x00
  base address 0 =  0xa000
  
base 
address 1 =  0xa001
  base address 2 =  
0x

  base address 3 =  0x
  base address 4 
=  
0x
  base address 5 =  0x
  
cardBus CIS pointer 
= 0x
  sub system vendor ID =
0x
  sub system ID 
=   0x
  expansion ROM base address 
=  0x
  interrupt 
line =  0x00
  interrupt pin 
=   0x01
  min Grant 
=   0x00
  max Latency 
= 0x00

= pci 2 long

Scanning PCI devices on bus 2


Found PCI device 02.00.00:
  vendor ID 
=   0x1957
  device 
ID =   0x0100
  command 
register =0x0006
  
status register = 0x0010
  revision 
ID = 0x11
  
class code =  0x0b (Processor)
  
sub class code 
=  0x20
  programming interface =   0x00
  cache 
line 
=  0x08
  latency time =0x00
  header type 

= 0x01
  BIST =0x00
  base address 
0 
=  0xfff0
  base address 1 =  0x
  
primary 
bus number =  0x00
  secondary bus number =0x01
  

Re: PCI device not working

2012-10-04 Thread Kumar Gala

On Oct 4, 2012, at 7:24 AM, Davide Viti wrote:

 Hi,
 it turns out that if define CONFIG_PCI_NOSCAN in u-boot (as per [1]), the 
 devide behind the second controller is detected by the  Linux kernel.
 
 Would 
 you suggest any particular patch I should apply to fix this (I'm using kernel 
 2.6.34)
 
 thanx alot in advance
 Davide
 
 [1] http://permalink.gmane.org/gmane.
 linux.ports.ppc.embedded/20140

My suggestion would be to try and dump all the controller registers between the 
case that works and doesn't and compare.  There's some minor setting difference 
that I'm guessing is causing issues.

- k

 
 Messaggio originale
 Da: 
 zino...@tiscali.it
 Data: 28/09/2012 16.48
 A: ga...@kernel.crashing.org
 Cc: 
 linuxppc-dev@lists.ozlabs.org
 Ogg: R: Re: PCI device not working
 
 Hi 
 Kumar,
 
 
 It was, can you figure out in u-boot what exact config read on 
 
 the bus would return the correct thing.
 
 The fact that when we probe the 
 
 device at 0001:03 we should get back something like cfg_data=0xabba1b65
 
 
 
 here 
 follow some details about what is going on inside u-boot; verbosity 
 increases 
 from [1] to [3]
 
 [1] PCI printouts when the board come up
 [2] 
 output of pci 
 [0-3] long u-boot command
 [3] same as [1] but with debug 
 print inside 
 indirect_read_config_##size() [drivers/pci/pci_indirect.c]
 
 if 
 you were curious 
 about our u-boot board settings, please refer to:
 http:
 //www.mail-archive.
 com/linuxppc-dev@lists.ozlabs.org/msg62007.html
 
 thanx 
 alot,
 Davide
 
 
 
 *
 *[1]*
 *
   PCIE1 
 used as Root Complex (base 
 addr ffe09000)
  Scanning PCI bus 01
 
   01  00  1b65  abba  
 0280  00
   cfg_addr:ffe09000  cfg_data:
 ffe09004  indirect_type:0
 
 PCIE1 on bus 00 - 01
 
 
   PCIE2 used as 
 Root Complex (base addr ffe0a000)
 
  Scanning PCI bus 03
 
   03  00  1b65  abba  0280  00
 
 cfg_addr:ffe0a000  cfg_data:
 ffe0a004  indirect_type:0
   PCIE2 on bus 02 - 03
 
 
 *
 *
 [2]*
 *
 
 = pci 0 long
 Scanning PCI devices 
 on bus 0
 
 
 Found PCI device 00.00.00:
 vendor ID =   0x1957
 
 device 
 ID =   0x0100
 command register =0x0006
 
 
 status register = 0x0010
 revision ID = 0x11
 
 
 class code =  0x0b (Processor)
 sub class code 
 =  
 0x20
 programming interface =   0x00
 cache line 
 =  0x08
 
 latency time =0x00
 header type 
 = 0x01
 
 BIST =0x00
 base address 
 0 =  0xfff0
 
 base address 1 =  0x
 
 primary bus number =  0x00
 
 secondary bus number =0x01
 
 subordinate bus number =  0x01
 
 secondary latency timer = 0x00
 
 IO base = 0x00
 IO 
 limit =0x00
 
 secondary status =0x
 memory 
 base = 0xa000
 
 memory limit =0xa000
 prefetch 
 memory base =
 0x1001
 prefetch memory limit =   0x0001
 prefetch 
 memory base upper 
 =  0x
 prefetch memory limit upper = 0x
 IO 
 base upper 16 
 bits =   0x
 IO limit upper 16 bits =  0x
 
 expansion ROM 
 base address =  0x
 interrupt line =  0x00
 
 interrupt 
 pin =   0x00
 bridge control =  0x
 
 = 
 pci 1 
 long
 Scanning PCI devices on bus 1
 
 Found PCI device 01.00.00:kk
 vendor 
 
 ID =   0x1b65
 device ID =   0xabba
 
 command 
 register =0x0006
 status register = 0x0010
 
 revision 
 ID = 0x01
 class code =  0x02 
 (Network 
 controller)
 sub class code =  0x80
 programming 
 interface 
 =   0x00
 cache line =  0x08
 latency time 
 
 =0x00
 header type = 0x00
 BIST 
 
 =0x00
 base address 0 =  0xa000
 
 base 
 address 1 =  0xa001
 base address 2 =  
 0x
 
 base address 3 =  0x
 base address 4 
 =  
 0x
 base address 5 =  0x
 
 cardBus CIS pointer 
 = 0x
 sub system vendor ID =
 0x
 sub system ID 
 =   0x
 expansion ROM base address 
 =  0x
 interrupt 
 line =  0x00
 interrupt pin 
 =   0x01
 min Grant 
 =   0x00
 max Latency 
 = 0x00
 
 = pci 2 long
 
 Scanning PCI devices on bus 2
 
 
 Found PCI device 02.00.00:
 vendor ID 
 =   0x1957
 device 
 ID =   0x0100
 command 
 register =0x0006
 
 status register = 0x0010
 revision 
 ID = 0x11
 
 class code =  0x0b (Processor)
 
 sub class code 
 =  0x20
 programming 

Re: [PATCH 2/3 v2] iommu/fsl: Add iommu domain attributes required by fsl PAMU driver.

2012-10-04 Thread Kumar Gala

On Oct 4, 2012, at 6:56 AM, b16...@freescale.com b16...@freescale.com wrote:

 From: Varun Sethi varun.se...@freescale.com
 
 Added the following domain attributes required by FSL PAMU driver:
 1. Subwindows field added to the iommu domain geometry attribute.
 2. Added new iommu stash attribute, which allows setting of the
   LIODN specific stash id parameter through IOMMU API.
 3. Added an attribute for enabling/disabling DMA to a particular
   memory window.
 
 Signed-off-by: Varun Sethi varun.se...@freescale.com
 ---
 include/linux/iommu.h |   35 +++
 1 files changed, 35 insertions(+), 0 deletions(-)
 
 diff --git a/include/linux/iommu.h b/include/linux/iommu.h
 index f3b99e1..62e22f0 100644
 --- a/include/linux/iommu.h
 +++ b/include/linux/iommu.h
 @@ -44,6 +44,33 @@ struct iommu_domain_geometry {
   dma_addr_t aperture_start; /* First address that can be mapped*/
   dma_addr_t aperture_end;   /* Last address that can be mapped */
   bool force_aperture;   /* DMA only allowed in mappable range? */
 +
 + /* The subwindows field indicates number of DMA subwindows supported
 +  * by the geometry. Following is the interpretation of
 +  * values for this field:
 +  * 0 : This implies that the supported geometry size is 1 MB
 + * with each subwindow size being 4KB. Thus number of subwindows
 +  * being = 1MB/4KB = 256.
 +  * 1 : Only one DMA window i.e. no subwindows.
 +  * value other than 0 or 1 would indicate actual number of subwindows.
 +  */
 + u32 subwindows;
 +};
 +
 +/* cache stash targets */
 +#define L1_CACHE 1
 +#define L2_CACHE 2
 +#define L3_CACHE 3

These names are way too generic for being exposed to user space

 +
 +/* This attribute corresponds to IOMMUs capable of generating
 + * a stash transaction. A stash transaction is typically a
 + * hardware initiated prefetch of data from memory to cache.
 + * This attribute allows configuring stashig specific parameters
 + * in the IOMMU hardware.
 + */
 +struct iommu_stash_attribute {
 + u32 cpu;/* cpu number */
 + u32 cache;  /* cache to stash to: L1,L2,L3 */
 };
 
 struct iommu_domain {
 @@ -60,6 +87,14 @@ struct iommu_domain {
 enum iommu_attr {
   DOMAIN_ATTR_MAX,
   DOMAIN_ATTR_GEOMETRY,
 + /* Set the IOMMU hardware stashing
 +  * parameters.
 +  */
 + DOMAIN_ATTR_STASH,
 + /* Explicity enable/disable DMA for a
 + * particular memory window.
 + */
 + DOMAIN_ATTR_ENABLE,
 };
 
 #ifdef CONFIG_IOMMU_API
 -- 
 1.7.4.1
 
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-kernel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/3 v2] iommu/fsl: Freescale PAMU driver and IOMMU API implementation.

2012-10-04 Thread Kumar Gala

On Oct 4, 2012, at 6:56 AM, b16...@freescale.com b16...@freescale.com wrote:

 From: Varun Sethi varun.se...@freescale.com
 
 Following is a brief description of the PAMU hardware:
 PAMU determines what action to take and whether to authorize the action on 
 the basis
 of the memory address, a Logical IO Device Number (LIODN), and PAACT table 
 (logically)
 indexed by LIODN and address. Hardware devices which need to access memory 
 must provide
 an LIODN in addition to the memory address.
 
 Peripheral Access Authorization and Control Tables (PAACTs) are the primary 
 data structures
 used by PAMU. A PAACT is a table of peripheral access authorization and 
 control entries (PAACE).
 Each PAACE defines the range of I/O bus address space that is accessible by 
 the LIOD and the
 associated access capabilities.
 
 There are two types of PAACTs: primary PAACT (PPAACT) and secondary PAACT 
 (SPAACT). A given physical
 I/O device may be able to act as one or more independent logical I/O devices 
 (LIODs). Each such
 logical I/O device is assigned an identifier called logical I/O device number 
 (LIODN). A LIOD is
 allocated a contiguous portion of the I/O bus address space called the DSA 
 window for performing
 DSA operations. The DSA window may optionally be divided into multiple 
 sub-windows, each of which
 may be used to map to a region in system storage space. The first sub-window 
 is referred to
 as the primary sub-window and the remaining are called secondary sub-windows.
 
 This patch provides the PAMU driver (fsl_pamu.c) and the corresponding IOMMU 
 API implementation
 (fsl_pamu_domain.c). The PAMU hardware driver (fsl_pamu.c) has been derived 
 from the work done
 by Ashish Kalra and Timur Tabi (ti...@freescale.com).
 
 Signed-off-by: Varun Sethi varun.se...@freescale.com
 ---

I'm not seeing any of the comments I made addressed.

What changed in this version?

- k
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [git pull] Please pull powerpc.git merge branch

2012-10-04 Thread Linus Torvalds
On Thu, Oct 4, 2012 at 1:38 AM, Benjamin Herrenschmidt
b...@kernel.crashing.org wrote:

 First, however, a note about the pull request details ... the diffstat looks
 completely on crack, any idea what's up ? It sees piles of files modified
 in various other archs & generic code but I see no patch in that branch
 that touches any of them.

So this happens if you have reverse merges (ie you've pulled my
tree, or some other tree I've pulled), and there is no longer a single
clear common point that you started from. In that case, there is no
simple diff for the "what has changed since that original point", and
to get the diff for the merge you actually have to do the merge and
check the end result. git-request-pull doesn't do that, it just
assumes it's the simple case of some single common point.

The fact that you haven't seen it until now just means that you've
generally done a good job at keeping your powerpc tree clean from
other trees, and containing only your own work.

   Linus
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [git pull] Please pull powerpc.git merge branch

2012-10-04 Thread Linus Torvalds
On Thu, Oct 4, 2012 at 1:38 AM, Benjamin Herrenschmidt
b...@kernel.crashing.org wrote:

 The following changes since commit 271fd03a3013b106ccc178d54219c1be0c9759b7:

   powerpc/powernv: I/O and memory alignment for P2P bridges (2012-09-11 
 16:59:47 -0600)

 are available in the git repository at:

   git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git

Hmm. There's nothing there.

Did you mean for me to pull some branch/tag you didn't mention?

  Linus
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH 2/3 v2] iommu/fsl: Add iommu domain attributes required by fsl PAMU driver.

2012-10-04 Thread Sethi Varun-B16395


 -Original Message-
 From: Kumar Gala [mailto:ga...@kernel.crashing.org]
 Sent: Thursday, October 04, 2012 6:47 PM
 To: Sethi Varun-B16395
 Cc: joerg.roe...@amd.com; io...@lists.linux-foundation.org; linuxppc-
 d...@lists.ozlabs.org; linux-ker...@vger.kernel.org; Sethi Varun-B16395
 Subject: Re: [PATCH 2/3 v2] iommu/fsl: Add iommu domain attributes
 required by fsl PAMU driver.
 
 
 On Oct 4, 2012, at 6:56 AM, b16...@freescale.com b16...@freescale.com
 wrote:
 
  From: Varun Sethi varun.se...@freescale.com
 
  Added the following domain attributes required by FSL PAMU driver:
  1. Subwindows field added to the iommu domain geometry attribute.
  2. Added new iommu stash attribute, which allows setting of the
LIODN specific stash id parameter through IOMMU API.
  3. Added an attribute for enabling/disabling DMA to a particular
memory window.
 
  Signed-off-by: Varun Sethi varun.se...@freescale.com
  ---
  include/linux/iommu.h |   35 +++
  1 files changed, 35 insertions(+), 0 deletions(-)
 
  diff --git a/include/linux/iommu.h b/include/linux/iommu.h index
  f3b99e1..62e22f0 100644
  --- a/include/linux/iommu.h
  +++ b/include/linux/iommu.h
  @@ -44,6 +44,33 @@ struct iommu_domain_geometry {
  dma_addr_t aperture_start; /* First address that can be mapped
 */
  dma_addr_t aperture_end;   /* Last address that can be mapped
 */
  bool force_aperture;   /* DMA only allowed in mappable range?
 */
  +
  +   /* The subwindows field indicates number of DMA subwindows
 supported
  +* by the geometry. Following is the interpretation of
  +* values for this field:
  +* 0 : This implies that the supported geometry size is 1 MB
  + * with each subwindow size being 4KB. Thus number of
 subwindows
  +* being = 1MB/4KB = 256.
  +* 1 : Only one DMA window i.e. no subwindows.
  +* value other than 0 or 1 would indicate actual number of
 subwindows.
  +*/
  +   u32 subwindows;
  +};
  +
  +/* cache stash targets */
  +#define L1_CACHE 1
  +#define L2_CACHE 2
  +#define L3_CACHE 3
 
 These names are way too generic for being exposed to user space
Will fix naming to IOMMU_ATTR_CACHE_L1 etc.

-Varun


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [RFC][PATCH 1/3] iommu/fsl: Store iommu domain information pointer in archdata.

2012-10-04 Thread Sethi Varun-B16395


 -Original Message-
 From: Kumar Gala [mailto:ga...@kernel.crashing.org]
 Sent: Wednesday, September 19, 2012 7:20 PM
 To: Sethi Varun-B16395
 Cc: io...@lists.linux-foundation.org; joerg.roe...@amd.com; linux-
 ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Sethi Varun-B16395
 Subject: Re: [RFC][PATCH 1/3] iommu/fsl: Store iommu domain information
 pointer in archdata.
 
 
 On Sep 19, 2012, at 8:17 AM, b16...@freescale.com
 b16...@freescale.com wrote:
 
  From: Varun Sethi varun.se...@freescale.com
 
  Add a new field in the device (powerpc) archdata structure for storing
  iommu domain information pointer. This pointer is stored when the
  device is attached to a particular domain.
 
  Signed-off-by: Varun Sethi varun.se...@freescale.com
  ---
  arch/powerpc/include/asm/device.h |4 
  1 files changed, 4 insertions(+), 0 deletions(-)
 
 Not too familiar, but what does the IBM Server IOMMU do for iommu_domain?

[Sethi Varun-B16395] I am not sure if the IBM iommu driver implements the iommu
API.

-Varun
 



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] PPC: Correct the tophys/tovirt macros

2012-10-04 Thread Jason Gunthorpe
On Thu, Oct 04, 2012 at 09:19:04PM +1000, Benjamin Herrenschmidt wrote:
 On Sun, 2012-09-30 at 17:28 -0600, Jason Gunthorpe wrote:
  asm/page.h discusses the calculation for v2p and p2v, it should be:
   va = pa + KERNELBASE - PHYSICAL_START
  which is the same as:
   va = pa + LOAD_OFFSET
  
  tophys/tovirt were using PAGE_OFFSET, which as page.h says, is almost
  always the same thing.
  
  Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
 
 It's a bit gross tho in that KERNEL_BASE, PHYSICAL_START and LOAD_OFFSET
 are about where the kernel is linked/running, and while the value ends
 up happening to also be the p-v offset (and indeed not by accident), it
 makes the code less clear and more confusing.

Yes, I found the three names confusing.. However LOAD_OFFSET is the
name that include/asm-generic/vmlinux.lds.h and many arch's use for
P2V translation in the linker script, so at least there is some
precedent.

 I don't have the bandwidth to revisit all that, but I really think that
 whole area where PAGE_OFFSET doesn't map 0 needs revisiting.

Something like that is also outside my scope.. This patch does fix a
bug I hit when CONFIG_PHYSICAL_START is used, are there problems with
it in other cases?

FWIW, this is part of a larger work that makes CONFIG_PHYSICAL_START
work on PPC404, which is not yet clean enough to post:
https://github.com/jgunthorpe/linux/commit/90df2d247c9db5d39c32dbbada0815f15d7b3be7

Regards,
Jason
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH 1/5] rapidio: fix blocking wait for discovery ready

2012-10-04 Thread Bounine, Alexandre
On Wed, October 03, 2012 6:20 PM
Andrew Morton a...@linux-foundation.org wrote:
 
 On Wed,  3 Oct 2012 15:18:39 -0400
 Alexandre Bounine alexandre.boun...@idt.com wrote:
 
  Fix blocking wait loop in the RapidIO discovery routine to avoid
  warning dumps about stalled CPU on x86 platforms.
 
  ...
 
  +   to_end = jiffies + CONFIG_RAPIDIO_DISC_TIMEOUT * HZ;
  +   while (time_before(jiffies, to_end)) {
  +   if (rio_enum_complete(mport))
  +   goto enum_done;
  +
   schedule_timeout_uninterruptible(msecs_to_jiffies(10));
 
 I think a simple msleep(10) would suffice here?
 
Agree, same thing but looks simpler. Will update.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH v2] PPC: Do not make the entire heap executable

2012-10-04 Thread Jason Gunthorpe
On PPC the ELF PLT sections look like this:

  [17] .sbss NOBITS  0002aff8 01aff8 14 00  WA  0   0  4
  [18] .plt  NOBITS  0002b00c 01aff8 84 00 WAX  0   0  4
  [19] .bss  NOBITS  0002b090 01aff8 a4 00  WA  0   0  4

Which results in an ELF load header:
  Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD   0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1

This is all correct, the load region containing the PLT is marked as
executable. Note that the PLT starts at 0002b00c but the file mapping ends at
0002aff8, so the PLT falls in the 0 fill section described by the load header,
and after a page boundary.

Unfortunately the generic ELF loader ignores the X bit in the load headers
when it creates the 0 filled non-file backed mappings. It assumes all of these
mappings are RW BSS sections, which is not the case for PPC.

Teach the ELF loader to check the X bit in the relevant load header and
create 0 filled anonymous mappings that are executable if the load header
requests that.

Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
---
 arch/powerpc/include/asm/page.h|   10 +
 arch/powerpc/include/asm/page_32.h |2 -
 arch/powerpc/include/asm/page_64.h |4 ---
 fs/binfmt_elf.c|   41 +---
 4 files changed, 34 insertions(+), 23 deletions(-)

Some more testing found a bug, updated patch in case anyone wants
to try it.

Changes in v2:
 - In load_elf_interp last_bss can become less than elf_bss when vm_brk is
   called. In the unpatched kernel this results in something like
   0xFF00 being passed into vm_brk as the len, which rounds up to
   0 and does nothing, but vm_mmap returns an error.. glibc 2.19
   triggered this case due to the ld.so layout, eglibc 2.13 didn't.

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index f072e97..61e46fc 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -215,15 +215,7 @@ extern long long virt_phys_offset;
 #define __pa(x) ((unsigned long)(x) - PAGE_OFFSET + MEMORY_START)
 #endif
 
-/*
- * Unfortunately the PLT is in the BSS in the PPC32 ELF ABI,
- * and needs to be executable.  This means the whole heap ends
- * up being executable.
- */
-#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \
-VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-#define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \
+#define VM_DATA_DEFAULT_FLAGS  (VM_READ | VM_WRITE | \
 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
 #ifdef __powerpc64__
diff --git a/arch/powerpc/include/asm/page_32.h 
b/arch/powerpc/include/asm/page_32.h
index 68d73b2..aaae5a6 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -7,8 +7,6 @@
 #endif
 #endif
 
-#define VM_DATA_DEFAULT_FLAGS  VM_DATA_DEFAULT_FLAGS32
-
 #ifdef CONFIG_NOT_COHERENT_CACHE
 #define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
 #endif
diff --git a/arch/powerpc/include/asm/page_64.h 
b/arch/powerpc/include/asm/page_64.h
index fed85e6..615d88b 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -136,10 +136,6 @@ do {   \
 
 #endif /* !CONFIG_HUGETLB_PAGE */
 
-#define VM_DATA_DEFAULT_FLAGS \
-   (is_32bit_task() ? \
-VM_DATA_DEFAULT_FLAGS32 : VM_DATA_DEFAULT_FLAGS64)
-
 /*
  * This is the default if a program doesn't have a PT_GNU_STACK
  * program header entry. The PPC64 ELF ABI has a non executable stack
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1b52956..c26b40d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -76,13 +76,20 @@ static struct linux_binfmt elf_format = {
 
 #define BAD_ADDR(x) ((unsigned long)(x) = TASK_SIZE)
 
-static int set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end, int prot)
 {
start = ELF_PAGEALIGN(start);
end = ELF_PAGEALIGN(end);
if (end  start) {
unsigned long addr;
-   addr = vm_brk(start, end - start);
+   /* Map the non-file portion of the last load header. If the
+  header is requesting these pages to be executeable then
+  we have to honour that, otherwise assume they are bss. */
+   if (prot  PROT_EXEC)
+   addr = vm_mmap(0, start, end - start, prot,
+   MAP_PRIVATE | MAP_FIXED, 0);
+   else
+   addr = vm_brk(start, end - start);
if (BAD_ADDR(addr))
return addr;
}
@@ -381,6 +388,7 @@ static unsigned long load_elf_interp(struct elfhdr 
*interp_elf_ex,
unsigned long load_addr = 0;
int load_addr_set = 0;
unsigned long last_bss = 0, elf_bss = 0;

RE: [PATCH 3/5] rapidio: run discovery as an asynchronous process

2012-10-04 Thread Bounine, Alexandre
On Wed, October 03, 2012 6:30 PM
Andrew Morton a...@linux-foundation.org wrote:
 
 On Wed,  3 Oct 2012 15:18:41 -0400
 Alexandre Bounine alexandre.boun...@idt.com wrote:
 
  ...
 
  +static void __devinit disc_work_handler(struct work_struct *_work)
  +{
  +   struct rio_disc_work *work = container_of(_work,
  + struct rio_disc_work, work);
 
 There's a nice simple way to avoid such ugliness:
 
 --- a/drivers/rapidio/rio.c~rapidio-run-discovery-as-an-asynchronous-
 process-fix
 +++ a/drivers/rapidio/rio.c
 @@ -1269,9 +1269,9 @@ struct rio_disc_work {
 
  static void __devinit disc_work_handler(struct work_struct *_work)
  {
 - struct rio_disc_work *work = container_of(_work,
 -   struct rio_disc_work, work);
 + struct rio_disc_work *work;
 
 + work = container_of(_work, struct rio_disc_work, work);
   pr_debug(RIO: discovery work for mport %d %s\n,
work-mport-id, work-mport-name);
   rio_disc_mport(work-mport);
 _
 

Thank you for the fix. Will avoid that ugliness in the future.

  +   pr_debug(RIO: discovery work for mport %d %s\n,
  +work-mport-id, work-mport-name);
  +   rio_disc_mport(work-mport);
  +
  +   kfree(work);
  +}
  +
   int __devinit rio_init_mports(void)
   {
  struct rio_mport *port;
  +   struct rio_disc_work *work;
  +   int no_disc = 0;
 
  list_for_each_entry(port, rio_mports, node) {
  if (port-host_deviceid = 0)
  rio_enum_mport(port);
  -   else
  -   rio_disc_mport(port);
  +   else if (!no_disc) {
  +   if (!rio_wq) {
  +   rio_wq = alloc_workqueue(riodisc, 0, 0);
  +   if (!rio_wq) {
  +   pr_err(RIO: unable allocate rio_wq\n);
  +   no_disc = 1;
  +   continue;
  +   }
  +   }
  +
  +   work = kzalloc(sizeof *work, GFP_KERNEL);
  +   if (!work) {
  +   pr_err(RIO: no memory for work struct\n);
  +   no_disc = 1;
  +   continue;
  +   }
  +
  +   work-mport = port;
  +   INIT_WORK(work-work, disc_work_handler);
  +   queue_work(rio_wq, work-work);
  +   }
  +   }
 
 I'm having a lot of trouble with `no_disc'.  afacit what it does is to
 cease running async discovery for any remaining devices if the
 workqueue
 allocation failed (vaguely reasonable) or if the allocation of a single
 work item failed (incomprehensible).
 
 But if we don't run discovery, the subsystem is permanently busted for
 at least some devices, isn't it?

This is correct. We are considering ways to restart discovery
process later but it is not applicable now.

 
 And this code is basically untestable unless the programmer does
 deliberate fault injection, which makes it pretty much unmaintainable.
 
 So...  if I haven't totally misunderstood, I suggest a rethink is in
 order?


I will review and simplify. Probably, just try to allocate all required
resources ahead of port list scan. Simple and safe.
 
  +   if (rio_wq) {
  +   pr_debug(RIO: flush discovery workqueue\n);
  +   flush_workqueue(rio_wq);
  +   pr_debug(RIO: flush discovery workqueue finished\n);
  +   destroy_workqueue(rio_wq);
  }
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[GIT PULL] Disintegrate UAPI for powerpc

2012-10-04 Thread David Howells
Can you merge the following branch into the powerpc tree please.

This is to complete part of the UAPI disintegration for which the preparatory
patches were pulled recently.

Note that there are some fixup patches which are at the base of the branch
aimed at you, plus all arches get the asm-generic branch merged in too.

The following changes since commit 612a9aab56a93533e76e3ad91642db7033e03b69:

  Merge branch 'drm-next' of git://people.freedesktop.org/~airlied/linux 
(2012-10-03 23:29:23 -0700)

are available in the git repository at:


  git://git.infradead.org/users/dhowells/linux-headers.git disintegrate-powerpc

for you to fetch changes up to d4b1059feb6486ae0800e936b9dd5fd4e05b9d0c:

  UAPI: (Scripted) Disintegrate arch/powerpc/include/asm (2012-10-04 18:21:17 
+0100)


David Howells (6):
  UAPI: Fix the guards on various asm/unistd.h files
  UAPI: Split compound conditionals containing __KERNEL__ in Arm64
  Merge remote-tracking branch 'c6x/for-linux-next' into uapi-prep
  UAPI: Fix conditional header installation handling (notably kvm_para.h on 
m68k)
  UAPI: (Scripted) Disintegrate include/asm-generic
  UAPI: (Scripted) Disintegrate arch/powerpc/include/asm

Mark Salter (2):
  c6x: make dsk6455 the default config
  c6x: remove c6x signal.h

 arch/arm64/include/asm/hwcap.h|   4 +-
 arch/arm64/include/asm/stat.h |   4 +-
 arch/arm64/include/asm/unistd.h   |   8 +-
 arch/arm64/include/asm/unistd32.h |   4 -
 arch/c6x/Makefile |   2 +
 arch/c6x/include/asm/Kbuild   |   1 +
 arch/c6x/include/asm/signal.h |  17 -
 arch/c6x/include/asm/unistd.h |   4 -
 arch/hexagon/include/asm/unistd.h |   5 -
 arch/openrisc/include/asm/unistd.h|   5 -
 arch/powerpc/include/asm/Kbuild   |  35 -
 arch/powerpc/include/asm/bootx.h  | 123 +--
 arch/powerpc/include/asm/cputable.h   |  35 +-
 arch/powerpc/include/asm/elf.h| 311 +---
 arch/powerpc/include/asm/kvm_para.h   |  70 +-
 arch/powerpc/include/asm/mman.h   |  27 +-
 arch/powerpc/include/asm/nvram.h  |  55 +-
 arch/powerpc/include/asm/ptrace.h | 242 +-
 arch/powerpc/include/asm/signal.h | 143 +---
 arch/powerpc/include/asm/spu_info.h   |  29 +-
 arch/powerpc/include/asm/swab.h   |  15 +-
 arch/powerpc/include/asm/termios.h|  69 +-
 arch/powerpc/include/asm/types.h  |  30 +-
 arch/powerpc/include/asm/unistd.h | 374 +
 arch/powerpc/include/uapi/asm/Kbuild  |  41 +
 arch/powerpc/include/{ = uapi}/asm/auxvec.h  |   0
 arch/powerpc/include/{ = uapi}/asm/bitsperlong.h |   0
 arch/powerpc/include/uapi/asm/bootx.h | 132 
 arch/powerpc/include/{ = uapi}/asm/byteorder.h   |   0
 arch/powerpc/include/uapi/asm/cputable.h  |  36 +
 arch/powerpc/include/uapi/asm/elf.h   | 307 
 arch/powerpc/include/{ = uapi}/asm/errno.h   |   0
 arch/powerpc/include/{ = uapi}/asm/fcntl.h   |   0
 arch/powerpc/include/{ = uapi}/asm/ioctl.h   |   0
 arch/powerpc/include/{ = uapi}/asm/ioctls.h  |   0
 arch/powerpc/include/{ = uapi}/asm/ipcbuf.h  |   0
 arch/powerpc/include/{ = uapi}/asm/kvm.h |   0
 arch/powerpc/include/uapi/asm/kvm_para.h  |  90 +++
 arch/powerpc/include/{ = uapi}/asm/linkage.h |   0
 arch/powerpc/include/uapi/asm/mman.h  |  31 +
 arch/powerpc/include/{ = uapi}/asm/msgbuf.h  |   0
 arch/powerpc/include/uapi/asm/nvram.h |  62 ++
 arch/powerpc/include/{ = uapi}/asm/param.h   |   0
 arch/powerpc/include/{ = uapi}/asm/poll.h|   0
 arch/powerpc/include/{ = uapi}/asm/posix_types.h |   0
 arch/powerpc/include/{ = uapi}/asm/ps3fb.h   |   0
 arch/powerpc/include/uapi/asm/ptrace.h| 259 ++
 arch/powerpc/include/{ = uapi}/asm/resource.h|   0
 arch/powerpc/include/{ = uapi}/asm/seccomp.h |   0
 arch/powerpc/include/{ = uapi}/asm/sembuf.h  |   0
 arch/powerpc/include/{ = uapi}/asm/setup.h   |   0
 arch/powerpc/include/{ = uapi}/asm/shmbuf.h  |   0
 arch/powerpc/include/{ = uapi}/asm/sigcontext.h  |   0
 arch/powerpc/include/{ = uapi}/asm/siginfo.h |   0
 arch/powerpc/include/uapi/asm/signal.h| 145 
 arch/powerpc/include/{ = uapi}/asm/socket.h  |   0
 arch/powerpc/include/{ = uapi}/asm/sockios.h |   0
 arch/powerpc/include/uapi/asm/spu_info.h  |  53 ++
 arch/powerpc/include/{ = uapi}/asm/stat.h|   0
 arch/powerpc/include/{ = uapi}/asm/statfs.h  |   0
 arch/powerpc/include/uapi/asm/swab.h  |  23 +
 arch/powerpc/include/{ = uapi}/asm/termbits.h| 

Re: [git pull] Please pull powerpc.git merge branch

2012-10-04 Thread Benjamin Herrenschmidt
On Thu, 2012-10-04 at 09:00 -0700, Linus Torvalds wrote:
 So this happens if you have reverse merges (ie you've pulled my
 tree, or some other tree I've pulled), and there is no longer a single
 clear common point that you started from. In that case, there is no
 simple diff for the what has changed since that original point, and
 to get the diff for the merge you actually have to do the merge and
 check the end result. git-request-pull doesn't do that, it just
 assumes it's the simple case of some single common point.

That was more/less my guess. Thanks.

 The fact that you haven't seen it until now just means that you've
 generally done a good job at keeping your powerpc tree clean from
 other trees, and containing only your own work. 

Heh, yeah I try to :-) In this specific case, this was a topic branch
from Bjorn with some pre-requisite patches to the generic PCI code that
Gavin did to allow us to get rid of some custom resource allocation
crap, so the getting rid of had a dependency on that topic branch.

Unfortunately, Bjorn had also based that branch on top of some larger
cleanup patch so I ended up pulling that as well from him, which tripped
the whole thing.

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [git pull] Please pull powerpc.git merge branch

2012-10-04 Thread Benjamin Herrenschmidt
On Thu, 2012-10-04 at 09:02 -0700, Linus Torvalds wrote:
 Hmm. There's nothing there.
 
 Did you mean for me to pull some branch/tag you didn't mention?

Heh, yeah, the mirrors hadn't updated yet so git request-pull didn't put
the branch name in. It's common but I usually fix it up by hand. Since I
was probably too tired for thinking straight, I forgot that and forgot
to put the proper branch name in the email subject as well... argh :-)

It's in my next branch. Mirrors should be good by now, it's at:

  git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git next

Sorry for the mishap.

Cheers,
Ben.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH 5/5] rapidio: add destination ID allocation mechanism

2012-10-04 Thread Bounine, Alexandre
On Wed, October 03, 2012 6:36 PM
Andrew Morton a...@linux-foundation.org wrote:
 
 On Wed,  3 Oct 2012 15:18:43 -0400
 Alexandre Bounine alexandre.boun...@idt.com wrote:
 
  ...
 
  +static u16 rio_destid_alloc(struct rio_net *net)
  +{
  +   int destid;
  +   struct rio_id_table *idtab = net-destid_table;
  +
  +   spin_lock(idtab-lock);
  +   destid = find_next_zero_bit(idtab-table, idtab-max, idtab-
 next);
  +   if (destid = idtab-max)
  +   destid = find_first_zero_bit(idtab-table, idtab-max);
  +
  +   if (destid  idtab-max) {
  +   idtab-next = destid + 1;
  +   if (idtab-next = idtab-max)
  +   idtab-next = 0;
  +   set_bit(destid, idtab-table);
  +   destid += idtab-start;
  +   } else
  +   destid = RIO_INVALID_DESTID;
  +
  +   spin_unlock(idtab-lock);
  +   return (u16)destid;
  +}
 
 This is round-robin rather than the simpler first-fit, and this reader
 doesn't know why.  Suggest the addition of a code comment explaining
 this decision.

This is to make debugging easier. Having fresh new destID assigned to
a device after insertion helps to analyze switch routing table updates.
Yes, find-first is sufficient and better understandable (I had it in
early version).
I will switch to find-first scenario to make things clear.   

 
  +/*
  + * rio_destid_reserve - Reserve the specivied destID
  + * net: RIO network
  + * destid: destID to reserve
  + *
  + * Tries to reserve the specified destID.
  + * Returns 0 if successfull.
  + */
  +static int rio_destid_reserve(struct rio_net *net, u16 destid)
  +{
  +   int oldbit;
  +   struct rio_id_table *idtab = net-destid_table;
  +
  +   destid -= idtab-start;
  +   spin_lock(idtab-lock);
  +   oldbit = test_and_set_bit(destid, idtab-table);
  +   spin_unlock(idtab-lock);
  +   return oldbit;
  +}
  +
  +/*
  + * rio_destid_free - free a previously allocated destID
  + * net: RIO network
  + * destid: destID to free
  + *
  + * Makes the specified destID available for use.
  + */
  +static void rio_destid_free(struct rio_net *net, u16 destid)
  +{
  +   struct rio_id_table *idtab = net-destid_table;
  +
  +   destid -= idtab-start;
  +   spin_lock(idtab-lock);
  +   clear_bit(destid, idtab-table);
  +   spin_unlock(idtab-lock);
  +}
  +
  +/*
  + * rio_destid_first - return first destID in use
  + * net: RIO network
  + */
  +static u16 rio_destid_first(struct rio_net *net)
  +{
  +   int destid;
  +   struct rio_id_table *idtab = net-destid_table;
  +
  +   spin_lock(idtab-lock);
  +   destid = find_first_bit(idtab-table, idtab-max);
  +   if (destid = idtab-max)
  +   destid = RIO_INVALID_DESTID;
  +   else
  +   destid += idtab-start;
  +   spin_unlock(idtab-lock);
  +   return (u16)destid;
  +}
  +
  +/*
  + * rio_destid_next - return next destID in use
  + * net: RIO network
  + * from: destination ID from which search shall continue
  + */
 
 All these code comments look like kerneldoc, but they aren't.
 kerneldoc
 uses /** and identifiers have a leading `@'.  And that's OK - one
 doesn't *have* to use kerneldoc.  But a lot of
 drivers/rapidio/rio-scan.c is already using kerneldoc so the
 inconsistency is odd.

Idea here was that keeping static functions out of kerneldoc may
make sense and result in cleaner doc output. This was my first attempt
to take that path. Probably, kerneldoc adjustment patch for entire
file (or even all RapidIO files) would be more appropriate instead of
changing style half-way.
As you noticed, these comments are similar to kerneldoc - easy to get back
to old style. I will restore kerneldoc style for affected functions. 

 
 
  ...
 
  -static struct rio_net __devinit *rio_alloc_net(struct rio_mport
 *port)
  +static struct rio_net __devinit *rio_alloc_net(struct rio_mport
 *port,
  +  int do_enum, u16 start)
   {
  struct rio_net *net;
 
  net = kzalloc(sizeof(struct rio_net), GFP_KERNEL);
  +   if (net  do_enum) {
  +   net-destid_table.table = kzalloc(
  +   BITS_TO_LONGS(RIO_MAX_ROUTE_ENTRIES(port-sys_size))
 *
  +   sizeof(long),
  +   GFP_KERNEL);
 
 kcalloc() would be idiomatic here.

Agree. Will change.

 
  +   if (net-destid_table.table == NULL) {
  +   pr_err(RIO: failed to allocate destID table\n);
  +   kfree(net);
  +   net = NULL;
  +   } else {
  +   net-destid_table.start = start;
  +   net-destid_table.next = 0;
  +   net-destid_table.max =
  +   RIO_MAX_ROUTE_ENTRIES(port-sys_size);
  +   spin_lock_init(net-destid_table.lock);
  +   }
  +   }
  +
 
  ...
 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 0/10] memory-hotplug: hot-remove physical memory

2012-10-04 Thread Yasuaki Ishimatsu
The patch-set was divided from following thread's patch-set.

https://lkml.org/lkml/2012/9/5/201

If you want to know the reason, please read following thread.

https://lkml.org/lkml/2012/10/2/83

The patch-set has only the function of kernel core side for physical
memory hot remove. So if you use the patch, please apply following
patches.

- bug fix for memory hot remove
  https://lkml.org/lkml/2012/9/27/39
  https://lkml.org/lkml/2012/10/2/83
  http://www.spinics.net/lists/linux-mm/msg42982.html
  
- acpi framework
  https://lkml.org/lkml/2012/10/3/126
  https://lkml.org/lkml/2012/10/3/641

The patches can free/remove the following things:

  - /sys/firmware/memmap/X/{end, start, type} : [PATCH 2/10]
  - mem_section and related sysfs files   : [PATCH 3-4/10]
  - memmap of sparse-vmemmap  : [PATCH 5-7/10]
  - page table of removed memory  : [RFC PATCH 8/10]
  - node and related sysfs files  : [RFC PATCH 9-10/10]

* [PATCH 1/10] checks whether the memory can be removed or not.

If you find any missing functionality for physical memory hot-remove, please let me
know.

How to test this patchset?
1. apply this patchset and build the kernel. MEMORY_HOTPLUG, MEMORY_HOTREMOVE,
   ACPI_HOTPLUG_MEMORY must be selected.
2. load the module acpi_memhotplug
3. hotplug the memory device(it depends on your hardware)
   You will see the memory device under the directory /sys/bus/acpi/devices/.
   Its name is PNP0C80:XX.
4. online/offline pages provided by this memory device
   You can write online/offline to /sys/devices/system/memory/memoryX/state to
   online/offline pages provided by this memory device
5. hotremove the memory device
   You can hotremove the memory device by the hardware, or writing 1 to
   /sys/bus/acpi/devices/PNP0C80:XX/eject.

Note: if the memory provided by the memory device is used by the kernel, it
can't be offlined. It is not a bug.

Known problems:
1. memory can't be offlined when CONFIG_MEMCG is selected.
   For example: there is a memory device on node 1. The address range
   is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
   and memory11 under the directory /sys/devices/system/memory/.
   If CONFIG_MEMCG is selected, we will allocate memory to store page cgroup
   when we online pages. When we online memory8, the memory stored page cgroup
   is not provided by this memory device. But when we online memory9, the memory
   stored page cgroup may be provided by memory8. So we can't offline memory8
   now. We should offline the memory in the reversed order.
   When the memory device is hotremoved, we will auto offline memory provided
   by this memory device. But we don't know which memory is onlined first, so
   offlining memory may fail. In such case, you should offline the memory by
   hand before hotremoving the memory device.
2. hotremoving a memory device may cause a kernel panic
   This bug will be fixed by Liu Jiang's patch:
   https://lkml.org/lkml/2012/7/3/1


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/10] memory-hotplug : check whether memory is offline or not when removing memory

2012-10-04 Thread Yasuaki Ishimatsu
When calling remove_memory(), the memory should be offline. If the function
is used on online memory, a kernel panic may occur.

So the patch checks whether memory is offline or not.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com 
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com

---
 drivers/base/memory.c  |   39 +++
 include/linux/memory.h |5 +
 mm/memory_hotplug.c|   17 +++--
 3 files changed, 59 insertions(+), 2 deletions(-)

Index: linux-3.6/drivers/base/memory.c
===
--- linux-3.6.orig/drivers/base/memory.c2012-10-04 14:22:57.0 
+0900
+++ linux-3.6/drivers/base/memory.c 2012-10-04 14:45:46.653585860 +0900
@@ -70,6 +70,45 @@ void unregister_memory_isolate_notifier(
 }
 EXPORT_SYMBOL(unregister_memory_isolate_notifier);
 
+bool is_memblk_offline(unsigned long start, unsigned long size)
+{
+   struct memory_block *mem = NULL;
+   struct mem_section *section;
+   unsigned long start_pfn, end_pfn;
+   unsigned long pfn, section_nr;
+
+   start_pfn = PFN_DOWN(start);
+   end_pfn = PFN_UP(start + size);
+
+   for (pfn = start_pfn; pfn  end_pfn; pfn += PAGES_PER_SECTION) {
+   section_nr = pfn_to_section_nr(pfn);
+   if (!present_section_nr(section_nr))
+   continue;
+
+   section = __nr_to_section(section_nr);
+   /* same memblock? */
+   if (mem)
+   if ((section_nr = mem-start_section_nr) 
+   (section_nr = mem-end_section_nr))
+   continue;
+
+   mem = find_memory_block_hinted(section, mem);
+   if (!mem)
+   continue;
+   if (mem-state == MEM_OFFLINE)
+   continue;
+
+   kobject_put(mem-dev.kobj);
+   return false;
+   }
+
+   if (mem)
+   kobject_put(mem-dev.kobj);
+
+   return true;
+}
+EXPORT_SYMBOL(is_memblk_offline);
+
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
Index: linux-3.6/include/linux/memory.h
===
--- linux-3.6.orig/include/linux/memory.h   2012-10-02 18:00:22.0 
+0900
+++ linux-3.6/include/linux/memory.h2012-10-04 14:44:40.902581028 +0900
@@ -106,6 +106,10 @@ static inline int memory_isolate_notify(
 {
return 0;
 }
+static inline bool is_memblk_offline(unsigned long start, unsigned long size)
+{
+   return false;
+}
 #else
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
@@ -120,6 +124,7 @@ extern int memory_isolate_notify(unsigne
 extern struct memory_block *find_memory_block_hinted(struct mem_section *,
struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
+extern bool is_memblk_offline(unsigned long start, unsigned long size);
 #define CONFIG_MEM_BLOCK_SIZE  (PAGES_PER_SECTIONPAGE_SHIFT)
 enum mem_add_context { BOOT, HOTPLUG };
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
Index: linux-3.6/mm/memory_hotplug.c
===
--- linux-3.6.orig/mm/memory_hotplug.c  2012-10-04 14:31:08.0 +0900
+++ linux-3.6/mm/memory_hotplug.c   2012-10-04 14:58:22.449687986 +0900
@@ -1045,8 +1045,21 @@ int offline_memory(u64 start, u64 size)
 
 int remove_memory(int nid, u64 start, u64 size)
 {
-   /* It is not implemented yet*/
-   return 0;
+   int ret = 0;
+   lock_memory_hotplug();
+   /*
+* The memory might become online by other task, even if you offine it.
+* So we check whether the memory has been onlined or not.
+*/
+   if (!is_memblk_offline(start, size)) {
+   pr_warn(memory removing [mem %#010llx-%#010llx] failed, 
+   because the memmory range is online\n,
+   start, start + size);
+   ret = -EAGAIN;
+   }
+
+   unlock_memory_hotplug();
+   return ret;
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 #else

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/10] memory-hotplug : remove /sys/firmware/memmap/X sysfs

2012-10-04 Thread Yasuaki Ishimatsu
When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
sysfs files are created. But there is no code to remove these files. The patch
implements the function to remove them.

Note : The code does not free firmware_map_entry since there is no way to free
   memory which is allocated by bootmem.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com 
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com

---
 drivers/firmware/memmap.c|   98 ++-
 include/linux/firmware-map.h |6 ++
 mm/memory_hotplug.c  |7 ++-
 3 files changed, 108 insertions(+), 3 deletions(-)

Index: linux-3.6/drivers/firmware/memmap.c
===
--- linux-3.6.orig/drivers/firmware/memmap.c2012-10-04 18:27:05.195500420 
+0900
+++ linux-3.6/drivers/firmware/memmap.c 2012-10-04 18:27:18.901514330 +0900
@@ -21,6 +21,7 @@
 #include linux/types.h
 #include linux/bootmem.h
 #include linux/slab.h
+#include linux/mm.h
 
 /*
  * Data types 
--
@@ -41,6 +42,7 @@ struct firmware_map_entry {
const char  *type;  /* type of the memory range */
struct list_headlist;   /* entry for the linked list */
struct kobject  kobj;   /* kobject for each entry */
+   unsigned intbootmem:1; /* allocated from bootmem */
 };
 
 /*
@@ -79,7 +81,26 @@ static const struct sysfs_ops memmap_att
.show = memmap_attr_show,
 };
 
+
+static inline struct firmware_map_entry *
+to_memmap_entry(struct kobject *kobj)
+{
+   return container_of(kobj, struct firmware_map_entry, kobj);
+}
+
+static void release_firmware_map_entry(struct kobject *kobj)
+{
+   struct firmware_map_entry *entry = to_memmap_entry(kobj);
+
+   if (entry-bootmem)
+   /* There is no way to free memory allocated from bootmem */
+   return;
+
+   kfree(entry);
+}
+
 static struct kobj_type memmap_ktype = {
+   .release= release_firmware_map_entry,
.sysfs_ops  = memmap_attr_ops,
.default_attrs  = def_attrs,
 };
@@ -94,6 +115,7 @@ static struct kobj_type memmap_ktype = {
  * in firmware initialisation code in one single thread of execution.
  */
 static LIST_HEAD(map_entries);
+static DEFINE_SPINLOCK(map_entries_lock);
 
 /**
  * firmware_map_add_entry() - Does the real work to add a firmware memmap 
entry.
@@ -118,11 +140,25 @@ static int firmware_map_add_entry(u64 st
INIT_LIST_HEAD(entry-list);
kobject_init(entry-kobj, memmap_ktype);
 
+   spin_lock(map_entries_lock);
list_add_tail(entry-list, map_entries);
+   spin_unlock(map_entries_lock);
 
return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+   spin_lock(map_entries_lock);
+   list_del(entry-list);
+   spin_unlock(map_entries_lock);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +180,35 @@ static int add_sysfs_fw_map_entry(struct
return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+   kobject_put(entry-kobj);
+}
+
+/*
+ * Search memmap entry
+ */
+
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   spin_lock(map_entries_lock);
+   list_for_each_entry(entry, map_entries, list)
+   if ((entry-start == start)  (entry-end == end) 
+   (!strcmp(entry-type, type))) {
+   spin_unlock(map_entries_lock);
+   return entry;
+   }
+
+   spin_unlock(map_entries_lock);
+   return NULL;
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -193,9 +258,36 @@ int __init firmware_map_add_early(u64 st
if (WARN_ON(!entry))
return -ENOMEM;
 
+   entry-bootmem = 1;
return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   entry = 

[PATCH 3/10] memory-hotplug : introduce new function arch_remove_memory() for removing page table depends on architecture

2012-10-04 Thread Yasuaki Ishimatsu
From: Wen Congyang we...@cn.fujitsu.com

For removing memory, we need to remove the page table. But how to do that
depends on the architecture. So the patch introduces arch_remove_memory() for
removing the page table. For now it only calls __remove_pages().

Note: __remove_pages() is not implemented for some architectures
      (I don't know how to implement it for s390).

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Benjamin Herrenschmidt b...@kernel.crashing.org
CC: Paul Mackerras pau...@samba.org
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
---
 arch/ia64/mm/init.c|   18 ++
 arch/powerpc/mm/mem.c  |   12 
 arch/s390/mm/init.c|   12 
 arch/sh/mm/init.c  |   17 +
 arch/tile/mm/init.c|8 
 arch/x86/mm/init_32.c  |   12 
 arch/x86/mm/init_64.c  |   15 +++
 include/linux/memory_hotplug.h |1 +
 mm/memory_hotplug.c|1 +
 9 files changed, 96 insertions(+)

Index: linux-3.6/arch/ia64/mm/init.c
===
--- linux-3.6.orig/arch/ia64/mm/init.c  2012-10-04 18:27:03.082498276 +0900
+++ linux-3.6/arch/ia64/mm/init.c   2012-10-04 18:28:50.087606867 +0900
@@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, 
 
return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start  PAGE_SHIFT;
+   unsigned long nr_pages = size  PAGE_SHIFT;
+   struct zone *zone;
+   int ret;
+
+   zone = page_zone(pfn_to_page(start_pfn));
+   ret = __remove_pages(zone, start_pfn, nr_pages);
+   if (ret)
+   pr_warn(%s: Problem encountered in __remove_pages() as
+ret=%d\n, __func__,  ret);
+
+   return ret;
+}
+#endif
 #endif
 
 /*
Index: linux-3.6/arch/powerpc/mm/mem.c
===
--- linux-3.6.orig/arch/powerpc/mm/mem.c2012-10-04 18:27:03.084498278 
+0900
+++ linux-3.6/arch/powerpc/mm/mem.c 2012-10-04 18:28:50.094606874 +0900
@@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, 
 
return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start  PAGE_SHIFT;
+   unsigned long nr_pages = size  PAGE_SHIFT;
+   struct zone *zone;
+
+   zone = page_zone(pfn_to_page(start_pfn));
+   return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
Index: linux-3.6/arch/s390/mm/init.c
===
--- linux-3.6.orig/arch/s390/mm/init.c  2012-10-04 18:27:03.080498274 +0900
+++ linux-3.6/arch/s390/mm/init.c   2012-10-04 18:28:50.104606884 +0900
@@ -257,4 +257,16 @@ int arch_add_memory(int nid, u64 start, 
vmem_remove_mapping(start, size);
return rc;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   /*
+* There is no hardware or firmware interface which could trigger a
+* hot memory remove on s390. So there is nothing that needs to be
+* implemented.
+*/
+   return -EBUSY;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
Index: linux-3.6/arch/sh/mm/init.c
===
--- linux-3.6.orig/arch/sh/mm/init.c2012-10-04 18:27:03.091498285 +0900
+++ linux-3.6/arch/sh/mm/init.c 2012-10-04 18:28:50.116606897 +0900
@@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start  PAGE_SHIFT;
+   unsigned long nr_pages = size  PAGE_SHIFT;
+   struct zone *zone;
+   int ret;
+
+   zone = page_zone(pfn_to_page(start_pfn));
+   ret = __remove_pages(zone, start_pfn, nr_pages);
+   if (unlikely(ret))
+   pr_warn(%s: Failed, __remove_pages() == %d\n, __func__,
+   ret);
+
+   return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
Index: linux-3.6/arch/tile/mm/init.c
===
--- linux-3.6.orig/arch/tile/mm/init.c  2012-10-04 18:27:03.078498272 +0900
+++ linux-3.6/arch/tile/mm/init.c   2012-10-04 18:28:50.122606903 +0900
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
 {
return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 

[PATCH 4/10] memory-hotplug : unregister memory section on SPARSEMEM_VMEMMAP

2012-10-04 Thread Yasuaki Ishimatsu
Currently __remove_section() for SPARSEMEM_VMEMMAP does nothing. But even if
we use SPARSEMEM_VMEMMAP, we can unregister the memory_section.

So the patch adds a call to unregister_memory_section() in __remove_section().

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com 
CC: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
---
 mm/memory_hotplug.c |   13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

Index: linux-3.6/mm/memory_hotplug.c
===
--- linux-3.6.orig/mm/memory_hotplug.c  2012-10-04 18:29:50.577668254 +0900
+++ linux-3.6/mm/memory_hotplug.c   2012-10-04 18:29:58.284676075 +0900
@@ -279,11 +279,14 @@ static int __meminit __add_section(int n
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
-   /*
-* XXX: Freeing memmap with vmemmap is not implement yet.
-*  This should be removed later.
-*/
-   return -EBUSY;
+   int ret = -EINVAL;
+
+   if (!valid_section(ms))
+   return ret;
+
+   ret = unregister_memory_section(ms);
+
+   return ret;
 }
 #else
 static int __remove_section(struct zone *zone, struct mem_section *ms)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 5/10] memory-hotplug : memory-hotplug: check page type in get_page_bootmem

2012-10-04 Thread Yasuaki Ishimatsu
The function get_page_bootmem() may be called more than once for the same
page. There is no need to set the page's type and private fields again if this
is not the first call for that page.

Note: the patch is just an optimization and does not fix any problem.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
---
 mm/memory_hotplug.c |   15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

Index: linux-3.6/mm/memory_hotplug.c
===
--- linux-3.6.orig/mm/memory_hotplug.c  2012-10-04 18:29:58.284676075 +0900
+++ linux-3.6/mm/memory_hotplug.c   2012-10-04 18:30:03.454680542 +0900
@@ -95,10 +95,17 @@ static void release_memory_resource(stru
 static void get_page_bootmem(unsigned long info,  struct page *page,
 unsigned long type)
 {
-   page-lru.next = (struct list_head *) type;
-   SetPagePrivate(page);
-   set_page_private(page, info);
-   atomic_inc(page-_count);
+   unsigned long page_type;
+
+   page_type = (unsigned long)page-lru.next;
+   if (page_type  MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+   page_type  MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE){
+   page-lru.next = (struct list_head *)type;
+   SetPagePrivate(page);
+   set_page_private(page, info);
+   atomic_inc(page-_count);
+   } else
+   atomic_inc(page-_count);
 }
 
 /* reference to __meminit __free_pages_bootmem is valid

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 6/10] memory-hotplug : implement register_page_bootmem_info_section of sparse-vmemmap

2012-10-04 Thread Yasuaki Ishimatsu
To remove the memmap region of sparse-vmemmap that is allocated from bootmem,
the memmap region of sparse-vmemmap needs to be registered by get_page_bootmem().
So the patch searches the pages of the virtual mapping and registers them with
get_page_bootmem().

Note: register_page_bootmem_memmap() is not implemented for ia64, ppc, s390,
and sparc.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
---
 arch/ia64/mm/discontig.c   |6 
 arch/powerpc/mm/init_64.c  |6 
 arch/s390/mm/vmem.c|6 
 arch/sparc/mm/init_64.c|6 
 arch/x86/mm/init_64.c  |   52 +
 include/linux/memory_hotplug.h |   11 +---
 include/linux/mm.h |3 +-
 mm/memory_hotplug.c|   37 ++---
 8 files changed, 113 insertions(+), 14 deletions(-)

Index: linux-3.6/include/linux/memory_hotplug.h
===
--- linux-3.6.orig/include/linux/memory_hotplug.h   2012-10-04 
17:15:03.029828127 +0900
+++ linux-3.6/include/linux/memory_hotplug.h2012-10-04 17:15:59.010833688 
+0900
@@ -163,17 +163,10 @@ static inline void arch_refresh_nodedata
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-static inline void put_page_bootmem(struct page *page)
-{
-}
-#else
 extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
 extern void put_page_bootmem(struct page *page);
-#endif
+extern void get_page_bootmem(unsigned long ingo, struct page *page,
+unsigned long type);
 
 /*
  * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
Index: linux-3.6/mm/memory_hotplug.c
===
--- linux-3.6.orig/mm/memory_hotplug.c  2012-10-04 17:15:27.213831361 +0900
+++ linux-3.6/mm/memory_hotplug.c   2012-10-04 17:37:00.176401540 +0900
@@ -91,9 +91,8 @@ static void release_memory_resource(stru
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info,  struct page *page,
-unsigned long type)
+void get_page_bootmem(unsigned long info,  struct page *page,
+ unsigned long type)
 {
unsigned long page_type;
 
@@ -127,6 +126,7 @@ void __ref put_page_bootmem(struct page 
 
 }
 
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
unsigned long *usemap, mapsize, section_nr, i;
@@ -160,6 +160,36 @@ static void register_page_bootmem_info_s
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+   unsigned long *usemap, mapsize, section_nr, i;
+   struct mem_section *ms;
+   struct page *page, *memmap;
+
+   if (!pfn_valid(start_pfn))
+   return;
+
+   section_nr = pfn_to_section_nr(start_pfn);
+   ms = __nr_to_section(section_nr);
+
+   memmap = sparse_decode_mem_map(ms-section_mem_map, section_nr);
+
+   page = virt_to_page(memmap);
+   mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+   mapsize = PAGE_ALIGN(mapsize)  PAGE_SHIFT;
+
+   register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+   usemap = __nr_to_section(section_nr)-pageblock_flags;
+   page = virt_to_page(usemap);
+
+   mapsize = PAGE_ALIGN(usemap_size())  PAGE_SHIFT;
+
+   for (i = 0; i  mapsize; i++, page++)
+   get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -202,7 +232,6 @@ void register_page_bootmem_info_node(str
register_page_bootmem_info_section(pfn);
}
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
   unsigned long end_pfn)
Index: linux-3.6/arch/ia64/mm/discontig.c
===
--- linux-3.6.orig/arch/ia64/mm/discontig.c 2012-10-01 08:47:46.0 
+0900
+++ linux-3.6/arch/ia64/mm/discontig.c  2012-10-04 17:15:59.209833459 +0900
@@ -822,4 +822,10 @@ int __meminit vmemmap_populate(struct pa
 {
return vmemmap_populate_basepages(start_page, size, node);
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+   

[PATCH 7/10] memory-hotplug : remove memmap of sparse-vmemmap

2012-10-04 Thread Yasuaki Ishimatsu
Not all pages of the virtual mapping in removed memory can be freed, since some
pages used as PGD/PUD cover not only the removed memory but also other memory.
So the patch checks whether a page can be freed or not.

How to check whether a page can be freed or not?
 1. When removing memory, the page structs of the removed memory are filled
    with 0xFD.
 2. If all page structs on a PT/PMD are filled with 0xFD, the PT/PMD can be
    cleared. In this case, the page used as PT/PMD can be freed.

Applying patch, __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is integrated
into one. So __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is deleted.

Note:  vmemmap_kfree() and vmemmap_free_bootmem() are not implemented for ia64,
ppc, s390, and sparc.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
---
 arch/ia64/mm/discontig.c  |8 +++
 arch/powerpc/mm/init_64.c |8 +++
 arch/s390/mm/vmem.c   |8 +++
 arch/sparc/mm/init_64.c   |8 +++
 arch/x86/mm/init_64.c |  119 ++
 include/linux/mm.h|2 
 mm/memory_hotplug.c   |   17 --
 mm/sparse.c   |5 +
 8 files changed, 158 insertions(+), 17 deletions(-)

Index: linux-3.6/arch/ia64/mm/discontig.c
===
--- linux-3.6.orig/arch/ia64/mm/discontig.c 2012-10-04 18:30:15.475692638 
+0900
+++ linux-3.6/arch/ia64/mm/discontig.c  2012-10-04 18:30:21.145698389 +0900
@@ -823,6 +823,14 @@ int __meminit vmemmap_populate(struct pa
return vmemmap_populate_basepages(start_page, size, node);
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
Index: linux-3.6/arch/powerpc/mm/init_64.c
===
--- linux-3.6.orig/arch/powerpc/mm/init_64.c2012-10-04 18:30:15.494692657 
+0900
+++ linux-3.6/arch/powerpc/mm/init_64.c 2012-10-04 18:30:21.150698394 +0900
@@ -299,6 +299,14 @@ int __meminit vmemmap_populate(struct pa
return 0;
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
Index: linux-3.6/arch/s390/mm/vmem.c
===
--- linux-3.6.orig/arch/s390/mm/vmem.c  2012-10-04 18:30:15.506692670 +0900
+++ linux-3.6/arch/s390/mm/vmem.c   2012-10-04 18:30:21.157698401 +0900
@@ -227,6 +227,14 @@ out:
return ret;
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
Index: linux-3.6/arch/sparc/mm/init_64.c
===
--- linux-3.6.orig/arch/sparc/mm/init_64.c  2012-10-04 18:30:15.512692676 
+0900
+++ linux-3.6/arch/sparc/mm/init_64.c   2012-10-04 18:30:21.163698408 +0900
@@ -2078,6 +2078,14 @@ void __meminit vmemmap_populate_print_la
}
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
Index: linux-3.6/arch/x86/mm/init_64.c
===
--- linux-3.6.orig/arch/x86/mm/init_64.c2012-10-04 18:30:15.517692681 
+0900
+++ linux-3.6/arch/x86/mm/init_64.c 2012-10-04 18:30:21.171698416 +0900
@@ -993,6 +993,125 @@ vmemmap_populate(struct page *start_page
return 0;
 }
 
+#define PAGE_INUSE 0xFD
+
+unsigned long find_and_clear_pte_page(unsigned long addr, unsigned long end,
+   struct page **pp, int *page_size)
+{
+   pgd_t *pgd;
+   pud_t *pud;
+   pmd_t *pmd;
+   pte_t *pte;
+   void *page_addr;
+   unsigned long next;
+
+   *pp = NULL;
+
+   pgd = pgd_offset_k(addr);
+   if (pgd_none(*pgd))
+   return pgd_addr_end(addr, end);
+
+   pud = pud_offset(pgd, addr);
+   if (pud_none(*pud))
+   return 

[PATCH 8/10] memory-hotplug : remove page table of x86_64 architecture

2012-10-04 Thread Yasuaki Ishimatsu
From: Wen Congyang we...@cn.fujitsu.com

For hot removing memory, we should remove the page table entries that map the
memory. So the patch searches the page table for the removed memory and clears
the corresponding entries.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
---
 arch/x86/include/asm/pgtable_types.h |1 
 arch/x86/mm/init_64.c|  147 +++
 arch/x86/mm/pageattr.c   |   47 +--
 3 files changed, 173 insertions(+), 22 deletions(-)

Index: linux-3.6/arch/x86/mm/init_64.c
===
--- linux-3.6.orig/arch/x86/mm/init_64.c2012-10-04 18:30:21.171698416 
+0900
+++ linux-3.6/arch/x86/mm/init_64.c 2012-10-04 18:30:27.317704652 +0900
@@ -675,6 +675,151 @@ int arch_add_memory(int nid, u64 start, 
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
+static void __meminit
+phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
+{
+   unsigned pages = 0;
+   int i = pte_index(addr);
+
+   pte_t *pte = pte_page + pte_index(addr);
+
+   for (; i  PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
+
+   if (addr = end)
+   break;
+
+   if (!pte_present(*pte))
+   continue;
+
+   pages++;
+   set_pte(pte, __pte(0));
+   }
+
+   update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
+{
+   unsigned long pages = 0, next;
+   int i = pmd_index(addr);
+
+   for (; i  PTRS_PER_PMD; i++, addr = next) {
+   unsigned long pte_phys;
+   pmd_t *pmd = pmd_page + pmd_index(addr);
+   pte_t *pte;
+
+   if (addr = end)
+   break;
+
+   next = (addr  PMD_MASK) + PMD_SIZE;
+
+   if (!pmd_present(*pmd))
+   continue;
+
+   if (pmd_large(*pmd)) {
+   if ((addr  ~PMD_MASK) == 0  next = end) {
+   set_pmd(pmd, __pmd(0));
+   pages++;
+   continue;
+   }
+
+   /*
+* We use 2M page, but we need to remove part of them,
+* so split 2M page to 4K page.
+*/
+   pte = alloc_low_page(pte_phys);
+   __split_large_page((pte_t *)pmd, addr, pte);
+
+   spin_lock(init_mm.page_table_lock);
+   pmd_populate_kernel(init_mm, pmd, __va(pte_phys));
+   spin_unlock(init_mm.page_table_lock);
+   }
+
+   spin_lock(init_mm.page_table_lock);
+   pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+   phys_pte_remove(pte, addr, end);
+   unmap_low_page(pte);
+   spin_unlock(init_mm.page_table_lock);
+   }
+   update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
+{
+   unsigned long pages = 0, next;
+   int i = pud_index(addr);
+
+   for (; i  PTRS_PER_PUD; i++, addr = next) {
+   unsigned long pmd_phys;
+   pud_t *pud = pud_page + pud_index(addr);
+   pmd_t *pmd;
+
+   if (addr = end)
+   break;
+
+   next = (addr  PUD_MASK) + PUD_SIZE;
+
+   if (!pud_present(*pud))
+   continue;
+
+   if (pud_large(*pud)) {
+   if ((addr  ~PUD_MASK) == 0  next = end) {
+   set_pud(pud, __pud(0));
+   pages++;
+   continue;
+   }
+
+   /*
+* We use 1G page, but we need to remove part of them,
+* so split 1G page to 2M page.
+*/
+   pmd = alloc_low_page(pmd_phys);
+   __split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
+
+   spin_lock(init_mm.page_table_lock);
+   pud_populate(init_mm, pud, __va(pmd_phys));
+   spin_unlock(init_mm.page_table_lock);
+   }
+
+   pmd = map_low_page(pmd_offset(pud, 0));
+   phys_pmd_remove(pmd, addr, end);
+   unmap_low_page(pmd);
+   __flush_tlb_all();
+   }
+   __flush_tlb_all();
+
+   

[PATCH 9/10] memory-hotplug : memory_hotplug: clear zone when removing the memory

2012-10-04 Thread Yasuaki Ishimatsu
When memory is added, we update the zone's and pgdat's start_pfn and
spanned_pages in the function __add_zone(). So we should revert them
when the memory is removed.

The patch adds a new function __remove_zone() to do this.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
---
 mm/memory_hotplug.c |  207 
 1 file changed, 207 insertions(+)

Index: linux-3.6/mm/memory_hotplug.c
===
--- linux-3.6.orig/mm/memory_hotplug.c  2012-10-04 18:30:21.182698427 +0900
+++ linux-3.6/mm/memory_hotplug.c   2012-10-04 18:30:31.767709165 +0900
@@ -312,10 +312,213 @@ static int __meminit __add_section(int n
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static int find_smallest_section_pfn(int nid, struct zone *zone,
+unsigned long start_pfn,
+unsigned long end_pfn)
+{
+   struct mem_section *ms;
+
+   for (; start_pfn  end_pfn; start_pfn += PAGES_PER_SECTION) {
+   ms = __pfn_to_section(start_pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (unlikely(pfn_to_nid(start_pfn)) != nid)
+   continue;
+
+   if (zone  zone != page_zone(pfn_to_page(start_pfn)))
+   continue;
+
+   return start_pfn;
+   }
+
+   return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static int find_biggest_section_pfn(int nid, struct zone *zone,
+   unsigned long start_pfn,
+   unsigned long end_pfn)
+{
+   struct mem_section *ms;
+   unsigned long pfn;
+
+   /* pfn is the end pfn of a memory section. */
+   pfn = end_pfn - 1;
+   for (; pfn = start_pfn; pfn -= PAGES_PER_SECTION) {
+   ms = __pfn_to_section(pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (unlikely(pfn_to_nid(pfn)) != nid)
+   continue;
+
+   if (zone  zone != page_zone(pfn_to_page(pfn)))
+   continue;
+
+   return pfn;
+   }
+
+   return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+unsigned long end_pfn)
+{
+   unsigned long zone_start_pfn =  zone-zone_start_pfn;
+   unsigned long zone_end_pfn = zone-zone_start_pfn + zone-spanned_pages;
+   unsigned long pfn;
+   struct mem_section *ms;
+   int nid = zone_to_nid(zone);
+
+   zone_span_writelock(zone);
+   if (zone_start_pfn == start_pfn) {
+   /*
+* If the section is smallest section in the zone, it need
+* shrink zone-zone_start_pfn and zone-zone_spanned_pages.
+* In this case, we find second smallest valid mem_section
+* for shrinking zone.
+*/
+   pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+   zone_end_pfn);
+   if (pfn) {
+   zone-zone_start_pfn = pfn;
+   zone-spanned_pages = zone_end_pfn - pfn;
+   }
+   } else if (zone_end_pfn == end_pfn) {
+   /*
+* If the section is biggest section in the zone, it need
+* shrink zone-spanned_pages.
+* In this case, we find second biggest valid mem_section for
+* shrinking zone.
+*/
+   pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+  start_pfn);
+   if (pfn)
+   zone-spanned_pages = pfn - zone_start_pfn + 1;
+   }
+
+   /*
+* The section is not biggest or smallest mem_section in the zone, it
+* only creates a hole in the zone. So in this case, we need not
+* change the zone. But perhaps, the zone has only hole data. Thus
+* it check the zone has only hole or not.
+*/
+   pfn = zone_start_pfn;
+   for (; pfn  zone_end_pfn; pfn += PAGES_PER_SECTION) {
+   ms = __pfn_to_section(pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (page_zone(pfn_to_page(pfn)) != zone)
+   continue;
+
+/* If the section 

[PATCH 10/10] memory-hotplug : remove sysfs file of node

2012-10-04 Thread Yasuaki Ishimatsu
From: Wen Congyang we...@cn.fujitsu.com

This patch introduces a new function try_offline_node() to
remove the sysfs files of a node when all memory sections of this
node are removed. If some memory sections of this node are
not removed, this function does nothing.

CC: David Rientjes rient...@google.com
CC: Jiang Liu liu...@gmail.com
CC: Len Brown len.br...@intel.com
CC: Christoph Lameter c...@linux.com
Cc: Minchan Kim minchan@gmail.com
CC: Andrew Morton a...@linux-foundation.org
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
---
 mm/memory_hotplug.c |   54 
 1 file changed, 54 insertions(+)

Index: linux-3.6/mm/memory_hotplug.c
===
--- linux-3.6.orig/mm/memory_hotplug.c  2012-10-04 18:30:31.767709165 +0900
+++ linux-3.6/mm/memory_hotplug.c   2012-10-04 18:32:46.907842637 +0900
@@ -29,6 +29,7 @@
 #include linux/suspend.h
 #include linux/mm_inline.h
 #include linux/firmware-map.h
+#include linux/stop_machine.h
 
 #include asm/tlbflush.h
 
@@ -1276,6 +1277,57 @@ int offline_memory(u64 start, u64 size)
return 0;
 }
 
+static int check_cpu_on_node(void *data)
+{
+   struct pglist_data *pgdat = data;
+   int cpu;
+
+   for_each_online_cpu(cpu) {
+   if (cpu_to_node(cpu) == pgdat-node_id)
+   /*
+* the cpu on this node is onlined, and we can't
+* offline this node.
+*/
+   return -EBUSY;
+   }
+
+   return 0;
+}
+
+/* offline the node if all memory sections of this node are removed */
+static void try_offline_node(int nid)
+{
+   unsigned long start_pfn = NODE_DATA(nid)-node_start_pfn;
+   unsigned long end_pfn = start_pfn + NODE_DATA(nid)-node_spanned_pages;
+   unsigned long pfn;
+
+   for (pfn = start_pfn; pfn  end_pfn; pfn += PAGES_PER_SECTION) {
+   unsigned long section_nr = pfn_to_section_nr(pfn);
+
+   if (!present_section_nr(section_nr))
+   continue;
+
+   if (pfn_to_nid(pfn) != nid)
+   continue;
+
+   /*
+* some memory sections of this node are not removed, and we
+* can't offline node now.
+*/
+   return;
+   }
+
+   if (stop_machine(check_cpu_on_node, NODE_DATA(nid), NULL))
+   return;
+
+   /*
+* all memory sections of this node are removed, we can offline this
+* node now.
+*/
+   node_set_offline(nid);
+   unregister_one_node(nid);
+}
+
 int __ref remove_memory(int nid, u64 start, u64 size)
 {
int ret = 0;
@@ -1296,6 +1348,8 @@ int __ref remove_memory(int nid, u64 sta
firmware_map_remove(start, start + size, System RAM);
 
arch_remove_memory(start, size);
+
+   try_offline_node(nid);
 out:
unlock_memory_hotplug();
return ret;

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev